blob: c4f8920789ac17e1e6e16b37870cc68bcb0a117b [file] [log] [blame]
Bram Moolenaar071d4272004-06-13 20:20:40 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 *
5 * NOTICE:
6 *
7 * This is NOT the original regular expression code as written by Henry
8 * Spencer. This code has been modified specifically for use with the VIM
9 * editor, and should not be used separately from Vim. If you want a good
10 * regular expression library, get the original code. The copyright notice
11 * that follows is from the original.
12 *
13 * END NOTICE
14 *
15 * Copyright (c) 1986 by University of Toronto.
16 * Written by Henry Spencer. Not derived from licensed software.
17 *
18 * Permission is granted to anyone to use this software for any
19 * purpose on any computer system, and to redistribute it freely,
20 * subject to the following restrictions:
21 *
22 * 1. The author is not responsible for the consequences of use of
23 * this software, no matter how awful, even if they arise
24 * from defects in it.
25 *
26 * 2. The origin of this software must not be misrepresented, either
27 * by explicit claim or by omission.
28 *
29 * 3. Altered versions must be plainly marked as such, and must not
30 * be misrepresented as being the original software.
31 *
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
35 *
Bram Moolenaarc0197e22004-09-13 20:26:32 +000036 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * Webb, Ciaran McCreesh and Bram Moolenaar.
Bram Moolenaar071d4272004-06-13 20:20:40 +000038 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */
40
41#include "vim.h"
42
43#undef DEBUG
44
45/*
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
49 *
50 * regstart char that must begin a match; NUL if none obvious; Can be a
51 * multi-byte character.
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
55 * regflags RF_ values or'ed together
56 *
57 * Regstart and reganch permit very fast decisions on suitable starting points
58 * for a match, cutting down the work a lot. Regmust permits fast rejection
59 * of lines that cannot possibly match. The regmust tests are costly enough
60 * that vim_regcomp() supplies a regmust only if the r.e. contains something
61 * potentially expensive (at present, the only such thing detected is * or +
62 * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
64 * computing it anyway.
65 */
66
67/*
68 * Structure for regexp "program". This is essentially a linear encoding
69 * of a nondeterministic finite-state machine (aka syntax charts or
70 * "railroad normal form" in parsing technology). Each node is an opcode
71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
72 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
73 * pointer with a BRANCH on both ends of it is connecting two alternatives.
74 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
75 * (as opposed to a collection of them) is never concatenated with anything
76 * because of operator precedence). The "next" pointer of a BRACE_COMPLEX
Bram Moolenaardf177f62005-02-22 08:39:57 +000077 * node points to the node after the stuff to be repeated.
78 * The operand of some types of node is a literal string; for others, it is a
79 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
80 * is the first node of the branch.
81 * (NB this is *not* a tree structure: the tail of the branch connects to the
82 * thing following the set of BRANCHes.)
Bram Moolenaar071d4272004-06-13 20:20:40 +000083 *
84 * pattern is coded like:
85 *
86 * +-----------------+
87 * | V
88 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
89 * | ^ | ^
90 * +------+ +----------+
91 *
92 *
93 * +------------------+
94 * V |
95 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
96 * | | ^ ^
97 * | +---------------+ |
98 * +---------------------------------------------+
99 *
100 *
Bram Moolenaardf177f62005-02-22 08:39:57 +0000101 * +----------------------+
102 * V |
103 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
104 * | | ^ ^
105 * | +----------+ |
106 * +-------------------------------------------------+
107 *
108 *
Bram Moolenaar071d4272004-06-13 20:20:40 +0000109 * +-------------------------+
110 * V |
111 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
112 * | | ^
113 * | +----------------+
114 * +-----------------------------------------------+
115 *
116 *
117 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
118 * | | ^ ^
119 * | +----------------+ |
120 * +--------------------------------+
121 *
122 * +---------+
123 * | V
124 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
125 * | | | | ^ ^
126 * | | | +-----+ |
127 * | | +----------------+ |
128 * | +---------------------------+ |
129 * +------------------------------------------------------+
130 *
131 * They all start with a BRANCH for "\|" alternatives, even when there is only
132 * one alternative.
133 */
134
135/*
136 * The opcodes are:
137 */
138
139/* definition number opnd? meaning */
140#define END 0 /* End of program or NOMATCH operand. */
141#define BOL 1 /* Match "" at beginning of line. */
142#define EOL 2 /* Match "" at end of line. */
143#define BRANCH 3 /* node Match this alternative, or the
144 * next... */
145#define BACK 4 /* Match "", "next" ptr points backward. */
146#define EXACTLY 5 /* str Match this string. */
147#define NOTHING 6 /* Match empty string. */
148#define STAR 7 /* node Match this (simple) thing 0 or more
149 * times. */
150#define PLUS 8 /* node Match this (simple) thing 1 or more
151 * times. */
152#define MATCH 9 /* node match the operand zero-width */
153#define NOMATCH 10 /* node check for no match with operand */
154#define BEHIND 11 /* node look behind for a match with operand */
155#define NOBEHIND 12 /* node look behind for no match with operand */
156#define SUBPAT 13 /* node match the operand here */
157#define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and
158 * n times (\{m,n\}). */
159#define BOW 15 /* Match "" after [^a-zA-Z0-9_] */
160#define EOW 16 /* Match "" at [^a-zA-Z0-9_] */
161#define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE
162 * and BRACE_COMPLEX. */
163#define NEWL 18 /* Match line-break */
164#define BHPOS 19 /* End position for BEHIND or NOBEHIND */
165
166
/*
 * Character class opcodes.
 * Values 20-48 are the normal classes.  Adding ADD_NL to one of them gives
 * the variant (50-78) that also matches a line-break.
 *
 * FIRST_NL and LAST_NL are fully parenthesized so that they can be used
 * inside any larger expression (e.g. "2 * FIRST_NL" or "x - LAST_NL")
 * without operator precedence changing their value.
 */
#define ADD_NL 30
#define FIRST_NL (ANY + ADD_NL)
#define ANY 20 /* Match any one character. */
#define ANYOF 21 /* str Match any character in this string. */
#define ANYBUT 22 /* str Match any character not in this
                   * string. */
#define IDENT 23 /* Match identifier char */
#define SIDENT 24 /* Match identifier char but no digit */
#define KWORD 25 /* Match keyword char */
#define SKWORD 26 /* Match word char but no digit */
#define FNAME 27 /* Match file name char */
#define SFNAME 28 /* Match file name char but no digit */
#define PRINT 29 /* Match printable char */
#define SPRINT 30 /* Match printable char but no digit */
#define WHITE 31 /* Match whitespace char */
#define NWHITE 32 /* Match non-whitespace char */
#define DIGIT 33 /* Match digit char */
#define NDIGIT 34 /* Match non-digit char */
#define HEX 35 /* Match hex char */
#define NHEX 36 /* Match non-hex char */
#define OCTAL 37 /* Match octal char */
#define NOCTAL 38 /* Match non-octal char */
#define WORD 39 /* Match word char */
#define NWORD 40 /* Match non-word char */
#define HEAD 41 /* Match head char */
#define NHEAD 42 /* Match non-head char */
#define ALPHA 43 /* Match alpha char */
#define NALPHA 44 /* Match non-alpha char */
#define LOWER 45 /* Match lowercase char */
#define NLOWER 46 /* Match non-lowercase char */
#define UPPER 47 /* Match uppercase char */
#define NUPPER 48 /* Match non-uppercase char */
#define LAST_NL (NUPPER + ADD_NL)
/* TRUE when opcode "op" is one of the classes that also match a line-break. */
#define WITH_NL(op) ((op) >= FIRST_NL && (op) <= LAST_NL)
202
203#define MOPEN 80 /* -89 Mark this point in input as start of
204 * \( subexpr. MOPEN + 0 marks start of
205 * match. */
206#define MCLOSE 90 /* -99 Analogous to MOPEN. MCLOSE + 0 marks
207 * end of match. */
208#define BACKREF 100 /* -109 node Match same string again \1-\9 */
209
210#ifdef FEAT_SYN_HL
211# define ZOPEN 110 /* -119 Mark this point in input as start of
212 * \z( subexpr. */
213# define ZCLOSE 120 /* -129 Analogous to ZOPEN. */
214# define ZREF 130 /* -139 node Match external submatch \z1-\z9 */
215#endif
216
217#define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */
218
219#define NOPEN 150 /* Mark this point in input as start of
220 \%( subexpr. */
221#define NCLOSE 151 /* Analogous to NOPEN. */
222
223#define MULTIBYTECODE 200 /* mbc Match one multi-byte character */
224#define RE_BOF 201 /* Match "" at beginning of file. */
225#define RE_EOF 202 /* Match "" at end of file. */
226#define CURSOR 203 /* Match location of cursor. */
227
228#define RE_LNUM 204 /* nr cmp Match line number */
229#define RE_COL 205 /* nr cmp Match column number */
230#define RE_VCOL 206 /* nr cmp Match virtual column number */
231
232/*
233 * Magic characters have a special meaning, they don't match literally.
234 * Magic characters are negative. This separates them from literal characters
235 * (possibly multi-byte). Only ASCII characters can be Magic.
236 */
237#define Magic(x) ((int)(x) - 256)
238#define un_Magic(x) ((x) + 256)
239#define is_Magic(x) ((x) < 0)
240
241static int no_Magic __ARGS((int x));
242static int toggle_Magic __ARGS((int x));
243
/*
 * Return "x" as a literal character: when "x" is a Magic value its Magic-ness
 * is removed with un_Magic(), otherwise "x" is returned unchanged.
 */
static int
no_Magic(x)
    int x;
{
    return is_Magic(x) ? un_Magic(x) : x;
}
252
/*
 * Flip the Magic-ness of "x": a Magic value becomes the plain character and
 * a plain character becomes its Magic counterpart.
 */
static int
toggle_Magic(x)
    int x;
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
261
262/*
263 * The first byte of the regexp internal "program" is actually this magic
264 * number; the start node begins in the second byte. It's used to catch the
265 * most severe mutilation of the program by the caller.
266 */
267
268#define REGMAGIC 0234
269
270/*
271 * Opcode notes:
272 *
273 * BRANCH The set of branches constituting a single choice are hooked
274 * together with their "next" pointers, since precedence prevents
275 * anything being concatenated to any individual branch. The
276 * "next" pointer of the last BRANCH in a choice points to the
277 * thing following the whole choice. This is also where the
278 * final "next" pointer of each individual branch points; each
279 * branch starts with the operand node of a BRANCH node.
280 *
281 * BACK Normal "next" pointers all implicitly point forward; BACK
282 * exists to make loop structures possible.
283 *
284 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
285 * BRANCH structures using BACK. Simple cases (one character
286 * per match) are implemented with STAR and PLUS for speed
287 * and to minimize recursive plunges.
288 *
289 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
290 * node, and defines the min and max limits to be used for that
291 * node.
292 *
293 * MOPEN,MCLOSE ...are numbered at compile time.
294 * ZOPEN,ZCLOSE ...ditto
295 */
296
297/*
298 * A node is one char of opcode followed by two chars of "next" pointer.
299 * "Next" pointers are stored as two 8-bit bytes, high order first. The
300 * value is a positive offset from the opcode of the node containing it.
301 * An operand, if any, simply follows the node. (Note that much of the
302 * code generation knows about this implicit relationship.)
303 *
304 * Using two bytes for the "next" pointer is vast overkill for most things,
305 * but allows patterns to get big without disasters.
306 */
/* Opcode of node "p": the first byte of the node. */
#define OP(p) ((int)*(p))
/* Offset to the next node, read from bytes 1 and 2 of the node, high-order
 * byte first.  Each byte is masked with 0377 before the two are combined. */
#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
/* Address of the operand: the first byte after the three-byte node header. */
#define OPERAND(p) ((p) + 3)
/* Obtain an operand that was stored as four bytes, MSB first.
 * Reads bytes 3 to 6 of the node. */
#define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
 + ((long)(p)[5] << 8) + (long)(p)[6])
/* Obtain a second operand stored as four bytes.
 * Reads bytes 7 to 10 of the node, i.e. directly after the first operand. */
#define OPERAND_MAX(p) OPERAND_MIN((p) + 4)
/* Obtain a second single-byte operand stored after a four bytes operand.
 * This is byte 7 of the node. */
#define OPERAND_CMP(p) (p)[7]
317
318/*
319 * Utility definitions.
320 */
321#define UCHARAT(p) ((int)*(char_u *)(p))
322
/*
 * Used for an error (down from) vim_regcomp(): give the error message, set
 * rc_did_emsg and return NULL (or FAIL for EMSG_RET_FAIL).
 *
 * EMSG_M_RET_NULL(m, c) passes "" to the message when "c" is non-zero and a
 * backslash otherwise.  "c" is parenthesized so that any expression can be
 * passed without the "?:" changing how it is parsed.
 *
 * NOTE: these expand to a brace block, not a single statement.  Do not use
 * them as the unbraced body of an "if" that has an "else".
 */
#define EMSG_RET_NULL(m) { EMSG(m); rc_did_emsg = TRUE; return NULL; }
#define EMSG_M_RET_NULL(m, c) { EMSG2(m, (c) ? "" : "\\"); rc_did_emsg = TRUE; return NULL; }
#define EMSG_RET_FAIL(m) { EMSG(m); rc_did_emsg = TRUE; return FAIL; }
#define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
329
330#define MAX_LIMIT (32767L << 16L)
331
332static int re_multi_type __ARGS((int));
333static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
334static char_u *cstrchr __ARGS((char_u *, int));
335
336#ifdef DEBUG
337static void regdump __ARGS((char_u *, regprog_T *));
338static char_u *regprop __ARGS((char_u *));
339#endif
340
341#define NOT_MULTI 0
342#define MULTI_ONE 1
343#define MULTI_MULT 2
344/*
345 * Return NOT_MULTI if c is not a "multi" operator.
346 * Return MULTI_ONE if c is a single "multi" operator.
347 * Return MULTI_MULT if c is a multi "multi" operator.
348 */
349 static int
350re_multi_type(c)
351 int c;
352{
353 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
354 return MULTI_ONE;
355 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
356 return MULTI_MULT;
357 return NOT_MULTI;
358}
359
360/*
361 * Flags to be passed up and down.
362 */
363#define HASWIDTH 0x1 /* Known never to match null string. */
364#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
365#define SPSTART 0x4 /* Starts with * or +. */
366#define HASNL 0x8 /* Contains some \n. */
367#define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */
368#define WORST 0 /* Worst case. */
369
370/*
371 * When regcode is set to this value, code is not emitted and size is computed
372 * instead.
373 */
374#define JUST_CALC_SIZE ((char_u *) -1)
375
376static char_u *reg_prev_sub;
377
378/*
379 * REGEXP_INRANGE contains all characters which are always special in a []
380 * range after '\'.
381 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
382 * These are:
383 * \n - New line (NL).
384 * \r - Carriage Return (CR).
385 * \t - Tab (TAB).
386 * \e - Escape (ESC).
387 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000388 * \d - Character code in decimal, eg \d123
389 * \o - Character code in octal, eg \o40
390 * \x - Character code in hex, eg \x4a
391 * \u - Multibyte character code, eg \u20ac
392 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000393 */
394static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000395static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000396
397static int backslash_trans __ARGS((int c));
Bram Moolenaardf177f62005-02-22 08:39:57 +0000398static int get_char_class __ARGS((char_u **pp));
399static int get_equi_class __ARGS((char_u **pp));
400static void reg_equi_class __ARGS((int c));
401static int get_coll_element __ARGS((char_u **pp));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000402static char_u *skip_anyof __ARGS((char_u *p));
403static void init_class_tab __ARGS((void));
404
405/*
406 * Translate '\x' to its control character, except "\n", which is Magic.
407 */
408 static int
409backslash_trans(c)
410 int c;
411{
412 switch (c)
413 {
414 case 'r': return CAR;
415 case 't': return TAB;
416 case 'e': return ESC;
417 case 'b': return BS;
418 }
419 return c;
420}
421
422/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000423 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000424 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
425 * recognized. Otherwise "pp" is advanced to after the item.
426 */
427 static int
Bram Moolenaardf177f62005-02-22 08:39:57 +0000428get_char_class(pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000429 char_u **pp;
430{
431 static const char *(class_names[]) =
432 {
433 "alnum:]",
434#define CLASS_ALNUM 0
435 "alpha:]",
436#define CLASS_ALPHA 1
437 "blank:]",
438#define CLASS_BLANK 2
439 "cntrl:]",
440#define CLASS_CNTRL 3
441 "digit:]",
442#define CLASS_DIGIT 4
443 "graph:]",
444#define CLASS_GRAPH 5
445 "lower:]",
446#define CLASS_LOWER 6
447 "print:]",
448#define CLASS_PRINT 7
449 "punct:]",
450#define CLASS_PUNCT 8
451 "space:]",
452#define CLASS_SPACE 9
453 "upper:]",
454#define CLASS_UPPER 10
455 "xdigit:]",
456#define CLASS_XDIGIT 11
457 "tab:]",
458#define CLASS_TAB 12
459 "return:]",
460#define CLASS_RETURN 13
461 "backspace:]",
462#define CLASS_BACKSPACE 14
463 "escape:]",
464#define CLASS_ESCAPE 15
465 };
466#define CLASS_NONE 99
467 int i;
468
469 if ((*pp)[1] == ':')
470 {
471 for (i = 0; i < sizeof(class_names) / sizeof(*class_names); ++i)
472 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
473 {
474 *pp += STRLEN(class_names[i]) + 2;
475 return i;
476 }
477 }
478 return CLASS_NONE;
479}
480
481/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000482 * Specific version of character class functions.
483 * Using a table to keep this fast.
484 */
485static short class_tab[256];
486
487#define RI_DIGIT 0x01
488#define RI_HEX 0x02
489#define RI_OCTAL 0x04
490#define RI_WORD 0x08
491#define RI_HEAD 0x10
492#define RI_ALPHA 0x20
493#define RI_LOWER 0x40
494#define RI_UPPER 0x80
495#define RI_WHITE 0x100
496
497 static void
498init_class_tab()
499{
500 int i;
501 static int done = FALSE;
502
503 if (done)
504 return;
505
506 for (i = 0; i < 256; ++i)
507 {
508 if (i >= '0' && i <= '7')
509 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
510 else if (i >= '8' && i <= '9')
511 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
512 else if (i >= 'a' && i <= 'f')
513 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
514#ifdef EBCDIC
515 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
516 || (i >= 's' && i <= 'z'))
517#else
518 else if (i >= 'g' && i <= 'z')
519#endif
520 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
521 else if (i >= 'A' && i <= 'F')
522 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
523#ifdef EBCDIC
524 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
525 || (i >= 'S' && i <= 'Z'))
526#else
527 else if (i >= 'G' && i <= 'Z')
528#endif
529 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
530 else if (i == '_')
531 class_tab[i] = RI_WORD + RI_HEAD;
532 else
533 class_tab[i] = 0;
534 }
535 class_tab[' '] |= RI_WHITE;
536 class_tab['\t'] |= RI_WHITE;
537 done = TRUE;
538}
539
/*
 * Character class tests that look up "c" in class_tab[].
 *
 * With FEAT_MBYTE the table is consulted only when "c" is below 0x100; for
 * larger values the test yields zero.  The argument is parenthesized in that
 * comparison so that an expression such as "a & b" is tested as a whole.
 * Note that "c" is evaluated twice in the FEAT_MBYTE variants: do not pass
 * an expression with side effects.
 */
#ifdef FEAT_MBYTE
# define ri_digit(c) ((c) < 0x100 && (class_tab[c] & RI_DIGIT))
# define ri_hex(c) ((c) < 0x100 && (class_tab[c] & RI_HEX))
# define ri_octal(c) ((c) < 0x100 && (class_tab[c] & RI_OCTAL))
# define ri_word(c) ((c) < 0x100 && (class_tab[c] & RI_WORD))
# define ri_head(c) ((c) < 0x100 && (class_tab[c] & RI_HEAD))
# define ri_alpha(c) ((c) < 0x100 && (class_tab[c] & RI_ALPHA))
# define ri_lower(c) ((c) < 0x100 && (class_tab[c] & RI_LOWER))
# define ri_upper(c) ((c) < 0x100 && (class_tab[c] & RI_UPPER))
# define ri_white(c) ((c) < 0x100 && (class_tab[c] & RI_WHITE))
#else
# define ri_digit(c) (class_tab[c] & RI_DIGIT)
# define ri_hex(c) (class_tab[c] & RI_HEX)
# define ri_octal(c) (class_tab[c] & RI_OCTAL)
# define ri_word(c) (class_tab[c] & RI_WORD)
# define ri_head(c) (class_tab[c] & RI_HEAD)
# define ri_alpha(c) (class_tab[c] & RI_ALPHA)
# define ri_lower(c) (class_tab[c] & RI_LOWER)
# define ri_upper(c) (class_tab[c] & RI_UPPER)
# define ri_white(c) (class_tab[c] & RI_WHITE)
#endif
561
562/* flags for regflags */
563#define RF_ICASE 1 /* ignore case */
564#define RF_NOICASE 2 /* don't ignore case */
565#define RF_HASNL 4 /* can match a NL */
566#define RF_ICOMBINE 8 /* ignore combining characters */
567#define RF_LOOKBH 16 /* uses "\@<=" or "\@<!" */
568
569/*
570 * Global work variables for vim_regcomp().
571 */
572
573static char_u *regparse; /* Input-scan pointer. */
574static int prevchr_len; /* byte length of previous char */
575static int num_complex_braces; /* Complex \{...} count */
576static int regnpar; /* () count. */
577#ifdef FEAT_SYN_HL
578static int regnzpar; /* \z() count. */
579static int re_has_z; /* \z item detected */
580#endif
581static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
582static long regsize; /* Code size. */
583static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */
584static unsigned regflags; /* RF_ flags for prog */
585static long brace_min[10]; /* Minimums for complex brace repeats */
586static long brace_max[10]; /* Maximums for complex brace repeats */
587static int brace_count[10]; /* Current counts for complex brace repeats */
588#if defined(FEAT_SYN_HL) || defined(PROTO)
589static int had_eol; /* TRUE when EOL found by vim_regcomp() */
590#endif
591static int one_exactly = FALSE; /* only do one char for EXACTLY */
592
593static int reg_magic; /* magicness of the pattern: */
594#define MAGIC_NONE 1 /* "\V" very unmagic */
595#define MAGIC_OFF 2 /* "\M" or 'magic' off */
596#define MAGIC_ON 3 /* "\m" or 'magic' */
597#define MAGIC_ALL 4 /* "\v" very magic */
598
599static int reg_string; /* matching with a string instead of a buffer
600 line */
601
602/*
603 * META contains all characters that may be magic, except '^' and '$'.
604 */
605
606#ifdef EBCDIC
607static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
608#else
609/* META[] is used often enough to justify turning it into a table. */
610static char_u META_flags[] = {
611 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
612 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
613/* % & ( ) * + . */
614 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
615/* 1 2 3 4 5 6 7 8 9 < = > ? */
616 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
617/* @ A C D F H I K L M O */
618 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
619/* P S U V W X Z [ _ */
620 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
621/* a c d f h i k l m n o */
622 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
623/* p s u v w x z { | ~ */
624 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
625};
626#endif
627
628static int curchr;
629
630/* arguments for reg() */
631#define REG_NOPAREN 0 /* toplevel reg() */
632#define REG_PAREN 1 /* \(\) */
633#define REG_ZPAREN 2 /* \z(\) */
634#define REG_NPAREN 3 /* \%(\) */
635
636/*
637 * Forward declarations for vim_regcomp()'s friends.
638 */
639static void initchr __ARGS((char_u *));
640static int getchr __ARGS((void));
641static void skipchr_keepstart __ARGS((void));
642static int peekchr __ARGS((void));
643static void skipchr __ARGS((void));
644static void ungetchr __ARGS((void));
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000645static int gethexchrs __ARGS((int maxinputlen));
646static int getoctchrs __ARGS((void));
647static int getdecchrs __ARGS((void));
648static int coll_get_char __ARGS((void));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000649static void regcomp_start __ARGS((char_u *expr, int flags));
650static char_u *reg __ARGS((int, int *));
651static char_u *regbranch __ARGS((int *flagp));
652static char_u *regconcat __ARGS((int *flagp));
653static char_u *regpiece __ARGS((int *));
654static char_u *regatom __ARGS((int *));
655static char_u *regnode __ARGS((int));
656static int prog_magic_wrong __ARGS((void));
657static char_u *regnext __ARGS((char_u *));
658static void regc __ARGS((int b));
659#ifdef FEAT_MBYTE
660static void regmbc __ARGS((int c));
Bram Moolenaardf177f62005-02-22 08:39:57 +0000661#else
662# define regmbc(c) regc(c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000663#endif
664static void reginsert __ARGS((int, char_u *));
665static void reginsert_limits __ARGS((int, long, long, char_u *));
666static char_u *re_put_long __ARGS((char_u *pr, long_u val));
667static int read_limits __ARGS((long *, long *));
668static void regtail __ARGS((char_u *, char_u *));
669static void regoptail __ARGS((char_u *, char_u *));
670
671/*
672 * Return TRUE if compiled regular expression "prog" can match a line break.
673 */
674 int
675re_multiline(prog)
676 regprog_T *prog;
677{
678 return (prog->regflags & RF_HASNL);
679}
680
681/*
682 * Return TRUE if compiled regular expression "prog" looks before the start
683 * position (pattern contains "\@<=" or "\@<!").
684 */
685 int
686re_lookbehind(prog)
687 regprog_T *prog;
688{
689 return (prog->regflags & RF_LOOKBH);
690}
691
692/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000693 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
694 * Returns a character representing the class. Zero means that no item was
695 * recognized. Otherwise "pp" is advanced to after the item.
696 */
697 static int
698get_equi_class(pp)
699 char_u **pp;
700{
701 int c;
702 int l = 1;
703 char_u *p = *pp;
704
705 if (p[1] == '=')
706 {
707#ifdef FEAT_MBYTE
708 if (has_mbyte)
709 l = mb_ptr2len_check(p + 2);
710#endif
711 if (p[l + 2] == '=' && p[l + 3] == ']')
712 {
713#ifdef FEAT_MBYTE
714 if (has_mbyte)
715 c = mb_ptr2char(p + 2);
716 else
717#endif
718 c = p[2];
719 *pp += l + 4;
720 return c;
721 }
722 }
723 return 0;
724}
725
726/*
727 * Produce the bytes for equivalence class "c".
728 * Currently only handles latin1, latin9 and utf-8.
729 */
730 static void
731reg_equi_class(c)
732 int c;
733{
734#ifdef FEAT_MBYTE
735 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
736 || STRCMP(p_enc, "latin9") == 0)
737#endif
738 {
739 switch (c)
740 {
741 case 'A': case 'À': case 'Á': case 'Â':
742 case 'Ã': case 'Ä': case 'Å':
743 regmbc('A'); regmbc('À'); regmbc('Á'); regmbc('Â');
744 regmbc('Ã'); regmbc('Ä'); regmbc('Å');
745 return;
746 case 'C': case 'Ç':
747 regmbc('C'); regmbc('Ç');
748 return;
749 case 'E': case 'È': case 'É': case 'Ê': case 'Ë':
750 regmbc('E'); regmbc('È'); regmbc('É'); regmbc('Ê');
751 regmbc('Ë');
752 return;
753 case 'I': case 'Ì': case 'Í': case 'Î': case 'Ï':
754 regmbc('I'); regmbc('Ì'); regmbc('Í'); regmbc('Î');
755 regmbc('Ï');
756 return;
757 case 'N': case 'Ñ':
758 regmbc('N'); regmbc('Ñ');
759 return;
760 case 'O': case 'Ò': case 'Ó': case 'Ô': case 'Õ': case 'Ö':
761 regmbc('O'); regmbc('Ò'); regmbc('Ó'); regmbc('Ô');
762 regmbc('Õ'); regmbc('Ö');
763 return;
764 case 'U': case 'Ù': case 'Ú': case 'Û': case 'Ü':
765 regmbc('U'); regmbc('Ù'); regmbc('Ú'); regmbc('Û');
766 regmbc('Ü');
767 return;
768 case 'Y': case 'Ý':
769 regmbc('Y'); regmbc('Ý');
770 return;
771 case 'a': case 'à': case 'á': case 'â':
772 case 'ã': case 'ä': case 'å':
773 regmbc('a'); regmbc('à'); regmbc('á'); regmbc('â');
774 regmbc('ã'); regmbc('ä'); regmbc('å');
775 return;
776 case 'c': case 'ç':
777 regmbc('c'); regmbc('ç');
778 return;
779 case 'e': case 'è': case 'é': case 'ê': case 'ë':
780 regmbc('e'); regmbc('è'); regmbc('é'); regmbc('ê');
781 regmbc('ë');
782 return;
783 case 'i': case 'ì': case 'í': case 'î': case 'ï':
784 regmbc('i'); regmbc('ì'); regmbc('í'); regmbc('î');
785 regmbc('ï');
786 return;
787 case 'n': case 'ñ':
788 regmbc('n'); regmbc('ñ');
789 return;
790 case 'o': case 'ò': case 'ó': case 'ô': case 'õ': case 'ö':
791 regmbc('o'); regmbc('ò'); regmbc('ó'); regmbc('ô');
792 regmbc('õ'); regmbc('ö');
793 return;
794 case 'u': case 'ù': case 'ú': case 'û': case 'ü':
795 regmbc('u'); regmbc('ù'); regmbc('ú'); regmbc('û');
796 regmbc('ü');
797 return;
798 case 'y': case 'ý': case 'ÿ':
799 regmbc('y'); regmbc('ý'); regmbc('ÿ');
800 return;
801 }
802 }
803 regmbc(c);
804}
805
806/*
807 * Check for a collating element "[.a.]". "pp" points to the '['.
808 * Returns a character. Zero means that no item was recognized. Otherwise
809 * "pp" is advanced to after the item.
810 * Currently only single characters are recognized!
811 */
812 static int
813get_coll_element(pp)
814 char_u **pp;
815{
816 int c;
817 int l = 1;
818 char_u *p = *pp;
819
820 if (p[1] == '.')
821 {
822#ifdef FEAT_MBYTE
823 if (has_mbyte)
824 l = mb_ptr2len_check(p + 2);
825#endif
826 if (p[l + 2] == '.' && p[l + 3] == ']')
827 {
828#ifdef FEAT_MBYTE
829 if (has_mbyte)
830 c = mb_ptr2char(p + 2);
831 else
832#endif
833 c = p[2];
834 *pp += l + 4;
835 return c;
836 }
837 }
838 return 0;
839}
840
841
842/*
843 * Skip over a "[]" range.
844 * "p" must point to the character after the '['.
845 * The returned pointer is on the matching ']', or the terminating NUL.
846 */
847 static char_u *
848skip_anyof(p)
849 char_u *p;
850{
851 int cpo_lit; /* 'cpoptions' contains 'l' flag */
852 int cpo_bsl; /* 'cpoptions' contains '\' flag */
853#ifdef FEAT_MBYTE
854 int l;
855#endif
856
857 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
858 cpo_bsl = (!reg_syn && vim_strchr(p_cpo, CPO_BACKSL) != NULL);
859
860 if (*p == '^') /* Complement of range. */
861 ++p;
862 if (*p == ']' || *p == '-')
863 ++p;
864 while (*p != NUL && *p != ']')
865 {
866#ifdef FEAT_MBYTE
867 if (has_mbyte && (l = (*mb_ptr2len_check)(p)) > 1)
868 p += l;
869 else
870#endif
871 if (*p == '-')
872 {
873 ++p;
874 if (*p != ']' && *p != NUL)
875 mb_ptr_adv(p);
876 }
877 else if (*p == '\\'
878 && !cpo_bsl
879 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
880 || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
881 p += 2;
882 else if (*p == '[')
883 {
884 if (get_char_class(&p) == CLASS_NONE
885 && get_equi_class(&p) == 0
886 && get_coll_element(&p) == 0)
887 ++p; /* It was not a class name */
888 }
889 else
890 ++p;
891 }
892
893 return p;
894}
895
896/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000897 * Skip past regular expression.
Bram Moolenaar748bf032005-02-02 23:04:36 +0000898 * Stop at end of "startp" or where "dirc" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000899 * Take care of characters with a backslash in front of it.
900 * Skip strings inside [ and ].
901 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
902 * expression and change "\?" to "?". If "*newp" is not NULL the expression
903 * is changed in-place.
904 */
905 char_u *
906skip_regexp(startp, dirc, magic, newp)
907 char_u *startp;
908 int dirc;
909 int magic;
910 char_u **newp;
911{
912 int mymagic;
913 char_u *p = startp;
914
915 if (magic)
916 mymagic = MAGIC_ON;
917 else
918 mymagic = MAGIC_OFF;
919
Bram Moolenaar1cd871b2004-12-19 22:46:22 +0000920 for (; p[0] != NUL; mb_ptr_adv(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000921 {
922 if (p[0] == dirc) /* found end of regexp */
923 break;
924 if ((p[0] == '[' && mymagic >= MAGIC_ON)
925 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
926 {
927 p = skip_anyof(p + 1);
928 if (p[0] == NUL)
929 break;
930 }
931 else if (p[0] == '\\' && p[1] != NUL)
932 {
933 if (dirc == '?' && newp != NULL && p[1] == '?')
934 {
935 /* change "\?" to "?", make a copy first. */
936 if (*newp == NULL)
937 {
938 *newp = vim_strsave(startp);
939 if (*newp != NULL)
940 p = *newp + (p - startp);
941 }
942 if (*newp != NULL)
943 mch_memmove(p, p + 1, STRLEN(p));
944 else
945 ++p;
946 }
947 else
948 ++p; /* skip next character */
949 if (*p == 'v')
950 mymagic = MAGIC_ALL;
951 else if (*p == 'V')
952 mymagic = MAGIC_NONE;
953 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000954 }
955 return p;
956}
957
958/*
Bram Moolenaar86b68352004-12-27 21:59:20 +0000959 * vim_regcomp() - compile a regular expression into internal code
960 * Returns the program in allocated space. Returns NULL for an error.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000961 *
962 * We can't allocate space until we know how big the compiled form will be,
963 * but we can't compile it (and thus know how big it is) until we've got a
964 * place to put the code. So we cheat: we compile it twice, once with code
965 * generation turned off and size counting turned on, and once "for real".
966 * This also means that we don't allocate space until we are sure that the
967 * thing really will compile successfully, and we never have to move the
968 * code and thus invalidate pointers into it. (Note that it has to be in
969 * one piece because vim_free() must be able to free it all.)
970 *
971 * Whether upper/lower case is to be ignored is decided when executing the
972 * program, it does not matter here.
973 *
974 * Beware that the optimization-preparation code in here knows about some
975 * of the structure of the compiled regexp.
976 * "re_flags": RE_MAGIC and/or RE_STRING.
977 */
978 regprog_T *
979vim_regcomp(expr, re_flags)
980 char_u *expr;
981 int re_flags;
982{
983 regprog_T *r;
984 char_u *scan;
985 char_u *longest;
986 int len;
987 int flags;
988
989 if (expr == NULL)
990 EMSG_RET_NULL(_(e_null));
991
992 init_class_tab();
993
994 /*
995 * First pass: determine size, legality.
996 */
997 regcomp_start(expr, re_flags);
998 regcode = JUST_CALC_SIZE;
999 regc(REGMAGIC);
1000 if (reg(REG_NOPAREN, &flags) == NULL)
1001 return NULL;
1002
1003 /* Small enough for pointer-storage convention? */
1004#ifdef SMALL_MALLOC /* 16 bit storage allocation */
1005 if (regsize >= 65536L - 256L)
1006 EMSG_RET_NULL(_("E339: Pattern too long"));
1007#endif
1008
1009 /* Allocate space. */
1010 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE);
1011 if (r == NULL)
1012 return NULL;
1013
1014 /*
1015 * Second pass: emit code.
1016 */
1017 regcomp_start(expr, re_flags);
1018 regcode = r->program;
1019 regc(REGMAGIC);
1020 if (reg(REG_NOPAREN, &flags) == NULL)
1021 {
1022 vim_free(r);
1023 return NULL;
1024 }
1025
1026 /* Dig out information for optimizations. */
1027 r->regstart = NUL; /* Worst-case defaults. */
1028 r->reganch = 0;
1029 r->regmust = NULL;
1030 r->regmlen = 0;
1031 r->regflags = regflags;
1032 if (flags & HASNL)
1033 r->regflags |= RF_HASNL;
1034 if (flags & HASLOOKBH)
1035 r->regflags |= RF_LOOKBH;
1036#ifdef FEAT_SYN_HL
1037 /* Remember whether this pattern has any \z specials in it. */
1038 r->reghasz = re_has_z;
1039#endif
1040 scan = r->program + 1; /* First BRANCH. */
1041 if (OP(regnext(scan)) == END) /* Only one top-level choice. */
1042 {
1043 scan = OPERAND(scan);
1044
1045 /* Starting-point info. */
1046 if (OP(scan) == BOL || OP(scan) == RE_BOF)
1047 {
1048 r->reganch++;
1049 scan = regnext(scan);
1050 }
1051
1052 if (OP(scan) == EXACTLY)
1053 {
1054#ifdef FEAT_MBYTE
1055 if (has_mbyte)
1056 r->regstart = (*mb_ptr2char)(OPERAND(scan));
1057 else
1058#endif
1059 r->regstart = *OPERAND(scan);
1060 }
1061 else if ((OP(scan) == BOW
1062 || OP(scan) == EOW
1063 || OP(scan) == NOTHING
1064 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
1065 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
1066 && OP(regnext(scan)) == EXACTLY)
1067 {
1068#ifdef FEAT_MBYTE
1069 if (has_mbyte)
1070 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
1071 else
1072#endif
1073 r->regstart = *OPERAND(regnext(scan));
1074 }
1075
1076 /*
1077 * If there's something expensive in the r.e., find the longest
1078 * literal string that must appear and make it the regmust. Resolve
1079 * ties in favor of later strings, since the regstart check works
1080 * with the beginning of the r.e. and avoiding duplication
1081 * strengthens checking. Not a strong reason, but sufficient in the
1082 * absence of others.
1083 */
1084 /*
1085 * When the r.e. starts with BOW, it is faster to look for a regmust
1086 * first. Used a lot for "#" and "*" commands. (Added by mool).
1087 */
1088 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
1089 && !(flags & HASNL))
1090 {
1091 longest = NULL;
1092 len = 0;
1093 for (; scan != NULL; scan = regnext(scan))
1094 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
1095 {
1096 longest = OPERAND(scan);
1097 len = (int)STRLEN(OPERAND(scan));
1098 }
1099 r->regmust = longest;
1100 r->regmlen = len;
1101 }
1102 }
1103#ifdef DEBUG
1104 regdump(expr, r);
1105#endif
1106 return r;
1107}
1108
1109/*
1110 * Setup to parse the regexp. Used once to get the length and once to do it.
1111 */
1112 static void
1113regcomp_start(expr, re_flags)
1114 char_u *expr;
1115 int re_flags; /* see vim_regcomp() */
1116{
1117 initchr(expr);
1118 if (re_flags & RE_MAGIC)
1119 reg_magic = MAGIC_ON;
1120 else
1121 reg_magic = MAGIC_OFF;
1122 reg_string = (re_flags & RE_STRING);
1123
1124 num_complex_braces = 0;
1125 regnpar = 1;
1126 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
1127#ifdef FEAT_SYN_HL
1128 regnzpar = 1;
1129 re_has_z = 0;
1130#endif
1131 regsize = 0L;
1132 regflags = 0;
1133#if defined(FEAT_SYN_HL) || defined(PROTO)
1134 had_eol = FALSE;
1135#endif
1136}
1137
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Return the value of "had_eol".  It is reset by regcomp_start() and set to
 * TRUE by regatom() when it parses the EOL item "$" or "\_$", thus it tells
 * whether the most recently compiled pattern contained such an item.
 * This is messy, but it works fine.
 */
    int
vim_regcomp_had_eol()
{
    int     seen_eol = had_eol;

    return seen_eol;
}
#endif
1149
1150/*
1151 * reg - regular expression, i.e. main body or parenthesized thing
1152 *
1153 * Caller must absorb opening parenthesis.
1154 *
1155 * Combining parenthesis handling with the base level of regular expression
1156 * is a trifle forced, but the need to tie the tails of the branches to what
1157 * follows makes it hard to avoid.
1158 */
1159 static char_u *
1160reg(paren, flagp)
1161 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
1162 int *flagp;
1163{
1164 char_u *ret;
1165 char_u *br;
1166 char_u *ender;
1167 int parno = 0;
1168 int flags;
1169
1170 *flagp = HASWIDTH; /* Tentatively. */
1171
1172#ifdef FEAT_SYN_HL
1173 if (paren == REG_ZPAREN)
1174 {
1175 /* Make a ZOPEN node. */
1176 if (regnzpar >= NSUBEXP)
1177 EMSG_RET_NULL(_("E50: Too many \\z("));
1178 parno = regnzpar;
1179 regnzpar++;
1180 ret = regnode(ZOPEN + parno);
1181 }
1182 else
1183#endif
1184 if (paren == REG_PAREN)
1185 {
1186 /* Make a MOPEN node. */
1187 if (regnpar >= NSUBEXP)
1188 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1189 parno = regnpar;
1190 ++regnpar;
1191 ret = regnode(MOPEN + parno);
1192 }
1193 else if (paren == REG_NPAREN)
1194 {
1195 /* Make a NOPEN node. */
1196 ret = regnode(NOPEN);
1197 }
1198 else
1199 ret = NULL;
1200
1201 /* Pick up the branches, linking them together. */
1202 br = regbranch(&flags);
1203 if (br == NULL)
1204 return NULL;
1205 if (ret != NULL)
1206 regtail(ret, br); /* [MZ]OPEN -> first. */
1207 else
1208 ret = br;
1209 /* If one of the branches can be zero-width, the whole thing can.
1210 * If one of the branches has * at start or matches a line-break, the
1211 * whole thing can. */
1212 if (!(flags & HASWIDTH))
1213 *flagp &= ~HASWIDTH;
1214 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1215 while (peekchr() == Magic('|'))
1216 {
1217 skipchr();
1218 br = regbranch(&flags);
1219 if (br == NULL)
1220 return NULL;
1221 regtail(ret, br); /* BRANCH -> BRANCH. */
1222 if (!(flags & HASWIDTH))
1223 *flagp &= ~HASWIDTH;
1224 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1225 }
1226
1227 /* Make a closing node, and hook it on the end. */
1228 ender = regnode(
1229#ifdef FEAT_SYN_HL
1230 paren == REG_ZPAREN ? ZCLOSE + parno :
1231#endif
1232 paren == REG_PAREN ? MCLOSE + parno :
1233 paren == REG_NPAREN ? NCLOSE : END);
1234 regtail(ret, ender);
1235
1236 /* Hook the tails of the branches to the closing node. */
1237 for (br = ret; br != NULL; br = regnext(br))
1238 regoptail(br, ender);
1239
1240 /* Check for proper termination. */
1241 if (paren != REG_NOPAREN && getchr() != Magic(')'))
1242 {
1243#ifdef FEAT_SYN_HL
1244 if (paren == REG_ZPAREN)
1245 EMSG_RET_NULL(_("E52: Unmatched \\z("))
1246 else
1247#endif
1248 if (paren == REG_NPAREN)
1249 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL)
1250 else
1251 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL)
1252 }
1253 else if (paren == REG_NOPAREN && peekchr() != NUL)
1254 {
1255 if (curchr == Magic(')'))
1256 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL)
1257 else
1258 EMSG_RET_NULL(_(e_trailing)) /* "Can't happen". */
1259 /* NOTREACHED */
1260 }
1261 /*
1262 * Here we set the flag allowing back references to this set of
1263 * parentheses.
1264 */
1265 if (paren == REG_PAREN)
1266 had_endbrace[parno] = TRUE; /* have seen the close paren */
1267 return ret;
1268}
1269
1270/*
1271 * regbranch - one alternative of an | operator
1272 *
1273 * Implements the & operator.
1274 */
1275 static char_u *
1276regbranch(flagp)
1277 int *flagp;
1278{
1279 char_u *ret;
1280 char_u *chain = NULL;
1281 char_u *latest;
1282 int flags;
1283
1284 *flagp = WORST | HASNL; /* Tentatively. */
1285
1286 ret = regnode(BRANCH);
1287 for (;;)
1288 {
1289 latest = regconcat(&flags);
1290 if (latest == NULL)
1291 return NULL;
1292 /* If one of the branches has width, the whole thing has. If one of
1293 * the branches anchors at start-of-line, the whole thing does.
1294 * If one of the branches uses look-behind, the whole thing does. */
1295 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1296 /* If one of the branches doesn't match a line-break, the whole thing
1297 * doesn't. */
1298 *flagp &= ~HASNL | (flags & HASNL);
1299 if (chain != NULL)
1300 regtail(chain, latest);
1301 if (peekchr() != Magic('&'))
1302 break;
1303 skipchr();
1304 regtail(latest, regnode(END)); /* operand ends */
1305 reginsert(MATCH, latest);
1306 chain = latest;
1307 }
1308
1309 return ret;
1310}
1311
1312/*
1313 * regbranch - one alternative of an | or & operator
1314 *
1315 * Implements the concatenation operator.
1316 */
1317 static char_u *
1318regconcat(flagp)
1319 int *flagp;
1320{
1321 char_u *first = NULL;
1322 char_u *chain = NULL;
1323 char_u *latest;
1324 int flags;
1325 int cont = TRUE;
1326
1327 *flagp = WORST; /* Tentatively. */
1328
1329 while (cont)
1330 {
1331 switch (peekchr())
1332 {
1333 case NUL:
1334 case Magic('|'):
1335 case Magic('&'):
1336 case Magic(')'):
1337 cont = FALSE;
1338 break;
1339 case Magic('Z'):
1340#ifdef FEAT_MBYTE
1341 regflags |= RF_ICOMBINE;
1342#endif
1343 skipchr_keepstart();
1344 break;
1345 case Magic('c'):
1346 regflags |= RF_ICASE;
1347 skipchr_keepstart();
1348 break;
1349 case Magic('C'):
1350 regflags |= RF_NOICASE;
1351 skipchr_keepstart();
1352 break;
1353 case Magic('v'):
1354 reg_magic = MAGIC_ALL;
1355 skipchr_keepstart();
1356 curchr = -1;
1357 break;
1358 case Magic('m'):
1359 reg_magic = MAGIC_ON;
1360 skipchr_keepstart();
1361 curchr = -1;
1362 break;
1363 case Magic('M'):
1364 reg_magic = MAGIC_OFF;
1365 skipchr_keepstart();
1366 curchr = -1;
1367 break;
1368 case Magic('V'):
1369 reg_magic = MAGIC_NONE;
1370 skipchr_keepstart();
1371 curchr = -1;
1372 break;
1373 default:
1374 latest = regpiece(&flags);
1375 if (latest == NULL)
1376 return NULL;
1377 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1378 if (chain == NULL) /* First piece. */
1379 *flagp |= flags & SPSTART;
1380 else
1381 regtail(chain, latest);
1382 chain = latest;
1383 if (first == NULL)
1384 first = latest;
1385 break;
1386 }
1387 }
1388 if (first == NULL) /* Loop ran zero times. */
1389 first = regnode(NOTHING);
1390 return first;
1391}
1392
1393/*
1394 * regpiece - something followed by possible [*+=]
1395 *
1396 * Note that the branching code sequences used for = and the general cases
1397 * of * and + are somewhat optimized: they use the same NOTHING node as
1398 * both the endmarker for their branch list and the body of the last branch.
1399 * It might seem that this node could be dispensed with entirely, but the
1400 * endmarker role is not redundant.
1401 */
1402 static char_u *
1403regpiece(flagp)
1404 int *flagp;
1405{
1406 char_u *ret;
1407 int op;
1408 char_u *next;
1409 int flags;
1410 long minval;
1411 long maxval;
1412
1413 ret = regatom(&flags);
1414 if (ret == NULL)
1415 return NULL;
1416
1417 op = peekchr();
1418 if (re_multi_type(op) == NOT_MULTI)
1419 {
1420 *flagp = flags;
1421 return ret;
1422 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001423 /* default flags */
1424 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1425
1426 skipchr();
1427 switch (op)
1428 {
1429 case Magic('*'):
1430 if (flags & SIMPLE)
1431 reginsert(STAR, ret);
1432 else
1433 {
1434 /* Emit x* as (x&|), where & means "self". */
1435 reginsert(BRANCH, ret); /* Either x */
1436 regoptail(ret, regnode(BACK)); /* and loop */
1437 regoptail(ret, ret); /* back */
1438 regtail(ret, regnode(BRANCH)); /* or */
1439 regtail(ret, regnode(NOTHING)); /* null. */
1440 }
1441 break;
1442
1443 case Magic('+'):
1444 if (flags & SIMPLE)
1445 reginsert(PLUS, ret);
1446 else
1447 {
1448 /* Emit x+ as x(&|), where & means "self". */
1449 next = regnode(BRANCH); /* Either */
1450 regtail(ret, next);
1451 regtail(regnode(BACK), ret); /* loop back */
1452 regtail(next, regnode(BRANCH)); /* or */
1453 regtail(ret, regnode(NOTHING)); /* null. */
1454 }
1455 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1456 break;
1457
1458 case Magic('@'):
1459 {
1460 int lop = END;
1461
1462 switch (no_Magic(getchr()))
1463 {
1464 case '=': lop = MATCH; break; /* \@= */
1465 case '!': lop = NOMATCH; break; /* \@! */
1466 case '>': lop = SUBPAT; break; /* \@> */
1467 case '<': switch (no_Magic(getchr()))
1468 {
1469 case '=': lop = BEHIND; break; /* \@<= */
1470 case '!': lop = NOBEHIND; break; /* \@<! */
1471 }
1472 }
1473 if (lop == END)
1474 EMSG_M_RET_NULL(_("E59: invalid character after %s@"),
1475 reg_magic == MAGIC_ALL);
1476 /* Look behind must match with behind_pos. */
1477 if (lop == BEHIND || lop == NOBEHIND)
1478 {
1479 regtail(ret, regnode(BHPOS));
1480 *flagp |= HASLOOKBH;
1481 }
1482 regtail(ret, regnode(END)); /* operand ends */
1483 reginsert(lop, ret);
1484 break;
1485 }
1486
1487 case Magic('?'):
1488 case Magic('='):
1489 /* Emit x= as (x|) */
1490 reginsert(BRANCH, ret); /* Either x */
1491 regtail(ret, regnode(BRANCH)); /* or */
1492 next = regnode(NOTHING); /* null. */
1493 regtail(ret, next);
1494 regoptail(ret, next);
1495 break;
1496
1497 case Magic('{'):
1498 if (!read_limits(&minval, &maxval))
1499 return NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001500 if (flags & SIMPLE)
1501 {
1502 reginsert(BRACE_SIMPLE, ret);
1503 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1504 }
1505 else
1506 {
1507 if (num_complex_braces >= 10)
1508 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"),
1509 reg_magic == MAGIC_ALL);
1510 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1511 regoptail(ret, regnode(BACK));
1512 regoptail(ret, ret);
1513 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1514 ++num_complex_braces;
1515 }
1516 if (minval > 0 && maxval > 0)
1517 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1518 break;
1519 }
1520 if (re_multi_type(peekchr()) != NOT_MULTI)
1521 {
1522 /* Can't have a multi follow a multi. */
1523 if (peekchr() == Magic('*'))
1524 sprintf((char *)IObuff, _("E61: Nested %s*"),
1525 reg_magic >= MAGIC_ON ? "" : "\\");
1526 else
1527 sprintf((char *)IObuff, _("E62: Nested %s%c"),
1528 reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1529 EMSG_RET_NULL(IObuff);
1530 }
1531
1532 return ret;
1533}
1534
1535/*
1536 * regatom - the lowest level
1537 *
1538 * Optimization: gobbles an entire sequence of ordinary characters so that
1539 * it can turn them into a single node, which is smaller to store and
1540 * faster to run. Don't do this when one_exactly is set.
1541 */
1542 static char_u *
1543regatom(flagp)
1544 int *flagp;
1545{
1546 char_u *ret;
1547 int flags;
1548 int cpo_lit; /* 'cpoptions' contains 'l' flag */
Bram Moolenaardf177f62005-02-22 08:39:57 +00001549 int cpo_bsl; /* 'cpoptions' contains '\' flag */
Bram Moolenaar071d4272004-06-13 20:20:40 +00001550 int c;
1551 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1552 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
1553 FNAME, SFNAME, PRINT, SPRINT,
1554 WHITE, NWHITE, DIGIT, NDIGIT,
1555 HEX, NHEX, OCTAL, NOCTAL,
1556 WORD, NWORD, HEAD, NHEAD,
1557 ALPHA, NALPHA, LOWER, NLOWER,
1558 UPPER, NUPPER
1559 };
1560 char_u *p;
1561 int extra = 0;
1562
1563 *flagp = WORST; /* Tentatively. */
1564 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
Bram Moolenaardf177f62005-02-22 08:39:57 +00001565 cpo_bsl = (!reg_syn && vim_strchr(p_cpo, CPO_BACKSL) != NULL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001566
1567 c = getchr();
1568 switch (c)
1569 {
1570 case Magic('^'):
1571 ret = regnode(BOL);
1572 break;
1573
1574 case Magic('$'):
1575 ret = regnode(EOL);
1576#if defined(FEAT_SYN_HL) || defined(PROTO)
1577 had_eol = TRUE;
1578#endif
1579 break;
1580
1581 case Magic('<'):
1582 ret = regnode(BOW);
1583 break;
1584
1585 case Magic('>'):
1586 ret = regnode(EOW);
1587 break;
1588
1589 case Magic('_'):
1590 c = no_Magic(getchr());
1591 if (c == '^') /* "\_^" is start-of-line */
1592 {
1593 ret = regnode(BOL);
1594 break;
1595 }
1596 if (c == '$') /* "\_$" is end-of-line */
1597 {
1598 ret = regnode(EOL);
1599#if defined(FEAT_SYN_HL) || defined(PROTO)
1600 had_eol = TRUE;
1601#endif
1602 break;
1603 }
1604
1605 extra = ADD_NL;
1606 *flagp |= HASNL;
1607
1608 /* "\_[" is character range plus newline */
1609 if (c == '[')
1610 goto collection;
1611
1612 /* "\_x" is character class plus newline */
1613 /*FALLTHROUGH*/
1614
1615 /*
1616 * Character classes.
1617 */
1618 case Magic('.'):
1619 case Magic('i'):
1620 case Magic('I'):
1621 case Magic('k'):
1622 case Magic('K'):
1623 case Magic('f'):
1624 case Magic('F'):
1625 case Magic('p'):
1626 case Magic('P'):
1627 case Magic('s'):
1628 case Magic('S'):
1629 case Magic('d'):
1630 case Magic('D'):
1631 case Magic('x'):
1632 case Magic('X'):
1633 case Magic('o'):
1634 case Magic('O'):
1635 case Magic('w'):
1636 case Magic('W'):
1637 case Magic('h'):
1638 case Magic('H'):
1639 case Magic('a'):
1640 case Magic('A'):
1641 case Magic('l'):
1642 case Magic('L'):
1643 case Magic('u'):
1644 case Magic('U'):
1645 p = vim_strchr(classchars, no_Magic(c));
1646 if (p == NULL)
1647 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1648 ret = regnode(classcodes[p - classchars] + extra);
1649 *flagp |= HASWIDTH | SIMPLE;
1650 break;
1651
1652 case Magic('n'):
1653 if (reg_string)
1654 {
1655 /* In a string "\n" matches a newline character. */
1656 ret = regnode(EXACTLY);
1657 regc(NL);
1658 regc(NUL);
1659 *flagp |= HASWIDTH | SIMPLE;
1660 }
1661 else
1662 {
1663 /* In buffer text "\n" matches the end of a line. */
1664 ret = regnode(NEWL);
1665 *flagp |= HASWIDTH | HASNL;
1666 }
1667 break;
1668
1669 case Magic('('):
1670 if (one_exactly)
1671 EMSG_ONE_RET_NULL;
1672 ret = reg(REG_PAREN, &flags);
1673 if (ret == NULL)
1674 return NULL;
1675 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1676 break;
1677
1678 case NUL:
1679 case Magic('|'):
1680 case Magic('&'):
1681 case Magic(')'):
1682 EMSG_RET_NULL(_(e_internal)); /* Supposed to be caught earlier. */
1683 /* NOTREACHED */
1684
1685 case Magic('='):
1686 case Magic('?'):
1687 case Magic('+'):
1688 case Magic('@'):
1689 case Magic('{'):
1690 case Magic('*'):
1691 c = no_Magic(c);
1692 sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1693 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1694 ? "" : "\\", c);
1695 EMSG_RET_NULL(IObuff);
1696 /* NOTREACHED */
1697
1698 case Magic('~'): /* previous substitute pattern */
1699 if (reg_prev_sub)
1700 {
1701 char_u *lp;
1702
1703 ret = regnode(EXACTLY);
1704 lp = reg_prev_sub;
1705 while (*lp != NUL)
1706 regc(*lp++);
1707 regc(NUL);
1708 if (*reg_prev_sub != NUL)
1709 {
1710 *flagp |= HASWIDTH;
1711 if ((lp - reg_prev_sub) == 1)
1712 *flagp |= SIMPLE;
1713 }
1714 }
1715 else
1716 EMSG_RET_NULL(_(e_nopresub));
1717 break;
1718
1719 case Magic('1'):
1720 case Magic('2'):
1721 case Magic('3'):
1722 case Magic('4'):
1723 case Magic('5'):
1724 case Magic('6'):
1725 case Magic('7'):
1726 case Magic('8'):
1727 case Magic('9'):
1728 {
1729 int refnum;
1730
1731 refnum = c - Magic('0');
1732 /*
1733 * Check if the back reference is legal. We must have seen the
1734 * close brace.
1735 * TODO: Should also check that we don't refer to something
1736 * that is repeated (+*=): what instance of the repetition
1737 * should we match?
1738 */
1739 if (!had_endbrace[refnum])
1740 {
1741 /* Trick: check if "@<=" or "@<!" follows, in which case
1742 * the \1 can appear before the referenced match. */
1743 for (p = regparse; *p != NUL; ++p)
1744 if (p[0] == '@' && p[1] == '<'
1745 && (p[2] == '!' || p[2] == '='))
1746 break;
1747 if (*p == NUL)
1748 EMSG_RET_NULL(_("E65: Illegal back reference"));
1749 }
1750 ret = regnode(BACKREF + refnum);
1751 }
1752 break;
1753
1754#ifdef FEAT_SYN_HL
1755 case Magic('z'):
1756 {
1757 c = no_Magic(getchr());
1758 switch (c)
1759 {
1760 case '(': if (reg_do_extmatch != REX_SET)
1761 EMSG_RET_NULL(_("E66: \\z( not allowed here"));
1762 if (one_exactly)
1763 EMSG_ONE_RET_NULL;
1764 ret = reg(REG_ZPAREN, &flags);
1765 if (ret == NULL)
1766 return NULL;
1767 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1768 re_has_z = REX_SET;
1769 break;
1770
1771 case '1':
1772 case '2':
1773 case '3':
1774 case '4':
1775 case '5':
1776 case '6':
1777 case '7':
1778 case '8':
1779 case '9': if (reg_do_extmatch != REX_USE)
1780 EMSG_RET_NULL(_("E67: \\z1 et al. not allowed here"));
1781 ret = regnode(ZREF + c - '0');
1782 re_has_z = REX_USE;
1783 break;
1784
1785 case 's': ret = regnode(MOPEN + 0);
1786 break;
1787
1788 case 'e': ret = regnode(MCLOSE + 0);
1789 break;
1790
1791 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1792 }
1793 }
1794 break;
1795#endif
1796
1797 case Magic('%'):
1798 {
1799 c = no_Magic(getchr());
1800 switch (c)
1801 {
1802 /* () without a back reference */
1803 case '(':
1804 if (one_exactly)
1805 EMSG_ONE_RET_NULL;
1806 ret = reg(REG_NPAREN, &flags);
1807 if (ret == NULL)
1808 return NULL;
1809 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1810 break;
1811
1812 /* Catch \%^ and \%$ regardless of where they appear in the
1813 * pattern -- regardless of whether or not it makes sense. */
1814 case '^':
1815 ret = regnode(RE_BOF);
1816 break;
1817
1818 case '$':
1819 ret = regnode(RE_EOF);
1820 break;
1821
1822 case '#':
1823 ret = regnode(CURSOR);
1824 break;
1825
1826 /* \%[abc]: Emit as a list of branches, all ending at the last
1827 * branch which matches nothing. */
1828 case '[':
1829 if (one_exactly) /* doesn't nest */
1830 EMSG_ONE_RET_NULL;
1831 {
1832 char_u *lastbranch;
1833 char_u *lastnode = NULL;
1834 char_u *br;
1835
1836 ret = NULL;
1837 while ((c = getchr()) != ']')
1838 {
1839 if (c == NUL)
1840 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["),
1841 reg_magic == MAGIC_ALL);
1842 br = regnode(BRANCH);
1843 if (ret == NULL)
1844 ret = br;
1845 else
1846 regtail(lastnode, br);
1847
1848 ungetchr();
1849 one_exactly = TRUE;
1850 lastnode = regatom(flagp);
1851 one_exactly = FALSE;
1852 if (lastnode == NULL)
1853 return NULL;
1854 }
1855 if (ret == NULL)
1856 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"),
1857 reg_magic == MAGIC_ALL);
1858 lastbranch = regnode(BRANCH);
1859 br = regnode(NOTHING);
1860 if (ret != JUST_CALC_SIZE)
1861 {
1862 regtail(lastnode, br);
1863 regtail(lastbranch, br);
1864 /* connect all branches to the NOTHING
1865 * branch at the end */
1866 for (br = ret; br != lastnode; )
1867 {
1868 if (OP(br) == BRANCH)
1869 {
1870 regtail(br, lastbranch);
1871 br = OPERAND(br);
1872 }
1873 else
1874 br = regnext(br);
1875 }
1876 }
1877 *flagp &= ~HASWIDTH;
1878 break;
1879 }
1880
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001881 case 'd': /* %d123 decimal */
1882 case 'o': /* %o123 octal */
1883 case 'x': /* %xab hex 2 */
1884 case 'u': /* %uabcd hex 4 */
1885 case 'U': /* %U1234abcd hex 8 */
1886 {
1887 int i;
1888
1889 switch (c)
1890 {
1891 case 'd': i = getdecchrs(); break;
1892 case 'o': i = getoctchrs(); break;
1893 case 'x': i = gethexchrs(2); break;
1894 case 'u': i = gethexchrs(4); break;
1895 case 'U': i = gethexchrs(8); break;
1896 default: i = -1; break;
1897 }
1898
1899 if (i < 0)
1900 EMSG_M_RET_NULL(
1901 _("E678: Invalid character after %s%%[dxouU]"),
1902 reg_magic == MAGIC_ALL);
1903 ret = regnode(EXACTLY);
1904 if (i == 0)
1905 regc(0x0a);
1906 else
1907#ifdef FEAT_MBYTE
1908 regmbc(i);
1909#else
1910 regc(i);
1911#endif
1912 regc(NUL);
1913 *flagp |= HASWIDTH;
1914 break;
1915 }
1916
Bram Moolenaar071d4272004-06-13 20:20:40 +00001917 default:
1918 if (VIM_ISDIGIT(c) || c == '<' || c == '>')
1919 {
1920 long_u n = 0;
1921 int cmp;
1922
1923 cmp = c;
1924 if (cmp == '<' || cmp == '>')
1925 c = getchr();
1926 while (VIM_ISDIGIT(c))
1927 {
1928 n = n * 10 + (c - '0');
1929 c = getchr();
1930 }
1931 if (c == 'l' || c == 'c' || c == 'v')
1932 {
1933 if (c == 'l')
1934 ret = regnode(RE_LNUM);
1935 else if (c == 'c')
1936 ret = regnode(RE_COL);
1937 else
1938 ret = regnode(RE_VCOL);
1939 if (ret == JUST_CALC_SIZE)
1940 regsize += 5;
1941 else
1942 {
1943 /* put the number and the optional
1944 * comparator after the opcode */
1945 regcode = re_put_long(regcode, n);
1946 *regcode++ = cmp;
1947 }
1948 break;
1949 }
1950 }
1951
1952 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"),
1953 reg_magic == MAGIC_ALL);
1954 }
1955 }
1956 break;
1957
1958 case Magic('['):
1959collection:
1960 {
1961 char_u *lp;
1962
1963 /*
1964 * If there is no matching ']', we assume the '[' is a normal
1965 * character. This makes 'incsearch' and ":help [" work.
1966 */
1967 lp = skip_anyof(regparse);
1968 if (*lp == ']') /* there is a matching ']' */
1969 {
1970 int startc = -1; /* > 0 when next '-' is a range */
1971 int endc;
1972
1973 /*
1974 * In a character class, different parsing rules apply.
1975 * Not even \ is special anymore, nothing is.
1976 */
1977 if (*regparse == '^') /* Complement of range. */
1978 {
1979 ret = regnode(ANYBUT + extra);
1980 regparse++;
1981 }
1982 else
1983 ret = regnode(ANYOF + extra);
1984
1985 /* At the start ']' and '-' mean the literal character. */
1986 if (*regparse == ']' || *regparse == '-')
Bram Moolenaardf177f62005-02-22 08:39:57 +00001987 {
1988 startc = *regparse;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001989 regc(*regparse++);
Bram Moolenaardf177f62005-02-22 08:39:57 +00001990 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001991
1992 while (*regparse != NUL && *regparse != ']')
1993 {
1994 if (*regparse == '-')
1995 {
1996 ++regparse;
1997 /* The '-' is not used for a range at the end and
1998 * after or before a '\n'. */
1999 if (*regparse == ']' || *regparse == NUL
2000 || startc == -1
2001 || (regparse[0] == '\\' && regparse[1] == 'n'))
2002 {
2003 regc('-');
2004 startc = '-'; /* [--x] is a range */
2005 }
2006 else
2007 {
Bram Moolenaardf177f62005-02-22 08:39:57 +00002008 /* Also accept "a-[.z.]" */
2009 endc = 0;
2010 if (*regparse == '[')
2011 endc = get_coll_element(&regparse);
2012 if (endc == 0)
2013 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002014#ifdef FEAT_MBYTE
Bram Moolenaardf177f62005-02-22 08:39:57 +00002015 if (has_mbyte)
2016 endc = mb_ptr2char_adv(&regparse);
2017 else
Bram Moolenaar071d4272004-06-13 20:20:40 +00002018#endif
Bram Moolenaardf177f62005-02-22 08:39:57 +00002019 endc = *regparse++;
2020 }
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002021
2022 /* Handle \o40, \x20 and \u20AC style sequences */
Bram Moolenaardf177f62005-02-22 08:39:57 +00002023 if (endc == '\\' && !cpo_lit && !cpo_bsl)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002024 endc = coll_get_char();
2025
Bram Moolenaar071d4272004-06-13 20:20:40 +00002026 if (startc > endc)
2027 EMSG_RET_NULL(_(e_invrange));
2028#ifdef FEAT_MBYTE
2029 if (has_mbyte && ((*mb_char2len)(startc) > 1
2030 || (*mb_char2len)(endc) > 1))
2031 {
2032 /* Limit to a range of 256 chars */
2033 if (endc > startc + 256)
2034 EMSG_RET_NULL(_(e_invrange));
2035 while (++startc <= endc)
2036 regmbc(startc);
2037 }
2038 else
2039#endif
2040 {
2041#ifdef EBCDIC
2042 int alpha_only = FALSE;
2043
2044 /* for alphabetical range skip the gaps
2045 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
2046 if (isalpha(startc) && isalpha(endc))
2047 alpha_only = TRUE;
2048#endif
2049 while (++startc <= endc)
2050#ifdef EBCDIC
2051 if (!alpha_only || isalpha(startc))
2052#endif
2053 regc(startc);
2054 }
2055 startc = -1;
2056 }
2057 }
2058 /*
2059 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
2060 * accepts "\t", "\e", etc., but only when the 'l' flag in
2061 * 'cpoptions' is not included.
Bram Moolenaardf177f62005-02-22 08:39:57 +00002062 * Posix doesn't recognize backslash at all.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002063 */
2064 else if (*regparse == '\\'
Bram Moolenaardf177f62005-02-22 08:39:57 +00002065 && !cpo_bsl
Bram Moolenaar071d4272004-06-13 20:20:40 +00002066 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
2067 || (!cpo_lit
2068 && vim_strchr(REGEXP_ABBR,
2069 regparse[1]) != NULL)))
2070 {
2071 regparse++;
2072 if (*regparse == 'n')
2073 {
2074 /* '\n' in range: also match NL */
2075 if (ret != JUST_CALC_SIZE)
2076 {
2077 if (*ret == ANYBUT)
2078 *ret = ANYBUT + ADD_NL;
2079 else if (*ret == ANYOF)
2080 *ret = ANYOF + ADD_NL;
2081 /* else: must have had a \n already */
2082 }
2083 *flagp |= HASNL;
2084 regparse++;
2085 startc = -1;
2086 }
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002087 else if (*regparse == 'd'
2088 || *regparse == 'o'
2089 || *regparse == 'x'
2090 || *regparse == 'u'
2091 || *regparse == 'U')
2092 {
2093 startc = coll_get_char();
2094 if (startc == 0)
2095 regc(0x0a);
2096 else
2097#ifdef FEAT_MBYTE
2098 regmbc(startc);
2099#else
2100 regc(startc);
2101#endif
2102 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002103 else
2104 {
2105 startc = backslash_trans(*regparse++);
2106 regc(startc);
2107 }
2108 }
2109 else if (*regparse == '[')
2110 {
2111 int c_class;
2112 int cu;
2113
Bram Moolenaardf177f62005-02-22 08:39:57 +00002114 c_class = get_char_class(&regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002115 startc = -1;
2116 /* Characters assumed to be 8 bits! */
2117 switch (c_class)
2118 {
2119 case CLASS_NONE:
Bram Moolenaardf177f62005-02-22 08:39:57 +00002120 c_class = get_equi_class(&regparse);
2121 if (c_class != 0)
2122 {
2123 /* produce equivalence class */
2124 reg_equi_class(c_class);
2125 }
2126 else if ((c_class =
2127 get_coll_element(&regparse)) != 0)
2128 {
2129 /* produce a collating element */
2130 regmbc(c_class);
2131 }
2132 else
2133 {
2134 /* literal '[', allow [[-x] as a range */
2135 startc = *regparse++;
2136 regc(startc);
2137 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002138 break;
2139 case CLASS_ALNUM:
2140 for (cu = 1; cu <= 255; cu++)
2141 if (isalnum(cu))
2142 regc(cu);
2143 break;
2144 case CLASS_ALPHA:
2145 for (cu = 1; cu <= 255; cu++)
2146 if (isalpha(cu))
2147 regc(cu);
2148 break;
2149 case CLASS_BLANK:
2150 regc(' ');
2151 regc('\t');
2152 break;
2153 case CLASS_CNTRL:
2154 for (cu = 1; cu <= 255; cu++)
2155 if (iscntrl(cu))
2156 regc(cu);
2157 break;
2158 case CLASS_DIGIT:
2159 for (cu = 1; cu <= 255; cu++)
2160 if (VIM_ISDIGIT(cu))
2161 regc(cu);
2162 break;
2163 case CLASS_GRAPH:
2164 for (cu = 1; cu <= 255; cu++)
2165 if (isgraph(cu))
2166 regc(cu);
2167 break;
2168 case CLASS_LOWER:
2169 for (cu = 1; cu <= 255; cu++)
2170 if (islower(cu))
2171 regc(cu);
2172 break;
2173 case CLASS_PRINT:
2174 for (cu = 1; cu <= 255; cu++)
2175 if (vim_isprintc(cu))
2176 regc(cu);
2177 break;
2178 case CLASS_PUNCT:
2179 for (cu = 1; cu <= 255; cu++)
2180 if (ispunct(cu))
2181 regc(cu);
2182 break;
2183 case CLASS_SPACE:
2184 for (cu = 9; cu <= 13; cu++)
2185 regc(cu);
2186 regc(' ');
2187 break;
2188 case CLASS_UPPER:
2189 for (cu = 1; cu <= 255; cu++)
2190 if (isupper(cu))
2191 regc(cu);
2192 break;
2193 case CLASS_XDIGIT:
2194 for (cu = 1; cu <= 255; cu++)
2195 if (vim_isxdigit(cu))
2196 regc(cu);
2197 break;
2198 case CLASS_TAB:
2199 regc('\t');
2200 break;
2201 case CLASS_RETURN:
2202 regc('\r');
2203 break;
2204 case CLASS_BACKSPACE:
2205 regc('\b');
2206 break;
2207 case CLASS_ESCAPE:
2208 regc('\033');
2209 break;
2210 }
2211 }
2212 else
2213 {
2214#ifdef FEAT_MBYTE
2215 if (has_mbyte)
2216 {
2217 int len;
2218
2219 /* produce a multibyte character, including any
2220 * following composing characters */
2221 startc = mb_ptr2char(regparse);
2222 len = (*mb_ptr2len_check)(regparse);
2223 if (enc_utf8 && utf_char2len(startc) != len)
2224 startc = -1; /* composing chars */
2225 while (--len >= 0)
2226 regc(*regparse++);
2227 }
2228 else
2229#endif
2230 {
2231 startc = *regparse++;
2232 regc(startc);
2233 }
2234 }
2235 }
2236 regc(NUL);
2237 prevchr_len = 1; /* last char was the ']' */
2238 if (*regparse != ']')
2239 EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */
2240 skipchr(); /* let's be friends with the lexer again */
2241 *flagp |= HASWIDTH | SIMPLE;
2242 break;
2243 }
2244 }
2245 /* FALLTHROUGH */
2246
2247 default:
2248 {
2249 int len;
2250
2251#ifdef FEAT_MBYTE
2252 /* A multi-byte character is handled as a separate atom if it's
2253 * before a multi. */
2254 if (has_mbyte && (*mb_char2len)(c) > 1
2255 && re_multi_type(peekchr()) != NOT_MULTI)
2256 {
2257 ret = regnode(MULTIBYTECODE);
2258 regmbc(c);
2259 *flagp |= HASWIDTH | SIMPLE;
2260 break;
2261 }
2262#endif
2263
2264 ret = regnode(EXACTLY);
2265
2266 /*
2267 * Append characters as long as:
2268 * - there is no following multi, we then need the character in
2269 * front of it as a single character operand
2270 * - not running into a Magic character
2271 * - "one_exactly" is not set
2272 * But always emit at least one character. Might be a Multi,
2273 * e.g., a "[" without matching "]".
2274 */
2275 for (len = 0; c != NUL && (len == 0
2276 || (re_multi_type(peekchr()) == NOT_MULTI
2277 && !one_exactly
2278 && !is_Magic(c))); ++len)
2279 {
2280 c = no_Magic(c);
2281#ifdef FEAT_MBYTE
2282 if (has_mbyte)
2283 {
2284 regmbc(c);
2285 if (enc_utf8)
2286 {
2287 int off;
2288 int l;
2289
2290 /* Need to get composing character too, directly
2291 * access regparse for that, because skipchr() skips
2292 * over composing chars. */
2293 ungetchr();
2294 if (*regparse == '\\' && regparse[1] != NUL)
2295 off = 1;
2296 else
2297 off = 0;
2298 for (;;)
2299 {
2300 l = utf_ptr2len_check(regparse + off);
2301 if (!UTF_COMPOSINGLIKE(regparse + off,
2302 regparse + off + l))
2303 break;
2304 off += l;
2305 regmbc(utf_ptr2char(regparse + off));
2306 }
2307 skipchr();
2308 }
2309 }
2310 else
2311#endif
2312 regc(c);
2313 c = getchr();
2314 }
2315 ungetchr();
2316
2317 regc(NUL);
2318 *flagp |= HASWIDTH;
2319 if (len == 1)
2320 *flagp |= SIMPLE;
2321 }
2322 break;
2323 }
2324
2325 return ret;
2326}
2327
2328/*
2329 * emit a node
2330 * Return pointer to generated code.
2331 */
2332 static char_u *
2333regnode(op)
2334 int op;
2335{
2336 char_u *ret;
2337
2338 ret = regcode;
2339 if (ret == JUST_CALC_SIZE)
2340 regsize += 3;
2341 else
2342 {
2343 *regcode++ = op;
2344 *regcode++ = NUL; /* Null "next" pointer. */
2345 *regcode++ = NUL;
2346 }
2347 return ret;
2348}
2349
2350/*
2351 * Emit (if appropriate) a byte of code
2352 */
2353 static void
2354regc(b)
2355 int b;
2356{
2357 if (regcode == JUST_CALC_SIZE)
2358 regsize++;
2359 else
2360 *regcode++ = b;
2361}
2362
#ifdef FEAT_MBYTE
/*
 * regmbc - emit the character "c" as a multi-byte sequence.
 * When only the size is being computed, regsize grows by the length
 * mb_char2len() reports for "c"; otherwise mb_char2bytes() stores the bytes
 * at regcode and regcode advances by the count it returns.
 */
    static void
regmbc(c)
    int c;
{
    if (regcode != JUST_CALC_SIZE)
    {
        int written = (*mb_char2bytes)(c, regcode);

        regcode += written;
    }
    else
        regsize += (*mb_char2len)(c);
}
#endif
2377
2378/*
2379 * reginsert - insert an operator in front of already-emitted operand
2380 *
2381 * Means relocating the operand.
2382 */
2383 static void
2384reginsert(op, opnd)
2385 int op;
2386 char_u *opnd;
2387{
2388 char_u *src;
2389 char_u *dst;
2390 char_u *place;
2391
2392 if (regcode == JUST_CALC_SIZE)
2393 {
2394 regsize += 3;
2395 return;
2396 }
2397 src = regcode;
2398 regcode += 3;
2399 dst = regcode;
2400 while (src > opnd)
2401 *--dst = *--src;
2402
2403 place = opnd; /* Op node, where operand used to be. */
2404 *place++ = op;
2405 *place++ = NUL;
2406 *place = NUL;
2407}
2408
2409/*
2410 * reginsert_limits - insert an operator in front of already-emitted operand.
2411 * The operator has the given limit values as operands. Also set next pointer.
2412 *
2413 * Means relocating the operand.
2414 */
2415 static void
2416reginsert_limits(op, minval, maxval, opnd)
2417 int op;
2418 long minval;
2419 long maxval;
2420 char_u *opnd;
2421{
2422 char_u *src;
2423 char_u *dst;
2424 char_u *place;
2425
2426 if (regcode == JUST_CALC_SIZE)
2427 {
2428 regsize += 11;
2429 return;
2430 }
2431 src = regcode;
2432 regcode += 11;
2433 dst = regcode;
2434 while (src > opnd)
2435 *--dst = *--src;
2436
2437 place = opnd; /* Op node, where operand used to be. */
2438 *place++ = op;
2439 *place++ = NUL;
2440 *place++ = NUL;
2441 place = re_put_long(place, (long_u)minval);
2442 place = re_put_long(place, (long_u)maxval);
2443 regtail(opnd, place);
2444}
2445
2446/*
2447 * Write a long as four bytes at "p" and return pointer to the next char.
2448 */
2449 static char_u *
2450re_put_long(p, val)
2451 char_u *p;
2452 long_u val;
2453{
2454 *p++ = (char_u) ((val >> 24) & 0377);
2455 *p++ = (char_u) ((val >> 16) & 0377);
2456 *p++ = (char_u) ((val >> 8) & 0377);
2457 *p++ = (char_u) (val & 0377);
2458 return p;
2459}
2460
2461/*
2462 * regtail - set the next-pointer at the end of a node chain
2463 */
2464 static void
2465regtail(p, val)
2466 char_u *p;
2467 char_u *val;
2468{
2469 char_u *scan;
2470 char_u *temp;
2471 int offset;
2472
2473 if (p == JUST_CALC_SIZE)
2474 return;
2475
2476 /* Find last node. */
2477 scan = p;
2478 for (;;)
2479 {
2480 temp = regnext(scan);
2481 if (temp == NULL)
2482 break;
2483 scan = temp;
2484 }
2485
2486 if (OP(scan) == BACK)
2487 offset = (int)(scan - val);
2488 else
2489 offset = (int)(val - scan);
2490 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
2491 *(scan + 2) = (char_u) (offset & 0377);
2492}
2493
2494/*
2495 * regoptail - regtail on item after a BRANCH; nop if none
2496 */
2497 static void
2498regoptail(p, val)
2499 char_u *p;
2500 char_u *val;
2501{
2502 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2503 if (p == NULL || p == JUST_CALC_SIZE
2504 || (OP(p) != BRANCH
2505 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2506 return;
2507 regtail(OPERAND(p), val);
2508}
2509
/*
 * getchr() and friends - get the next character from the pattern.
 * Whether a character is magic depends on the 'magic' setting and on what
 * came before it, therefore a lexical analyzer is needed.  Its state is kept
 * in the variables below.  (The current character "curchr" is not declared
 * here.)
 */

/* The character consumed before "prevchr". */
static int prevprevchr;

/* The most recently consumed character. */
static int prevchr;

/* used for ungetchr() */
static int nextchr;

/*
 * Note: prevchr is sometimes -1 when we are not at the start.
 * E.g. in /[ ^I]^ the pattern was never found even if it existed, because
 * the ^ was taken to be magic -- webb
 */

/* TRUE when on the first character */
static int at_start;

/* TRUE when on the second character */
static int prev_at_start;
2526
2527 static void
2528initchr(str)
2529 char_u *str;
2530{
2531 regparse = str;
2532 prevchr_len = 0;
2533 curchr = prevprevchr = prevchr = nextchr = -1;
2534 at_start = TRUE;
2535 prev_at_start = FALSE;
2536}
2537
2538 static int
2539peekchr()
2540{
Bram Moolenaardf177f62005-02-22 08:39:57 +00002541 static int after_slash = FALSE;
2542
Bram Moolenaar071d4272004-06-13 20:20:40 +00002543 if (curchr == -1)
2544 {
2545 switch (curchr = regparse[0])
2546 {
2547 case '.':
2548 case '[':
2549 case '~':
2550 /* magic when 'magic' is on */
2551 if (reg_magic >= MAGIC_ON)
2552 curchr = Magic(curchr);
2553 break;
2554 case '(':
2555 case ')':
2556 case '{':
2557 case '%':
2558 case '+':
2559 case '=':
2560 case '?':
2561 case '@':
2562 case '!':
2563 case '&':
2564 case '|':
2565 case '<':
2566 case '>':
2567 case '#': /* future ext. */
2568 case '"': /* future ext. */
2569 case '\'': /* future ext. */
2570 case ',': /* future ext. */
2571 case '-': /* future ext. */
2572 case ':': /* future ext. */
2573 case ';': /* future ext. */
2574 case '`': /* future ext. */
2575 case '/': /* Can't be used in / command */
2576 /* magic only after "\v" */
2577 if (reg_magic == MAGIC_ALL)
2578 curchr = Magic(curchr);
2579 break;
2580 case '*':
Bram Moolenaardf177f62005-02-22 08:39:57 +00002581 /* * is not magic as the very first character, eg "?*ptr", when
2582 * after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
2583 * "\(\*" is not magic, thus must be magic if "after_slash" */
2584 if (reg_magic >= MAGIC_ON
2585 && !at_start
2586 && !(prev_at_start && prevchr == Magic('^'))
2587 && (after_slash
2588 || (prevchr != Magic('(')
2589 && prevchr != Magic('&')
2590 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002591 curchr = Magic('*');
2592 break;
2593 case '^':
2594 /* '^' is only magic as the very first character and if it's after
2595 * "\(", "\|", "\&' or "\n" */
2596 if (reg_magic >= MAGIC_OFF
2597 && (at_start
2598 || reg_magic == MAGIC_ALL
2599 || prevchr == Magic('(')
2600 || prevchr == Magic('|')
2601 || prevchr == Magic('&')
2602 || prevchr == Magic('n')
2603 || (no_Magic(prevchr) == '('
2604 && prevprevchr == Magic('%'))))
2605 {
2606 curchr = Magic('^');
2607 at_start = TRUE;
2608 prev_at_start = FALSE;
2609 }
2610 break;
2611 case '$':
2612 /* '$' is only magic as the very last char and if it's in front of
2613 * either "\|", "\)", "\&", or "\n" */
2614 if (reg_magic >= MAGIC_OFF)
2615 {
2616 char_u *p = regparse + 1;
2617
2618 /* ignore \c \C \m and \M after '$' */
2619 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
2620 || p[1] == 'm' || p[1] == 'M' || p[1] == 'Z'))
2621 p += 2;
2622 if (p[0] == NUL
2623 || (p[0] == '\\'
2624 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
2625 || p[1] == 'n'))
2626 || reg_magic == MAGIC_ALL)
2627 curchr = Magic('$');
2628 }
2629 break;
2630 case '\\':
2631 {
2632 int c = regparse[1];
2633
2634 if (c == NUL)
2635 curchr = '\\'; /* trailing '\' */
2636 else if (
2637#ifdef EBCDIC
2638 vim_strchr(META, c)
2639#else
2640 c <= '~' && META_flags[c]
2641#endif
2642 )
2643 {
2644 /*
2645 * META contains everything that may be magic sometimes,
2646 * except ^ and $ ("\^" and "\$" are only magic after
2647 * "\v"). We now fetch the next character and toggle its
2648 * magicness. Therefore, \ is so meta-magic that it is
2649 * not in META.
2650 */
2651 curchr = -1;
2652 prev_at_start = at_start;
2653 at_start = FALSE; /* be able to say "/\*ptr" */
2654 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +00002655 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002656 peekchr();
2657 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +00002658 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002659 curchr = toggle_Magic(curchr);
2660 }
2661 else if (vim_strchr(REGEXP_ABBR, c))
2662 {
2663 /*
2664 * Handle abbreviations, like "\t" for TAB -- webb
2665 */
2666 curchr = backslash_trans(c);
2667 }
2668 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
2669 curchr = toggle_Magic(c);
2670 else
2671 {
2672 /*
2673 * Next character can never be (made) magic?
2674 * Then backslashing it won't do anything.
2675 */
2676#ifdef FEAT_MBYTE
2677 if (has_mbyte)
2678 curchr = (*mb_ptr2char)(regparse + 1);
2679 else
2680#endif
2681 curchr = c;
2682 }
2683 break;
2684 }
2685
2686#ifdef FEAT_MBYTE
2687 default:
2688 if (has_mbyte)
2689 curchr = (*mb_ptr2char)(regparse);
2690#endif
2691 }
2692 }
2693
2694 return curchr;
2695}
2696
2697/*
2698 * Eat one lexed character. Do this in a way that we can undo it.
2699 */
2700 static void
2701skipchr()
2702{
2703 /* peekchr() eats a backslash, do the same here */
2704 if (*regparse == '\\')
2705 prevchr_len = 1;
2706 else
2707 prevchr_len = 0;
2708 if (regparse[prevchr_len] != NUL)
2709 {
2710#ifdef FEAT_MBYTE
2711 if (has_mbyte)
2712 prevchr_len += (*mb_ptr2len_check)(regparse + prevchr_len);
2713 else
2714#endif
2715 ++prevchr_len;
2716 }
2717 regparse += prevchr_len;
2718 prev_at_start = at_start;
2719 at_start = FALSE;
2720 prevprevchr = prevchr;
2721 prevchr = curchr;
2722 curchr = nextchr; /* use previously unget char, or -1 */
2723 nextchr = -1;
2724}
2725
2726/*
2727 * Skip a character while keeping the value of prev_at_start for at_start.
2728 * prevchr and prevprevchr are also kept.
2729 */
2730 static void
2731skipchr_keepstart()
2732{
2733 int as = prev_at_start;
2734 int pr = prevchr;
2735 int prpr = prevprevchr;
2736
2737 skipchr();
2738 at_start = as;
2739 prevchr = pr;
2740 prevprevchr = prpr;
2741}
2742
/*
 * getchr - return the next lexed character and consume it.
 */
    static int
getchr()
{
    int lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
2751
2752/*
2753 * put character back. Works only once!
2754 */
2755 static void
2756ungetchr()
2757{
2758 nextchr = curchr;
2759 curchr = prevchr;
2760 prevchr = prevprevchr;
2761 at_start = prev_at_start;
2762 prev_at_start = FALSE;
2763
2764 /* Backup regparse, so that it's at the same position as before the
2765 * getchr(). */
2766 regparse -= prevchr_len;
2767}
2768
2769/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +00002770 * Get and return the value of the hex string at the current position.
2771 * Return -1 if there is no valid hex number.
2772 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002773 * blahblah\%x20asdf
2774 * before-^ ^-after
2775 * The parameter controls the maximum number of input characters. This will be
2776 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
2777 */
2778 static int
2779gethexchrs(maxinputlen)
2780 int maxinputlen;
2781{
2782 int nr = 0;
2783 int c;
2784 int i;
2785
2786 for (i = 0; i < maxinputlen; ++i)
2787 {
2788 c = regparse[0];
2789 if (!vim_isxdigit(c))
2790 break;
2791 nr <<= 4;
2792 nr |= hex2nr(c);
2793 ++regparse;
2794 }
2795
2796 if (i == 0)
2797 return -1;
2798 return nr;
2799}
2800
2801/*
2802 * get and return the value of the decimal string immediately after the
2803 * current position. Return -1 for invalid. Consumes all digits.
2804 */
2805 static int
2806getdecchrs()
2807{
2808 int nr = 0;
2809 int c;
2810 int i;
2811
2812 for (i = 0; ; ++i)
2813 {
2814 c = regparse[0];
2815 if (c < '0' || c > '9')
2816 break;
2817 nr *= 10;
2818 nr += c - '0';
2819 ++regparse;
2820 }
2821
2822 if (i == 0)
2823 return -1;
2824 return nr;
2825}
2826
2827/*
2828 * get and return the value of the octal string immediately after the current
2829 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
2830 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
2831 * treat 8 or 9 as recognised characters. Position is updated:
2832 * blahblah\%o210asdf
2833 * before-^ ^-after
2834 */
2835 static int
2836getoctchrs()
2837{
2838 int nr = 0;
2839 int c;
2840 int i;
2841
2842 for (i = 0; i < 3 && nr < 040; ++i)
2843 {
2844 c = regparse[0];
2845 if (c < '0' || c > '7')
2846 break;
2847 nr <<= 3;
2848 nr |= hex2nr(c);
2849 ++regparse;
2850 }
2851
2852 if (i == 0)
2853 return -1;
2854 return nr;
2855}
2856
2857/*
2858 * Get a number after a backslash that is inside [].
2859 * When nothing is recognized return a backslash.
2860 */
2861 static int
2862coll_get_char()
2863{
2864 int nr = -1;
2865
2866 switch (*regparse++)
2867 {
2868 case 'd': nr = getdecchrs(); break;
2869 case 'o': nr = getoctchrs(); break;
2870 case 'x': nr = gethexchrs(2); break;
2871 case 'u': nr = gethexchrs(4); break;
2872 case 'U': nr = gethexchrs(8); break;
2873 }
2874 if (nr < 0)
2875 {
2876 /* If getting the number fails be backwards compatible: the character
2877 * is a backslash. */
2878 --regparse;
2879 nr = '\\';
2880 }
2881 return nr;
2882}
2883
2884/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00002885 * read_limits - Read two integers to be taken as a minimum and maximum.
2886 * If the first character is '-', then the range is reversed.
2887 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2888 * missing, a very big number is the default.
2889 */
2890 static int
2891read_limits(minval, maxval)
2892 long *minval;
2893 long *maxval;
2894{
2895 int reverse = FALSE;
2896 char_u *first_char;
2897 long tmp;
2898
2899 if (*regparse == '-')
2900 {
2901 /* Starts with '-', so reverse the range later */
2902 regparse++;
2903 reverse = TRUE;
2904 }
2905 first_char = regparse;
2906 *minval = getdigits(&regparse);
2907 if (*regparse == ',') /* There is a comma */
2908 {
2909 if (vim_isdigit(*++regparse))
2910 *maxval = getdigits(&regparse);
2911 else
2912 *maxval = MAX_LIMIT;
2913 }
2914 else if (VIM_ISDIGIT(*first_char))
2915 *maxval = *minval; /* It was \{n} or \{-n} */
2916 else
2917 *maxval = MAX_LIMIT; /* It was \{} or \{-} */
2918 if (*regparse == '\\')
2919 regparse++; /* Allow either \{...} or \{...\} */
Bram Moolenaardf177f62005-02-22 08:39:57 +00002920 if (*regparse != '}')
Bram Moolenaar071d4272004-06-13 20:20:40 +00002921 {
2922 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
2923 reg_magic == MAGIC_ALL ? "" : "\\");
2924 EMSG_RET_FAIL(IObuff);
2925 }
2926
2927 /*
2928 * Reverse the range if there was a '-', or make sure it is in the right
2929 * order otherwise.
2930 */
2931 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
2932 {
2933 tmp = *minval;
2934 *minval = *maxval;
2935 *maxval = tmp;
2936 }
2937 skipchr(); /* let's be friends with the lexer again */
2938 return OK;
2939}
2940
2941/*
2942 * vim_regexec and friends
2943 */
2944
2945/*
2946 * Global work variables for vim_regexec().
2947 */
2948
2949/* The current match-position is remembered with these variables: */
2950static linenr_T reglnum; /* line number, relative to first line */
2951static char_u *regline; /* start of current line */
2952static char_u *reginput; /* current input, points into "regline" */
2953
2954static int need_clear_subexpr; /* subexpressions still need to be
2955 * cleared */
2956#ifdef FEAT_SYN_HL
2957static int need_clear_zsubexpr = FALSE; /* extmatch subexpressions
2958 * still need to be cleared */
2959#endif
2960
2961static int out_of_stack; /* TRUE when ran out of stack space */
2962
2963/*
2964 * Structure used to save the current input state, when it needs to be
2965 * restored after trying a match. Used by reg_save() and reg_restore().
2966 */
2967typedef struct
2968{
2969 union
2970 {
2971 char_u *ptr; /* reginput pointer, for single-line regexp */
2972 lpos_T pos; /* reginput pos, for multi-line regexp */
2973 } rs_u;
2974} regsave_T;
2975
2976/* struct to save start/end pointer/position in for \(\) */
2977typedef struct
2978{
2979 union
2980 {
2981 char_u *ptr;
2982 lpos_T pos;
2983 } se_u;
2984} save_se_T;
2985
2986static char_u *reg_getline __ARGS((linenr_T lnum));
2987static long vim_regexec_both __ARGS((char_u *line, colnr_T col));
2988static long regtry __ARGS((regprog_T *prog, colnr_T col));
2989static void cleanup_subexpr __ARGS((void));
2990#ifdef FEAT_SYN_HL
2991static void cleanup_zsubexpr __ARGS((void));
2992#endif
2993static void reg_nextline __ARGS((void));
2994static void reg_save __ARGS((regsave_T *save));
2995static void reg_restore __ARGS((regsave_T *save));
2996static int reg_save_equal __ARGS((regsave_T *save));
2997static void save_se_multi __ARGS((save_se_T *savep, lpos_T *posp));
2998static void save_se_one __ARGS((save_se_T *savep, char_u **pp));
2999
/*
 * Save the sub-expressions before attempting a match: "posp" is saved for a
 * multi-line match, "pp" otherwise.
 * The expansion is parenthesized as a whole, so that the "?:" cannot combine
 * with operators next to the macro call.
 */
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))

/*
 * After a failed match restore the sub-expressions.
 * Note that this expands to a { } block, not to an expression.
 */
#define restore_se(savep, posp, pp) { \
    if (REG_MULTI) \
        *(posp) = (savep)->se_u.pos; \
    else \
        *(pp) = (savep)->se_u.ptr; }
3010
3011static int re_num_cmp __ARGS((long_u val, char_u *scan));
Bram Moolenaardf177f62005-02-22 08:39:57 +00003012static int regmatch __ARGS((char_u *prog, regsave_T *startp));
Bram Moolenaar071d4272004-06-13 20:20:40 +00003013static int regrepeat __ARGS((char_u *p, long maxcount));
3014
#ifdef DEBUG
/* Only exists in a DEBUG build (DEBUG is #undef'ed near the top of this
 * file); starts out as zero.
 * NOTE(review): presumably switches on tracing of the matching, its users
 * are not in this part of the file. */
int regnarrate = 0;
#endif
3018
/*
 * Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
 * Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
 * contains '\c' or '\C' that value is overruled.
 */
static int      ireg_ic;

#ifdef FEAT_MBYTE
/*
 * Similar to ireg_ic, but only for 'combining' characters.  Set with the \Z
 * flag in the regexp.  Defaults to false, always.
 */
static int      ireg_icombine;
#endif
3033
3034/*
3035 * Sometimes need to save a copy of a line. Since alloc()/free() is very
3036 * slow, we keep one allocated piece of memory and only re-allocate it when
3037 * it's too small. It's freed in vim_regexec_both() when finished.
3038 */
3039static char_u *reg_tofree;
3040static unsigned reg_tofreelen;
3041
3042/*
3043 * These variables are set when executing a regexp to speed up the execution.
3044 * Which ones are set depends on whethere a single-line or multi-line match is
3045 * done:
3046 * single-line multi-line
3047 * reg_match &regmatch_T NULL
3048 * reg_mmatch NULL &regmmatch_T
3049 * reg_startp reg_match->startp <invalid>
3050 * reg_endp reg_match->endp <invalid>
3051 * reg_startpos <invalid> reg_mmatch->startpos
3052 * reg_endpos <invalid> reg_mmatch->endpos
3053 * reg_win NULL window in which to search
3054 * reg_buf <invalid> buffer in which to search
3055 * reg_firstlnum <invalid> first line in which to search
3056 * reg_maxline 0 last line nr
3057 * reg_line_lbr FALSE or TRUE FALSE
3058 */
3059static regmatch_T *reg_match;
3060static regmmatch_T *reg_mmatch;
3061static char_u **reg_startp = NULL;
3062static char_u **reg_endp = NULL;
3063static lpos_T *reg_startpos = NULL;
3064static lpos_T *reg_endpos = NULL;
3065static win_T *reg_win;
3066static buf_T *reg_buf;
3067static linenr_T reg_firstlnum;
3068static linenr_T reg_maxline;
3069static int reg_line_lbr; /* "\n" in string is line break */
3070
3071/*
3072 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
3073 */
3074 static char_u *
3075reg_getline(lnum)
3076 linenr_T lnum;
3077{
3078 /* when looking behind for a match/no-match lnum is negative. But we
3079 * can't go before line 1 */
3080 if (reg_firstlnum + lnum < 1)
3081 return NULL;
3082 return ml_get_buf(reg_buf, reg_firstlnum + lnum, FALSE);
3083}
3084
3085static regsave_T behind_pos;
3086
3087#ifdef FEAT_SYN_HL
3088static char_u *reg_startzp[NSUBEXP]; /* Workspace to mark beginning */
3089static char_u *reg_endzp[NSUBEXP]; /* and end of \z(...\) matches */
3090static lpos_T reg_startzpos[NSUBEXP]; /* idem, beginning pos */
3091static lpos_T reg_endzpos[NSUBEXP]; /* idem, end pos */
3092#endif
3093
3094/* TRUE if using multi-line regexp. */
3095#define REG_MULTI (reg_match == NULL)
3096
3097/*
3098 * Match a regexp against a string.
3099 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3100 * Uses curbuf for line count and 'iskeyword'.
3101 *
3102 * Return TRUE if there is a match, FALSE if not.
3103 */
3104 int
3105vim_regexec(rmp, line, col)
3106 regmatch_T *rmp;
3107 char_u *line; /* string to match against */
3108 colnr_T col; /* column to start looking for match */
3109{
3110 reg_match = rmp;
3111 reg_mmatch = NULL;
3112 reg_maxline = 0;
3113 reg_line_lbr = FALSE;
3114 reg_win = NULL;
3115 ireg_ic = rmp->rm_ic;
3116#ifdef FEAT_MBYTE
3117 ireg_icombine = FALSE;
3118#endif
3119 return (vim_regexec_both(line, col) != 0);
3120}
3121
#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
        || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
/*
 * vim_regexec_nl - like vim_regexec(), but consider a "\n" in "line" to be a
 * line break.
 *
 * Return TRUE if there is a match, FALSE if not.
 */
    int
vim_regexec_nl(rmp, line, col)
    regmatch_T  *rmp;
    char_u      *line;  /* string to match against */
    colnr_T     col;    /* column to start looking for match */
{
    long        found;

    /* Set up for a single-line match in which "\n" breaks the line. */
    reg_mmatch = NULL;
    reg_match = rmp;
    reg_win = NULL;
    reg_maxline = 0;
    reg_line_lbr = TRUE;

    ireg_ic = rmp->rm_ic;
#ifdef FEAT_MBYTE
    ireg_icombine = FALSE;
#endif

    found = vim_regexec_both(line, col);
    return (found != 0);
}
#endif
3145
3146/*
3147 * Match a regexp against multiple lines.
3148 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3149 * Uses curbuf for line count and 'iskeyword'.
3150 *
3151 * Return zero if there is no match. Return number of lines contained in the
3152 * match otherwise.
3153 */
3154 long
3155vim_regexec_multi(rmp, win, buf, lnum, col)
3156 regmmatch_T *rmp;
3157 win_T *win; /* window in which to search or NULL */
3158 buf_T *buf; /* buffer in which to search */
3159 linenr_T lnum; /* nr of line to start looking for match */
3160 colnr_T col; /* column to start looking for match */
3161{
3162 long r;
3163 buf_T *save_curbuf = curbuf;
3164
3165 reg_match = NULL;
3166 reg_mmatch = rmp;
3167 reg_buf = buf;
3168 reg_win = win;
3169 reg_firstlnum = lnum;
3170 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
3171 reg_line_lbr = FALSE;
3172 ireg_ic = rmp->rmm_ic;
3173#ifdef FEAT_MBYTE
3174 ireg_icombine = FALSE;
3175#endif
3176
3177 /* Need to switch to buffer "buf" to make vim_iswordc() work. */
3178 curbuf = buf;
3179 r = vim_regexec_both(NULL, col);
3180 curbuf = save_curbuf;
3181
3182 return r;
3183}
3184
3185/*
3186 * Match a regexp against a string ("line" points to the string) or multiple
3187 * lines ("line" is NULL, use reg_getline()).
3188 */
3189#ifdef HAVE_SETJMP_H
3190 static long
3191vim_regexec_both(line_arg, col_arg)
3192 char_u *line_arg;
3193 colnr_T col_arg; /* column to start looking for match */
3194#else
3195 static long
3196vim_regexec_both(line, col)
3197 char_u *line;
3198 colnr_T col; /* column to start looking for match */
3199#endif
3200{
3201 regprog_T *prog;
3202 char_u *s;
3203 long retval;
3204#ifdef HAVE_SETJMP_H
3205 char_u *line;
3206 colnr_T col;
Bram Moolenaar748bf032005-02-02 23:04:36 +00003207 int did_mch_startjmp = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00003208#endif
3209
3210 reg_tofree = NULL;
3211
Bram Moolenaar071d4272004-06-13 20:20:40 +00003212#ifdef HAVE_SETJMP_H
Bram Moolenaar071d4272004-06-13 20:20:40 +00003213 /* Trick to avoid "might be clobbered by `longjmp'" warning from gcc. */
3214 line = line_arg;
3215 col = col_arg;
3216#endif
3217 retval = 0L;
3218
3219 if (REG_MULTI)
3220 {
3221 prog = reg_mmatch->regprog;
3222 line = reg_getline((linenr_T)0);
3223 reg_startpos = reg_mmatch->startpos;
3224 reg_endpos = reg_mmatch->endpos;
3225 }
3226 else
3227 {
3228 prog = reg_match->regprog;
3229 reg_startp = reg_match->startp;
3230 reg_endp = reg_match->endp;
3231 }
3232
3233 /* Be paranoid... */
3234 if (prog == NULL || line == NULL)
3235 {
3236 EMSG(_(e_null));
3237 goto theend;
3238 }
3239
3240 /* Check validity of program. */
3241 if (prog_magic_wrong())
3242 goto theend;
3243
3244 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
3245 if (prog->regflags & RF_ICASE)
3246 ireg_ic = TRUE;
3247 else if (prog->regflags & RF_NOICASE)
3248 ireg_ic = FALSE;
3249
3250#ifdef FEAT_MBYTE
3251 /* If pattern contains "\Z" overrule value of ireg_icombine */
3252 if (prog->regflags & RF_ICOMBINE)
3253 ireg_icombine = TRUE;
3254#endif
3255
3256 /* If there is a "must appear" string, look for it. */
3257 if (prog->regmust != NULL)
3258 {
3259 int c;
3260
3261#ifdef FEAT_MBYTE
3262 if (has_mbyte)
3263 c = (*mb_ptr2char)(prog->regmust);
3264 else
3265#endif
3266 c = *prog->regmust;
3267 s = line + col;
3268 while ((s = cstrchr(s, c)) != NULL)
3269 {
3270 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
3271 break; /* Found it. */
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00003272 mb_ptr_adv(s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00003273 }
3274 if (s == NULL) /* Not present. */
3275 goto theend;
3276 }
3277
Bram Moolenaar748bf032005-02-02 23:04:36 +00003278#ifdef HAVE_TRY_EXCEPT
3279 __try
3280 {
3281#endif
3282
3283#ifdef HAVE_SETJMP_H
3284 /*
3285 * Matching with a regexp may cause a very deep recursive call of
3286 * regmatch(). Vim will crash when running out of stack space. Catch
3287 * this here if the system supports it.
3288 * It's a bit slow, do it after the check for "regmust".
3289 * Don't do it if the caller already set it up.
3290 */
3291 if (!lc_active)
3292 {
3293 did_mch_startjmp = TRUE;
3294 mch_startjmp();
3295 if (SETJMP(lc_jump_env) != 0)
3296 {
3297 mch_didjmp();
3298# ifdef SIGHASARG
3299 if (lc_signal != SIGINT)
3300# endif
3301 EMSG(_(e_complex));
3302 retval = 0L;
3303 goto inner_end;
3304 }
3305 }
3306#endif
3307
Bram Moolenaar071d4272004-06-13 20:20:40 +00003308 regline = line;
3309 reglnum = 0;
3310 out_of_stack = FALSE;
3311
3312 /* Simplest case: Anchored match need be tried only once. */
3313 if (prog->reganch)
3314 {
3315 int c;
3316
3317#ifdef FEAT_MBYTE
3318 if (has_mbyte)
3319 c = (*mb_ptr2char)(regline + col);
3320 else
3321#endif
3322 c = regline[col];
3323 if (prog->regstart == NUL
3324 || prog->regstart == c
3325 || (ireg_ic && ((
3326#ifdef FEAT_MBYTE
3327 (enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
3328 || (c < 255 && prog->regstart < 255 &&
3329#endif
3330 TOLOWER_LOC(prog->regstart) == TOLOWER_LOC(c)))))
3331 retval = regtry(prog, col);
3332 else
3333 retval = 0;
3334 }
3335 else
3336 {
3337 /* Messy cases: unanchored match. */
3338 while (!got_int && !out_of_stack)
3339 {
3340 if (prog->regstart != NUL)
3341 {
3342 /* Skip until the char we know it must start with. */
3343 s = cstrchr(regline + col, prog->regstart);
3344 if (s == NULL)
3345 {
3346 retval = 0;
3347 break;
3348 }
3349 col = (int)(s - regline);
3350 }
3351
3352 retval = regtry(prog, col);
3353 if (retval > 0)
3354 break;
3355
3356 /* if not currently on the first line, get it again */
3357 if (reglnum != 0)
3358 {
3359 regline = reg_getline((linenr_T)0);
3360 reglnum = 0;
3361 }
3362 if (regline[col] == NUL)
3363 break;
3364#ifdef FEAT_MBYTE
3365 if (has_mbyte)
3366 col += (*mb_ptr2len_check)(regline + col);
3367 else
3368#endif
3369 ++col;
3370 }
3371 }
3372
3373 if (out_of_stack)
Bram Moolenaar748bf032005-02-02 23:04:36 +00003374 EMSG(_(e_outofstack));
Bram Moolenaar071d4272004-06-13 20:20:40 +00003375
Bram Moolenaar748bf032005-02-02 23:04:36 +00003376#ifdef HAVE_SETJMP_H
3377inner_end:
3378 ;
3379#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00003380#ifdef HAVE_TRY_EXCEPT
3381 }
3382 __except(EXCEPTION_EXECUTE_HANDLER)
3383 {
3384 if (GetExceptionCode() == EXCEPTION_STACK_OVERFLOW)
3385 {
3386 RESETSTKOFLW();
Bram Moolenaar748bf032005-02-02 23:04:36 +00003387 EMSG(_(e_outofstack));
Bram Moolenaar071d4272004-06-13 20:20:40 +00003388 }
3389 else
Bram Moolenaar748bf032005-02-02 23:04:36 +00003390 EMSG(_(e_complex));
Bram Moolenaar071d4272004-06-13 20:20:40 +00003391 retval = 0L;
3392 }
3393#endif
Bram Moolenaar748bf032005-02-02 23:04:36 +00003394#ifdef HAVE_SETJMP_H
3395 if (did_mch_startjmp)
3396 mch_endjmp();
3397#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00003398
3399theend:
3400 /* Didn't find a match. */
3401 vim_free(reg_tofree);
Bram Moolenaar071d4272004-06-13 20:20:40 +00003402 return retval;
3403}
3404
3405#ifdef FEAT_SYN_HL
3406static reg_extmatch_T *make_extmatch __ARGS((void));
3407
3408/*
3409 * Create a new extmatch and mark it as referenced once.
3410 */
3411 static reg_extmatch_T *
3412make_extmatch()
3413{
3414 reg_extmatch_T *em;
3415
3416 em = (reg_extmatch_T *)alloc_clear((unsigned)sizeof(reg_extmatch_T));
3417 if (em != NULL)
3418 em->refcnt = 1;
3419 return em;
3420}
3421
3422/*
3423 * Add a reference to an extmatch.
3424 */
3425 reg_extmatch_T *
3426ref_extmatch(em)
3427 reg_extmatch_T *em;
3428{
3429 if (em != NULL)
3430 em->refcnt++;
3431 return em;
3432}
3433
3434/*
3435 * Remove a reference to an extmatch. If there are no references left, free
3436 * the info.
3437 */
3438 void
3439unref_extmatch(em)
3440 reg_extmatch_T *em;
3441{
3442 int i;
3443
3444 if (em != NULL && --em->refcnt <= 0)
3445 {
3446 for (i = 0; i < NSUBEXP; ++i)
3447 vim_free(em->matches[i]);
3448 vim_free(em);
3449 }
3450}
3451#endif
3452
3453/*
3454 * regtry - try match of "prog" with at regline["col"].
3455 * Returns 0 for failure, number of lines contained in the match otherwise.
3456 */
3457 static long
3458regtry(prog, col)
3459 regprog_T *prog;
3460 colnr_T col;
3461{
3462 reginput = regline + col;
3463 need_clear_subexpr = TRUE;
3464#ifdef FEAT_SYN_HL
3465 /* Clear the external match subpointers if necessary. */
3466 if (prog->reghasz == REX_SET)
3467 need_clear_zsubexpr = TRUE;
3468#endif
3469
Bram Moolenaardf177f62005-02-22 08:39:57 +00003470 if (regmatch(prog->program + 1, NULL))
Bram Moolenaar071d4272004-06-13 20:20:40 +00003471 {
3472 cleanup_subexpr();
3473 if (REG_MULTI)
3474 {
3475 if (reg_startpos[0].lnum < 0)
3476 {
3477 reg_startpos[0].lnum = 0;
3478 reg_startpos[0].col = col;
3479 }
3480 if (reg_endpos[0].lnum < 0)
3481 {
3482 reg_endpos[0].lnum = reglnum;
3483 reg_endpos[0].col = (int)(reginput - regline);
3484 }
3485 else
3486 /* Use line number of "\ze". */
3487 reglnum = reg_endpos[0].lnum;
3488 }
3489 else
3490 {
3491 if (reg_startp[0] == NULL)
3492 reg_startp[0] = regline + col;
3493 if (reg_endp[0] == NULL)
3494 reg_endp[0] = reginput;
3495 }
3496#ifdef FEAT_SYN_HL
3497 /* Package any found \z(...\) matches for export. Default is none. */
3498 unref_extmatch(re_extmatch_out);
3499 re_extmatch_out = NULL;
3500
3501 if (prog->reghasz == REX_SET)
3502 {
3503 int i;
3504
3505 cleanup_zsubexpr();
3506 re_extmatch_out = make_extmatch();
3507 for (i = 0; i < NSUBEXP; i++)
3508 {
3509 if (REG_MULTI)
3510 {
3511 /* Only accept single line matches. */
3512 if (reg_startzpos[i].lnum >= 0
3513 && reg_endzpos[i].lnum == reg_startzpos[i].lnum)
3514 re_extmatch_out->matches[i] =
3515 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3516 + reg_startzpos[i].col,
3517 reg_endzpos[i].col - reg_startzpos[i].col);
3518 }
3519 else
3520 {
3521 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3522 re_extmatch_out->matches[i] =
3523 vim_strnsave(reg_startzp[i],
3524 (int)(reg_endzp[i] - reg_startzp[i]));
3525 }
3526 }
3527 }
3528#endif
3529 return 1 + reglnum;
3530 }
3531 return 0;
3532}
3533
3534#ifdef FEAT_MBYTE
Bram Moolenaar071d4272004-06-13 20:20:40 +00003535static int reg_prev_class __ARGS((void));
3536
Bram Moolenaar071d4272004-06-13 20:20:40 +00003537/*
3538 * Get class of previous character.
3539 */
3540 static int
3541reg_prev_class()
3542{
3543 if (reginput > regline)
3544 return mb_get_class(reginput - 1
3545 - (*mb_head_off)(regline, reginput - 1));
3546 return -1;
3547}
3548
Bram Moolenaar071d4272004-06-13 20:20:40 +00003549#endif
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00003550#define ADVANCE_REGINPUT() mb_ptr_adv(reginput)
Bram Moolenaar071d4272004-06-13 20:20:40 +00003551
3552/*
3553 * The arguments from BRACE_LIMITS are stored here. They are actually local
3554 * to regmatch(), but they are here to reduce the amount of stack space used
3555 * (it can be called recursively many times).
3556 */
3557static long bl_minval;
3558static long bl_maxval;
3559
3560/*
3561 * regmatch - main matching routine
3562 *
3563 * Conceptually the strategy is simple: Check to see whether the current
3564 * node matches, call self recursively to see whether the rest matches,
3565 * and then act accordingly. In practice we make some effort to avoid
3566 * recursion, in particular by going through "ordinary" nodes (that don't
3567 * need to know whether the rest of the match failed) by a loop instead of
3568 * by recursion.
3569 *
3570 * Returns TRUE when there is a match. Leaves reginput and reglnum just after
3571 * the last matched character.
3572 * Returns FALSE when there is no match. Leaves reginput and reglnum in an
3573 * undefined state!
3574 */
3575 static int
Bram Moolenaardf177f62005-02-22 08:39:57 +00003576regmatch(scan, startp)
Bram Moolenaar071d4272004-06-13 20:20:40 +00003577 char_u *scan; /* Current node. */
Bram Moolenaardf177f62005-02-22 08:39:57 +00003578 regsave_T *startp; /* start position for BACK */
Bram Moolenaar071d4272004-06-13 20:20:40 +00003579{
3580 char_u *next; /* Next node. */
3581 int op;
3582 int c;
3583
3584#ifdef HAVE_GETRLIMIT
3585 /* Check if we are running out of stack space. Could be caused by
3586 * recursively calling ourselves. */
3587 if (out_of_stack || mch_stackcheck((char *)&op) == FAIL)
3588 {
3589 out_of_stack = TRUE;
3590 return FALSE;
3591 }
3592#endif
3593
3594 /* Some patterns my cause a long time to match, even though they are not
3595 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */
3596 fast_breakcheck();
3597
3598#ifdef DEBUG
3599 if (scan != NULL && regnarrate)
3600 {
3601 mch_errmsg(regprop(scan));
3602 mch_errmsg("(\n");
3603 }
3604#endif
3605 while (scan != NULL)
3606 {
3607 if (got_int || out_of_stack)
3608 return FALSE;
3609#ifdef DEBUG
3610 if (regnarrate)
3611 {
3612 mch_errmsg(regprop(scan));
3613 mch_errmsg("...\n");
3614# ifdef FEAT_SYN_HL
3615 if (re_extmatch_in != NULL)
3616 {
3617 int i;
3618
3619 mch_errmsg(_("External submatches:\n"));
3620 for (i = 0; i < NSUBEXP; i++)
3621 {
3622 mch_errmsg(" \"");
3623 if (re_extmatch_in->matches[i] != NULL)
3624 mch_errmsg(re_extmatch_in->matches[i]);
3625 mch_errmsg("\"\n");
3626 }
3627 }
3628# endif
3629 }
3630#endif
3631 next = regnext(scan);
3632
3633 op = OP(scan);
3634 /* Check for character class with NL added. */
3635 if (WITH_NL(op) && *reginput == NUL && reglnum < reg_maxline)
3636 {
3637 reg_nextline();
3638 }
3639 else if (reg_line_lbr && WITH_NL(op) && *reginput == '\n')
3640 {
3641 ADVANCE_REGINPUT();
3642 }
3643 else
3644 {
3645 if (WITH_NL(op))
3646 op -= ADD_NL;
3647#ifdef FEAT_MBYTE
3648 if (has_mbyte)
3649 c = (*mb_ptr2char)(reginput);
3650 else
3651#endif
3652 c = *reginput;
3653 switch (op)
3654 {
3655 case BOL:
3656 if (reginput != regline)
3657 return FALSE;
3658 break;
3659
3660 case EOL:
3661 if (c != NUL)
3662 return FALSE;
3663 break;
3664
3665 case RE_BOF:
3666 /* Passing -1 to the getline() function provided for the search
3667 * should always return NULL if the current line is the first
3668 * line of the file. */
3669 if (reglnum != 0 || reginput != regline
3670 || (REG_MULTI && reg_getline((linenr_T)-1) != NULL))
3671 return FALSE;
3672 break;
3673
3674 case RE_EOF:
3675 if (reglnum != reg_maxline || c != NUL)
3676 return FALSE;
3677 break;
3678
3679 case CURSOR:
3680 /* Check if the buffer is in a window and compare the
3681 * reg_win->w_cursor position to the match position. */
3682 if (reg_win == NULL
3683 || (reglnum + reg_firstlnum != reg_win->w_cursor.lnum)
3684 || ((colnr_T)(reginput - regline) != reg_win->w_cursor.col))
3685 return FALSE;
3686 break;
3687
3688 case RE_LNUM:
3689 if (!REG_MULTI || !re_num_cmp((long_u)(reglnum + reg_firstlnum),
3690 scan))
3691 return FALSE;
3692 break;
3693
3694 case RE_COL:
3695 if (!re_num_cmp((long_u)(reginput - regline) + 1, scan))
3696 return FALSE;
3697 break;
3698
3699 case RE_VCOL:
3700 if (!re_num_cmp((long_u)win_linetabsize(
3701 reg_win == NULL ? curwin : reg_win,
3702 regline, (colnr_T)(reginput - regline)) + 1, scan))
3703 return FALSE;
3704 break;
3705
3706 case BOW: /* \<word; reginput points to w */
3707 if (c == NUL) /* Can't match at end of line */
3708 return FALSE;
3709#ifdef FEAT_MBYTE
3710 if (has_mbyte)
3711 {
3712 int this_class;
3713
3714 /* Get class of current and previous char (if it exists). */
3715 this_class = mb_get_class(reginput);
3716 if (this_class <= 1)
3717 return FALSE; /* not on a word at all */
3718 if (reg_prev_class() == this_class)
3719 return FALSE; /* previous char is in same word */
3720 }
3721#endif
3722 else
3723 {
3724 if (!vim_iswordc(c)
3725 || (reginput > regline && vim_iswordc(reginput[-1])))
3726 return FALSE;
3727 }
3728 break;
3729
3730 case EOW: /* word\>; reginput points after d */
3731 if (reginput == regline) /* Can't match at start of line */
3732 return FALSE;
3733#ifdef FEAT_MBYTE
3734 if (has_mbyte)
3735 {
3736 int this_class, prev_class;
3737
3738 /* Get class of current and previous char (if it exists). */
3739 this_class = mb_get_class(reginput);
3740 prev_class = reg_prev_class();
3741 if (this_class == prev_class)
3742 return FALSE;
3743 if (prev_class == 0 || prev_class == 1)
3744 return FALSE;
3745 }
3746 else
3747#endif
3748 {
3749 if (!vim_iswordc(reginput[-1]))
3750 return FALSE;
3751 if (reginput[0] != NUL && vim_iswordc(c))
3752 return FALSE;
3753 }
3754 break; /* Matched with EOW */
3755
3756 case ANY:
3757 if (c == NUL)
3758 return FALSE;
3759 ADVANCE_REGINPUT();
3760 break;
3761
3762 case IDENT:
3763 if (!vim_isIDc(c))
3764 return FALSE;
3765 ADVANCE_REGINPUT();
3766 break;
3767
3768 case SIDENT:
3769 if (VIM_ISDIGIT(*reginput) || !vim_isIDc(c))
3770 return FALSE;
3771 ADVANCE_REGINPUT();
3772 break;
3773
3774 case KWORD:
3775 if (!vim_iswordp(reginput))
3776 return FALSE;
3777 ADVANCE_REGINPUT();
3778 break;
3779
3780 case SKWORD:
3781 if (VIM_ISDIGIT(*reginput) || !vim_iswordp(reginput))
3782 return FALSE;
3783 ADVANCE_REGINPUT();
3784 break;
3785
3786 case FNAME:
3787 if (!vim_isfilec(c))
3788 return FALSE;
3789 ADVANCE_REGINPUT();
3790 break;
3791
3792 case SFNAME:
3793 if (VIM_ISDIGIT(*reginput) || !vim_isfilec(c))
3794 return FALSE;
3795 ADVANCE_REGINPUT();
3796 break;
3797
3798 case PRINT:
3799 if (ptr2cells(reginput) != 1)
3800 return FALSE;
3801 ADVANCE_REGINPUT();
3802 break;
3803
3804 case SPRINT:
3805 if (VIM_ISDIGIT(*reginput) || ptr2cells(reginput) != 1)
3806 return FALSE;
3807 ADVANCE_REGINPUT();
3808 break;
3809
3810 case WHITE:
3811 if (!vim_iswhite(c))
3812 return FALSE;
3813 ADVANCE_REGINPUT();
3814 break;
3815
3816 case NWHITE:
3817 if (c == NUL || vim_iswhite(c))
3818 return FALSE;
3819 ADVANCE_REGINPUT();
3820 break;
3821
3822 case DIGIT:
3823 if (!ri_digit(c))
3824 return FALSE;
3825 ADVANCE_REGINPUT();
3826 break;
3827
3828 case NDIGIT:
3829 if (c == NUL || ri_digit(c))
3830 return FALSE;
3831 ADVANCE_REGINPUT();
3832 break;
3833
3834 case HEX:
3835 if (!ri_hex(c))
3836 return FALSE;
3837 ADVANCE_REGINPUT();
3838 break;
3839
3840 case NHEX:
3841 if (c == NUL || ri_hex(c))
3842 return FALSE;
3843 ADVANCE_REGINPUT();
3844 break;
3845
3846 case OCTAL:
3847 if (!ri_octal(c))
3848 return FALSE;
3849 ADVANCE_REGINPUT();
3850 break;
3851
3852 case NOCTAL:
3853 if (c == NUL || ri_octal(c))
3854 return FALSE;
3855 ADVANCE_REGINPUT();
3856 break;
3857
3858 case WORD:
3859 if (!ri_word(c))
3860 return FALSE;
3861 ADVANCE_REGINPUT();
3862 break;
3863
3864 case NWORD:
3865 if (c == NUL || ri_word(c))
3866 return FALSE;
3867 ADVANCE_REGINPUT();
3868 break;
3869
3870 case HEAD:
3871 if (!ri_head(c))
3872 return FALSE;
3873 ADVANCE_REGINPUT();
3874 break;
3875
3876 case NHEAD:
3877 if (c == NUL || ri_head(c))
3878 return FALSE;
3879 ADVANCE_REGINPUT();
3880 break;
3881
3882 case ALPHA:
3883 if (!ri_alpha(c))
3884 return FALSE;
3885 ADVANCE_REGINPUT();
3886 break;
3887
3888 case NALPHA:
3889 if (c == NUL || ri_alpha(c))
3890 return FALSE;
3891 ADVANCE_REGINPUT();
3892 break;
3893
3894 case LOWER:
3895 if (!ri_lower(c))
3896 return FALSE;
3897 ADVANCE_REGINPUT();
3898 break;
3899
3900 case NLOWER:
3901 if (c == NUL || ri_lower(c))
3902 return FALSE;
3903 ADVANCE_REGINPUT();
3904 break;
3905
3906 case UPPER:
3907 if (!ri_upper(c))
3908 return FALSE;
3909 ADVANCE_REGINPUT();
3910 break;
3911
3912 case NUPPER:
3913 if (c == NUL || ri_upper(c))
3914 return FALSE;
3915 ADVANCE_REGINPUT();
3916 break;
3917
3918 case EXACTLY:
3919 {
3920 int len;
3921 char_u *opnd;
3922
3923 opnd = OPERAND(scan);
3924 /* Inline the first byte, for speed. */
3925 if (*opnd != *reginput
3926 && (!ireg_ic || (
3927#ifdef FEAT_MBYTE
3928 !enc_utf8 &&
3929#endif
3930 TOLOWER_LOC(*opnd) != TOLOWER_LOC(*reginput))))
3931 return FALSE;
3932 if (*opnd == NUL)
3933 {
3934 /* match empty string always works; happens when "~" is
3935 * empty. */
3936 }
3937 else if (opnd[1] == NUL
3938#ifdef FEAT_MBYTE
3939 && !(enc_utf8 && ireg_ic)
3940#endif
3941 )
3942 ++reginput; /* matched a single char */
3943 else
3944 {
3945 len = (int)STRLEN(opnd);
3946 /* Need to match first byte again for multi-byte. */
3947 if (cstrncmp(opnd, reginput, &len) != 0)
3948 return FALSE;
3949#ifdef FEAT_MBYTE
3950 /* Check for following composing character. */
3951 if (enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len))
3952 {
3953 /* raaron: This code makes a composing character get
3954 * ignored, which is the correct behavior (sometimes)
3955 * for voweled Hebrew texts. */
3956 if (!ireg_icombine)
3957 return FALSE;
3958 }
3959 else
3960#endif
3961 reginput += len;
3962 }
3963 }
3964 break;
3965
3966 case ANYOF:
3967 case ANYBUT:
3968 if (c == NUL)
3969 return FALSE;
3970 if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3971 return FALSE;
3972 ADVANCE_REGINPUT();
3973 break;
3974
3975#ifdef FEAT_MBYTE
3976 case MULTIBYTECODE:
3977 if (has_mbyte)
3978 {
3979 int i, len;
3980 char_u *opnd;
3981
3982 opnd = OPERAND(scan);
3983 /* Safety check (just in case 'encoding' was changed since
3984 * compiling the program). */
3985 if ((len = (*mb_ptr2len_check)(opnd)) < 2)
3986 return FALSE;
3987 for (i = 0; i < len; ++i)
3988 if (opnd[i] != reginput[i])
3989 return FALSE;
3990 reginput += len;
3991 }
3992 else
3993 return FALSE;
3994 break;
3995#endif
3996
3997 case NOTHING:
3998 break;
3999
4000 case BACK:
Bram Moolenaardf177f62005-02-22 08:39:57 +00004001 /* When we run into BACK without matching something non-empty, we
4002 * fail. */
4003 if (startp != NULL && reg_save_equal(startp))
4004 return FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00004005 break;
4006
4007 case MOPEN + 0: /* Match start: \zs */
4008 case MOPEN + 1: /* \( */
4009 case MOPEN + 2:
4010 case MOPEN + 3:
4011 case MOPEN + 4:
4012 case MOPEN + 5:
4013 case MOPEN + 6:
4014 case MOPEN + 7:
4015 case MOPEN + 8:
4016 case MOPEN + 9:
4017 {
4018 int no;
4019 save_se_T save;
4020
4021 no = op - MOPEN;
4022 cleanup_subexpr();
4023 save_se(&save, &reg_startpos[no], &reg_startp[no]);
4024
Bram Moolenaardf177f62005-02-22 08:39:57 +00004025 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004026 return TRUE;
4027
4028 restore_se(&save, &reg_startpos[no], &reg_startp[no]);
4029 return FALSE;
4030 }
4031 /* break; Not Reached */
4032
4033 case NOPEN: /* \%( */
4034 case NCLOSE: /* \) after \%( */
Bram Moolenaardf177f62005-02-22 08:39:57 +00004035 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004036 return TRUE;
4037 return FALSE;
4038 /* break; Not Reached */
4039
4040#ifdef FEAT_SYN_HL
4041 case ZOPEN + 1:
4042 case ZOPEN + 2:
4043 case ZOPEN + 3:
4044 case ZOPEN + 4:
4045 case ZOPEN + 5:
4046 case ZOPEN + 6:
4047 case ZOPEN + 7:
4048 case ZOPEN + 8:
4049 case ZOPEN + 9:
4050 {
4051 int no;
4052 save_se_T save;
4053
4054 no = op - ZOPEN;
4055 cleanup_zsubexpr();
4056 save_se(&save, &reg_startzpos[no], &reg_startzp[no]);
4057
Bram Moolenaardf177f62005-02-22 08:39:57 +00004058 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004059 return TRUE;
4060
4061 restore_se(&save, &reg_startzpos[no], &reg_startzp[no]);
4062 return FALSE;
4063 }
4064 /* break; Not Reached */
4065#endif
4066
4067 case MCLOSE + 0: /* Match end: \ze */
4068 case MCLOSE + 1: /* \) */
4069 case MCLOSE + 2:
4070 case MCLOSE + 3:
4071 case MCLOSE + 4:
4072 case MCLOSE + 5:
4073 case MCLOSE + 6:
4074 case MCLOSE + 7:
4075 case MCLOSE + 8:
4076 case MCLOSE + 9:
4077 {
4078 int no;
4079 save_se_T save;
4080
4081 no = op - MCLOSE;
4082 cleanup_subexpr();
4083 save_se(&save, &reg_endpos[no], &reg_endp[no]);
4084
Bram Moolenaardf177f62005-02-22 08:39:57 +00004085 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004086 return TRUE;
4087
4088 restore_se(&save, &reg_endpos[no], &reg_endp[no]);
4089 return FALSE;
4090 }
4091 /* break; Not Reached */
4092
4093#ifdef FEAT_SYN_HL
4094 case ZCLOSE + 1: /* \) after \z( */
4095 case ZCLOSE + 2:
4096 case ZCLOSE + 3:
4097 case ZCLOSE + 4:
4098 case ZCLOSE + 5:
4099 case ZCLOSE + 6:
4100 case ZCLOSE + 7:
4101 case ZCLOSE + 8:
4102 case ZCLOSE + 9:
4103 {
4104 int no;
4105 save_se_T save;
4106
4107 no = op - ZCLOSE;
4108 cleanup_zsubexpr();
4109 save_se(&save, &reg_endzpos[no], &reg_endzp[no]);
4110
Bram Moolenaardf177f62005-02-22 08:39:57 +00004111 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004112 return TRUE;
4113
4114 restore_se(&save, &reg_endzpos[no], &reg_endzp[no]);
4115 return FALSE;
4116 }
4117 /* break; Not Reached */
4118#endif
4119
4120 case BACKREF + 1:
4121 case BACKREF + 2:
4122 case BACKREF + 3:
4123 case BACKREF + 4:
4124 case BACKREF + 5:
4125 case BACKREF + 6:
4126 case BACKREF + 7:
4127 case BACKREF + 8:
4128 case BACKREF + 9:
4129 {
4130 int no;
4131 int len;
4132 linenr_T clnum;
4133 colnr_T ccol;
4134 char_u *p;
4135
4136 no = op - BACKREF;
4137 cleanup_subexpr();
4138 if (!REG_MULTI) /* Single-line regexp */
4139 {
4140 if (reg_endp[no] == NULL)
4141 {
4142 /* Backref was not set: Match an empty string. */
4143 len = 0;
4144 }
4145 else
4146 {
4147 /* Compare current input with back-ref in the same
4148 * line. */
4149 len = (int)(reg_endp[no] - reg_startp[no]);
4150 if (cstrncmp(reg_startp[no], reginput, &len) != 0)
4151 return FALSE;
4152 }
4153 }
4154 else /* Multi-line regexp */
4155 {
4156 if (reg_endpos[no].lnum < 0)
4157 {
4158 /* Backref was not set: Match an empty string. */
4159 len = 0;
4160 }
4161 else
4162 {
4163 if (reg_startpos[no].lnum == reglnum
4164 && reg_endpos[no].lnum == reglnum)
4165 {
4166 /* Compare back-ref within the current line. */
4167 len = reg_endpos[no].col - reg_startpos[no].col;
4168 if (cstrncmp(regline + reg_startpos[no].col,
4169 reginput, &len) != 0)
4170 return FALSE;
4171 }
4172 else
4173 {
4174 /* Messy situation: Need to compare between two
4175 * lines. */
4176 ccol = reg_startpos[no].col;
4177 clnum = reg_startpos[no].lnum;
4178 for (;;)
4179 {
4180 /* Since getting one line may invalidate
4181 * the other, need to make copy. Slow! */
4182 if (regline != reg_tofree)
4183 {
4184 len = (int)STRLEN(regline);
4185 if (reg_tofree == NULL
4186 || len >= (int)reg_tofreelen)
4187 {
4188 len += 50; /* get some extra */
4189 vim_free(reg_tofree);
4190 reg_tofree = alloc(len);
4191 if (reg_tofree == NULL)
4192 return FALSE; /* out of memory! */
4193 reg_tofreelen = len;
4194 }
4195 STRCPY(reg_tofree, regline);
4196 reginput = reg_tofree
4197 + (reginput - regline);
4198 regline = reg_tofree;
4199 }
4200
4201 /* Get the line to compare with. */
4202 p = reg_getline(clnum);
4203 if (clnum == reg_endpos[no].lnum)
4204 len = reg_endpos[no].col - ccol;
4205 else
4206 len = (int)STRLEN(p + ccol);
4207
4208 if (cstrncmp(p + ccol, reginput, &len) != 0)
4209 return FALSE; /* doesn't match */
4210 if (clnum == reg_endpos[no].lnum)
4211 break; /* match and at end! */
4212 if (reglnum == reg_maxline)
4213 return FALSE; /* text too short */
4214
4215 /* Advance to next line. */
4216 reg_nextline();
4217 ++clnum;
4218 ccol = 0;
4219 if (got_int || out_of_stack)
4220 return FALSE;
4221 }
4222
4223 /* found a match! Note that regline may now point
4224 * to a copy of the line, that should not matter. */
4225 }
4226 }
4227 }
4228
4229 /* Matched the backref, skip over it. */
4230 reginput += len;
4231 }
4232 break;
4233
4234#ifdef FEAT_SYN_HL
4235 case ZREF + 1:
4236 case ZREF + 2:
4237 case ZREF + 3:
4238 case ZREF + 4:
4239 case ZREF + 5:
4240 case ZREF + 6:
4241 case ZREF + 7:
4242 case ZREF + 8:
4243 case ZREF + 9:
4244 {
4245 int no;
4246 int len;
4247
4248 cleanup_zsubexpr();
4249 no = op - ZREF;
4250 if (re_extmatch_in != NULL
4251 && re_extmatch_in->matches[no] != NULL)
4252 {
4253 len = (int)STRLEN(re_extmatch_in->matches[no]);
4254 if (cstrncmp(re_extmatch_in->matches[no],
4255 reginput, &len) != 0)
4256 return FALSE;
4257 reginput += len;
4258 }
4259 else
4260 {
4261 /* Backref was not set: Match an empty string. */
4262 }
4263 }
4264 break;
4265#endif
4266
4267 case BRANCH:
4268 {
4269 if (OP(next) != BRANCH) /* No choice. */
4270 next = OPERAND(scan); /* Avoid recursion. */
4271 else
4272 {
4273 regsave_T save;
4274
4275 do
4276 {
4277 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004278 if (regmatch(OPERAND(scan), &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004279 return TRUE;
4280 reg_restore(&save);
4281 scan = regnext(scan);
4282 } while (scan != NULL && OP(scan) == BRANCH);
4283 return FALSE;
4284 /* NOTREACHED */
4285 }
4286 }
4287 break;
4288
4289 case BRACE_LIMITS:
4290 {
4291 int no;
4292
4293 if (OP(next) == BRACE_SIMPLE)
4294 {
4295 bl_minval = OPERAND_MIN(scan);
4296 bl_maxval = OPERAND_MAX(scan);
4297 }
4298 else if (OP(next) >= BRACE_COMPLEX
4299 && OP(next) < BRACE_COMPLEX + 10)
4300 {
4301 no = OP(next) - BRACE_COMPLEX;
4302 brace_min[no] = OPERAND_MIN(scan);
4303 brace_max[no] = OPERAND_MAX(scan);
4304 brace_count[no] = 0;
4305 }
4306 else
4307 {
4308 EMSG(_(e_internal)); /* Shouldn't happen */
4309 return FALSE;
4310 }
4311 }
4312 break;
4313
4314 case BRACE_COMPLEX + 0:
4315 case BRACE_COMPLEX + 1:
4316 case BRACE_COMPLEX + 2:
4317 case BRACE_COMPLEX + 3:
4318 case BRACE_COMPLEX + 4:
4319 case BRACE_COMPLEX + 5:
4320 case BRACE_COMPLEX + 6:
4321 case BRACE_COMPLEX + 7:
4322 case BRACE_COMPLEX + 8:
4323 case BRACE_COMPLEX + 9:
4324 {
4325 int no;
4326 regsave_T save;
4327
4328 no = op - BRACE_COMPLEX;
4329 ++brace_count[no];
4330
4331 /* If not matched enough times yet, try one more */
4332 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4333 ? brace_min[no] : brace_max[no]))
4334 {
4335 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004336 if (regmatch(OPERAND(scan), &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004337 return TRUE;
4338 reg_restore(&save);
4339 --brace_count[no]; /* failed, decrement match count */
4340 return FALSE;
4341 }
4342
4343 /* If matched enough times, may try matching some more */
4344 if (brace_min[no] <= brace_max[no])
4345 {
4346 /* Range is the normal way around, use longest match */
4347 if (brace_count[no] <= brace_max[no])
4348 {
4349 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004350 if (regmatch(OPERAND(scan), &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004351 return TRUE; /* matched some more times */
4352 reg_restore(&save);
4353 --brace_count[no]; /* matched just enough times */
Bram Moolenaardf177f62005-02-22 08:39:57 +00004354 /* { continue with the items after \{} */
Bram Moolenaar071d4272004-06-13 20:20:40 +00004355 }
4356 }
4357 else
4358 {
4359 /* Range is backwards, use shortest match first */
4360 if (brace_count[no] <= brace_min[no])
4361 {
4362 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004363 if (regmatch(next, &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004364 return TRUE;
4365 reg_restore(&save);
4366 next = OPERAND(scan);
4367 /* must try to match one more item */
4368 }
4369 }
4370 }
4371 break;
4372
4373 case BRACE_SIMPLE:
4374 case STAR:
4375 case PLUS:
4376 {
4377 int nextb; /* next byte */
4378 int nextb_ic; /* next byte reverse case */
4379 long count;
4380 regsave_T save;
4381 long minval;
4382 long maxval;
4383
4384 /*
4385 * Lookahead to avoid useless match attempts when we know
4386 * what character comes next.
4387 */
4388 if (OP(next) == EXACTLY)
4389 {
4390 nextb = *OPERAND(next);
4391 if (ireg_ic)
4392 {
4393 if (isupper(nextb))
4394 nextb_ic = TOLOWER_LOC(nextb);
4395 else
4396 nextb_ic = TOUPPER_LOC(nextb);
4397 }
4398 else
4399 nextb_ic = nextb;
4400 }
4401 else
4402 {
4403 nextb = NUL;
4404 nextb_ic = NUL;
4405 }
4406 if (op != BRACE_SIMPLE)
4407 {
4408 minval = (op == STAR) ? 0 : 1;
4409 maxval = MAX_LIMIT;
4410 }
4411 else
4412 {
4413 minval = bl_minval;
4414 maxval = bl_maxval;
4415 }
4416
4417 /*
4418 * When maxval > minval, try matching as much as possible, up
4419 * to maxval. When maxval < minval, try matching at least the
4420 * minimal number (since the range is backwards, that's also
4421 * maxval!).
4422 */
4423 count = regrepeat(OPERAND(scan), maxval);
4424 if (got_int)
4425 return FALSE;
4426 if (minval <= maxval)
4427 {
4428 /* Range is the normal way around, use longest match */
4429 while (count >= minval)
4430 {
4431 /* If it could match, try it. */
4432 if (nextb == NUL || *reginput == nextb
4433 || *reginput == nextb_ic)
4434 {
4435 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004436 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004437 return TRUE;
4438 reg_restore(&save);
4439 }
4440 /* Couldn't or didn't match -- back up one char. */
4441 if (--count < minval)
4442 break;
4443 if (reginput == regline)
4444 {
4445 /* backup to last char of previous line */
4446 --reglnum;
4447 regline = reg_getline(reglnum);
4448 /* Just in case regrepeat() didn't count right. */
4449 if (regline == NULL)
4450 return FALSE;
4451 reginput = regline + STRLEN(regline);
4452 fast_breakcheck();
4453 if (got_int || out_of_stack)
4454 return FALSE;
4455 }
4456 else
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004457 mb_ptr_back(regline, reginput);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004458 }
4459 }
4460 else
4461 {
4462 /* Range is backwards, use shortest match first.
4463 * Careful: maxval and minval are exchanged! */
4464 if (count < maxval)
4465 return FALSE;
4466 for (;;)
4467 {
4468 /* If it could work, try it. */
4469 if (nextb == NUL || *reginput == nextb
4470 || *reginput == nextb_ic)
4471 {
4472 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004473 if (regmatch(next, &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004474 return TRUE;
4475 reg_restore(&save);
4476 }
4477 /* Couldn't or didn't match: try advancing one char. */
4478 if (count == minval
4479 || regrepeat(OPERAND(scan), 1L) == 0)
4480 break;
4481 ++count;
4482 if (got_int || out_of_stack)
4483 return FALSE;
4484 }
4485 }
4486 return FALSE;
4487 }
4488 /* break; Not Reached */
4489
4490 case NOMATCH:
4491 {
4492 regsave_T save;
4493
4494 /* If the operand matches, we fail. Otherwise backup and
4495 * continue with the next item. */
4496 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004497 if (regmatch(OPERAND(scan), startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004498 return FALSE;
4499 reg_restore(&save);
4500 }
4501 break;
4502
4503 case MATCH:
4504 case SUBPAT:
4505 {
4506 regsave_T save;
4507
4508 /* If the operand doesn't match, we fail. Otherwise backup
4509 * and continue with the next item. */
4510 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004511 if (!regmatch(OPERAND(scan), startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004512 return FALSE;
4513 if (op == MATCH) /* zero-width */
4514 reg_restore(&save);
4515 }
4516 break;
4517
4518 case BEHIND:
4519 case NOBEHIND:
4520 {
4521 regsave_T save_after, save_start;
4522 regsave_T save_behind_pos;
4523 int needmatch = (op == BEHIND);
4524
4525 /*
4526 * Look back in the input of the operand matches or not. This
4527 * must be done at every position in the input and checking if
4528 * the match ends at the current position.
4529 * First check if the next item matches, that's probably
4530 * faster.
4531 */
4532 reg_save(&save_start);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004533 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004534 {
4535 /* save the position after the found match for next */
4536 reg_save(&save_after);
4537
4538 /* start looking for a match with operand at the current
4539 * postion. Go back one character until we find the
4540 * result, hitting the start of the line or the previous
4541 * line (for multi-line matching).
4542 * Set behind_pos to where the match should end, BHPOS
4543 * will match it. */
4544 save_behind_pos = behind_pos;
4545 behind_pos = save_start;
4546 for (;;)
4547 {
4548 reg_restore(&save_start);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004549 if (regmatch(OPERAND(scan), startp)
Bram Moolenaar071d4272004-06-13 20:20:40 +00004550 && reg_save_equal(&behind_pos))
4551 {
4552 behind_pos = save_behind_pos;
4553 /* found a match that ends where "next" started */
4554 if (needmatch)
4555 {
4556 reg_restore(&save_after);
4557 return TRUE;
4558 }
4559 return FALSE;
4560 }
4561 /*
4562 * No match: Go back one character. May go to
4563 * previous line once.
4564 */
4565 if (REG_MULTI)
4566 {
4567 if (save_start.rs_u.pos.col == 0)
4568 {
4569 if (save_start.rs_u.pos.lnum
4570 < behind_pos.rs_u.pos.lnum
4571 || reg_getline(
4572 --save_start.rs_u.pos.lnum) == NULL)
4573 break;
4574 reg_restore(&save_start);
4575 save_start.rs_u.pos.col =
4576 (colnr_T)STRLEN(regline);
4577 }
4578 else
4579 --save_start.rs_u.pos.col;
4580 }
4581 else
4582 {
4583 if (save_start.rs_u.ptr == regline)
4584 break;
4585 --save_start.rs_u.ptr;
4586 }
4587 }
4588
4589 /* NOBEHIND succeeds when no match was found */
4590 behind_pos = save_behind_pos;
4591 if (!needmatch)
4592 {
4593 reg_restore(&save_after);
4594 return TRUE;
4595 }
4596 }
4597 return FALSE;
4598 }
4599
4600 case BHPOS:
4601 if (REG_MULTI)
4602 {
4603 if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline)
4604 || behind_pos.rs_u.pos.lnum != reglnum)
4605 return FALSE;
4606 }
4607 else if (behind_pos.rs_u.ptr != reginput)
4608 return FALSE;
4609 break;
4610
4611 case NEWL:
4612 if ((c != NUL || reglnum == reg_maxline)
4613 && (c != '\n' || !reg_line_lbr))
4614 return FALSE;
4615 if (reg_line_lbr)
4616 ADVANCE_REGINPUT();
4617 else
4618 reg_nextline();
4619 break;
4620
4621 case END:
4622 return TRUE; /* Success! */
4623
4624 default:
4625 EMSG(_(e_re_corr));
4626#ifdef DEBUG
4627 printf("Illegal op code %d\n", op);
4628#endif
4629 return FALSE;
4630 }
4631 }
4632
4633 scan = next;
4634 }
4635
4636 /*
4637 * We get here only if there's trouble -- normally "case END" is the
4638 * terminating point.
4639 */
4640 EMSG(_(e_re_corr));
4641#ifdef DEBUG
4642 printf("Premature EOL\n");
4643#endif
4644 return FALSE;
4645}
4646
Bram Moolenaar071d4272004-06-13 20:20:40 +00004647/*
4648 * regrepeat - repeatedly match something simple, return how many.
4649 * Advances reginput (and reglnum) to just after the matched chars.
4650 */
4651 static int
4652regrepeat(p, maxcount)
4653 char_u *p;
4654 long maxcount; /* maximum number of matches allowed */
4655{
4656 long count = 0;
4657 char_u *scan;
4658 char_u *opnd;
4659 int mask;
4660 int testval = 0;
4661
4662 scan = reginput; /* Make local copy of reginput for speed. */
4663 opnd = OPERAND(p);
4664 switch (OP(p))
4665 {
4666 case ANY:
4667 case ANY + ADD_NL:
4668 while (count < maxcount)
4669 {
4670 /* Matching anything means we continue until end-of-line (or
4671 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
4672 while (*scan != NUL && count < maxcount)
4673 {
4674 ++count;
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004675 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004676 }
4677 if (!WITH_NL(OP(p)) || reglnum == reg_maxline || count == maxcount)
4678 break;
4679 ++count; /* count the line-break */
4680 reg_nextline();
4681 scan = reginput;
4682 if (got_int)
4683 break;
4684 }
4685 break;
4686
4687 case IDENT:
4688 case IDENT + ADD_NL:
4689 testval = TRUE;
4690 /*FALLTHROUGH*/
4691 case SIDENT:
4692 case SIDENT + ADD_NL:
4693 while (count < maxcount)
4694 {
4695 if (vim_isIDc(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4696 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004697 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004698 }
4699 else if (*scan == NUL)
4700 {
4701 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4702 break;
4703 reg_nextline();
4704 scan = reginput;
4705 if (got_int)
4706 break;
4707 }
4708 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4709 ++scan;
4710 else
4711 break;
4712 ++count;
4713 }
4714 break;
4715
4716 case KWORD:
4717 case KWORD + ADD_NL:
4718 testval = TRUE;
4719 /*FALLTHROUGH*/
4720 case SKWORD:
4721 case SKWORD + ADD_NL:
4722 while (count < maxcount)
4723 {
4724 if (vim_iswordp(scan) && (testval || !VIM_ISDIGIT(*scan)))
4725 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004726 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004727 }
4728 else if (*scan == NUL)
4729 {
4730 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4731 break;
4732 reg_nextline();
4733 scan = reginput;
4734 if (got_int)
4735 break;
4736 }
4737 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4738 ++scan;
4739 else
4740 break;
4741 ++count;
4742 }
4743 break;
4744
4745 case FNAME:
4746 case FNAME + ADD_NL:
4747 testval = TRUE;
4748 /*FALLTHROUGH*/
4749 case SFNAME:
4750 case SFNAME + ADD_NL:
4751 while (count < maxcount)
4752 {
4753 if (vim_isfilec(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4754 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004755 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004756 }
4757 else if (*scan == NUL)
4758 {
4759 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4760 break;
4761 reg_nextline();
4762 scan = reginput;
4763 if (got_int)
4764 break;
4765 }
4766 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4767 ++scan;
4768 else
4769 break;
4770 ++count;
4771 }
4772 break;
4773
4774 case PRINT:
4775 case PRINT + ADD_NL:
4776 testval = TRUE;
4777 /*FALLTHROUGH*/
4778 case SPRINT:
4779 case SPRINT + ADD_NL:
4780 while (count < maxcount)
4781 {
4782 if (*scan == NUL)
4783 {
4784 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4785 break;
4786 reg_nextline();
4787 scan = reginput;
4788 if (got_int)
4789 break;
4790 }
4791 else if (ptr2cells(scan) == 1 && (testval || !VIM_ISDIGIT(*scan)))
4792 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004793 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004794 }
4795 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4796 ++scan;
4797 else
4798 break;
4799 ++count;
4800 }
4801 break;
4802
4803 case WHITE:
4804 case WHITE + ADD_NL:
4805 testval = mask = RI_WHITE;
4806do_class:
4807 while (count < maxcount)
4808 {
4809#ifdef FEAT_MBYTE
4810 int l;
4811#endif
4812 if (*scan == NUL)
4813 {
4814 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4815 break;
4816 reg_nextline();
4817 scan = reginput;
4818 if (got_int)
4819 break;
4820 }
4821#ifdef FEAT_MBYTE
4822 else if (has_mbyte && (l = (*mb_ptr2len_check)(scan)) > 1)
4823 {
4824 if (testval != 0)
4825 break;
4826 scan += l;
4827 }
4828#endif
4829 else if ((class_tab[*scan] & mask) == testval)
4830 ++scan;
4831 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4832 ++scan;
4833 else
4834 break;
4835 ++count;
4836 }
4837 break;
4838
4839 case NWHITE:
4840 case NWHITE + ADD_NL:
4841 mask = RI_WHITE;
4842 goto do_class;
4843 case DIGIT:
4844 case DIGIT + ADD_NL:
4845 testval = mask = RI_DIGIT;
4846 goto do_class;
4847 case NDIGIT:
4848 case NDIGIT + ADD_NL:
4849 mask = RI_DIGIT;
4850 goto do_class;
4851 case HEX:
4852 case HEX + ADD_NL:
4853 testval = mask = RI_HEX;
4854 goto do_class;
4855 case NHEX:
4856 case NHEX + ADD_NL:
4857 mask = RI_HEX;
4858 goto do_class;
4859 case OCTAL:
4860 case OCTAL + ADD_NL:
4861 testval = mask = RI_OCTAL;
4862 goto do_class;
4863 case NOCTAL:
4864 case NOCTAL + ADD_NL:
4865 mask = RI_OCTAL;
4866 goto do_class;
4867 case WORD:
4868 case WORD + ADD_NL:
4869 testval = mask = RI_WORD;
4870 goto do_class;
4871 case NWORD:
4872 case NWORD + ADD_NL:
4873 mask = RI_WORD;
4874 goto do_class;
4875 case HEAD:
4876 case HEAD + ADD_NL:
4877 testval = mask = RI_HEAD;
4878 goto do_class;
4879 case NHEAD:
4880 case NHEAD + ADD_NL:
4881 mask = RI_HEAD;
4882 goto do_class;
4883 case ALPHA:
4884 case ALPHA + ADD_NL:
4885 testval = mask = RI_ALPHA;
4886 goto do_class;
4887 case NALPHA:
4888 case NALPHA + ADD_NL:
4889 mask = RI_ALPHA;
4890 goto do_class;
4891 case LOWER:
4892 case LOWER + ADD_NL:
4893 testval = mask = RI_LOWER;
4894 goto do_class;
4895 case NLOWER:
4896 case NLOWER + ADD_NL:
4897 mask = RI_LOWER;
4898 goto do_class;
4899 case UPPER:
4900 case UPPER + ADD_NL:
4901 testval = mask = RI_UPPER;
4902 goto do_class;
4903 case NUPPER:
4904 case NUPPER + ADD_NL:
4905 mask = RI_UPPER;
4906 goto do_class;
4907
4908 case EXACTLY:
4909 {
4910 int cu, cl;
4911
4912 /* This doesn't do a multi-byte character, because a MULTIBYTECODE
4913 * would have been used for it. */
4914 if (ireg_ic)
4915 {
4916 cu = TOUPPER_LOC(*opnd);
4917 cl = TOLOWER_LOC(*opnd);
4918 while (count < maxcount && (*scan == cu || *scan == cl))
4919 {
4920 count++;
4921 scan++;
4922 }
4923 }
4924 else
4925 {
4926 cu = *opnd;
4927 while (count < maxcount && *scan == cu)
4928 {
4929 count++;
4930 scan++;
4931 }
4932 }
4933 break;
4934 }
4935
4936#ifdef FEAT_MBYTE
4937 case MULTIBYTECODE:
4938 {
4939 int i, len, cf = 0;
4940
4941 /* Safety check (just in case 'encoding' was changed since
4942 * compiling the program). */
4943 if ((len = (*mb_ptr2len_check)(opnd)) > 1)
4944 {
4945 if (ireg_ic && enc_utf8)
4946 cf = utf_fold(utf_ptr2char(opnd));
4947 while (count < maxcount)
4948 {
4949 for (i = 0; i < len; ++i)
4950 if (opnd[i] != scan[i])
4951 break;
4952 if (i < len && (!ireg_ic || !enc_utf8
4953 || utf_fold(utf_ptr2char(scan)) != cf))
4954 break;
4955 scan += len;
4956 ++count;
4957 }
4958 }
4959 }
4960 break;
4961#endif
4962
4963 case ANYOF:
4964 case ANYOF + ADD_NL:
4965 testval = TRUE;
4966 /*FALLTHROUGH*/
4967
4968 case ANYBUT:
4969 case ANYBUT + ADD_NL:
4970 while (count < maxcount)
4971 {
4972#ifdef FEAT_MBYTE
4973 int len;
4974#endif
4975 if (*scan == NUL)
4976 {
4977 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4978 break;
4979 reg_nextline();
4980 scan = reginput;
4981 if (got_int)
4982 break;
4983 }
4984 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4985 ++scan;
4986#ifdef FEAT_MBYTE
4987 else if (has_mbyte && (len = (*mb_ptr2len_check)(scan)) > 1)
4988 {
4989 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
4990 break;
4991 scan += len;
4992 }
4993#endif
4994 else
4995 {
4996 if ((cstrchr(opnd, *scan) == NULL) == testval)
4997 break;
4998 ++scan;
4999 }
5000 ++count;
5001 }
5002 break;
5003
5004 case NEWL:
5005 while (count < maxcount
5006 && ((*scan == NUL && reglnum < reg_maxline)
5007 || (*scan == '\n' && reg_line_lbr)))
5008 {
5009 count++;
5010 if (reg_line_lbr)
5011 ADVANCE_REGINPUT();
5012 else
5013 reg_nextline();
5014 scan = reginput;
5015 if (got_int)
5016 break;
5017 }
5018 break;
5019
5020 default: /* Oh dear. Called inappropriately. */
5021 EMSG(_(e_re_corr));
5022#ifdef DEBUG
5023 printf("Called regrepeat with op code %d\n", OP(p));
5024#endif
5025 break;
5026 }
5027
5028 reginput = scan;
5029
5030 return (int)count;
5031}
5032
5033/*
5034 * regnext - dig the "next" pointer out of a node
5035 */
5036 static char_u *
5037regnext(p)
5038 char_u *p;
5039{
5040 int offset;
5041
5042 if (p == JUST_CALC_SIZE)
5043 return NULL;
5044
5045 offset = NEXT(p);
5046 if (offset == 0)
5047 return NULL;
5048
5049 if (OP(p) == BACK)
5050 return p - offset;
5051 else
5052 return p + offset;
5053}
5054
5055/*
5056 * Check the regexp program for its magic number.
5057 * Return TRUE if it's wrong.
5058 */
5059 static int
5060prog_magic_wrong()
5061{
5062 if (UCHARAT(REG_MULTI
5063 ? reg_mmatch->regprog->program
5064 : reg_match->regprog->program) != REGMAGIC)
5065 {
5066 EMSG(_(e_re_corr));
5067 return TRUE;
5068 }
5069 return FALSE;
5070}
5071
5072/*
5073 * Cleanup the subexpressions, if this wasn't done yet.
5074 * This construction is used to clear the subexpressions only when they are
5075 * used (to increase speed).
5076 */
5077 static void
5078cleanup_subexpr()
5079{
5080 if (need_clear_subexpr)
5081 {
5082 if (REG_MULTI)
5083 {
5084 /* Use 0xff to set lnum to -1 */
5085 vim_memset(reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5086 vim_memset(reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5087 }
5088 else
5089 {
5090 vim_memset(reg_startp, 0, sizeof(char_u *) * NSUBEXP);
5091 vim_memset(reg_endp, 0, sizeof(char_u *) * NSUBEXP);
5092 }
5093 need_clear_subexpr = FALSE;
5094 }
5095}
5096
5097#ifdef FEAT_SYN_HL
5098 static void
5099cleanup_zsubexpr()
5100{
5101 if (need_clear_zsubexpr)
5102 {
5103 if (REG_MULTI)
5104 {
5105 /* Use 0xff to set lnum to -1 */
5106 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5107 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5108 }
5109 else
5110 {
5111 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
5112 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
5113 }
5114 need_clear_zsubexpr = FALSE;
5115 }
5116}
5117#endif
5118
5119/*
5120 * Advance reglnum, regline and reginput to the next line.
5121 */
5122 static void
5123reg_nextline()
5124{
5125 regline = reg_getline(++reglnum);
5126 reginput = regline;
5127 fast_breakcheck();
5128}
5129
5130/*
5131 * Save the input line and position in a regsave_T.
5132 */
5133 static void
5134reg_save(save)
5135 regsave_T *save;
5136{
5137 if (REG_MULTI)
5138 {
5139 save->rs_u.pos.col = (colnr_T)(reginput - regline);
5140 save->rs_u.pos.lnum = reglnum;
5141 }
5142 else
5143 save->rs_u.ptr = reginput;
5144}
5145
5146/*
5147 * Restore the input line and position from a regsave_T.
5148 */
5149 static void
5150reg_restore(save)
5151 regsave_T *save;
5152{
5153 if (REG_MULTI)
5154 {
5155 if (reglnum != save->rs_u.pos.lnum)
5156 {
5157 /* only call reg_getline() when the line number changed to save
5158 * a bit of time */
5159 reglnum = save->rs_u.pos.lnum;
5160 regline = reg_getline(reglnum);
5161 }
5162 reginput = regline + save->rs_u.pos.col;
5163 }
5164 else
5165 reginput = save->rs_u.ptr;
5166}
5167
5168/*
5169 * Return TRUE if current position is equal to saved position.
5170 */
5171 static int
5172reg_save_equal(save)
5173 regsave_T *save;
5174{
5175 if (REG_MULTI)
5176 return reglnum == save->rs_u.pos.lnum
5177 && reginput == regline + save->rs_u.pos.col;
5178 return reginput == save->rs_u.ptr;
5179}
5180
5181/*
5182 * Tentatively set the sub-expression start to the current position (after
5183 * calling regmatch() they will have changed). Need to save the existing
5184 * values for when there is no match.
5185 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
5186 * depending on REG_MULTI.
5187 */
5188 static void
5189save_se_multi(savep, posp)
5190 save_se_T *savep;
5191 lpos_T *posp;
5192{
5193 savep->se_u.pos = *posp;
5194 posp->lnum = reglnum;
5195 posp->col = (colnr_T)(reginput - regline);
5196}
5197
5198 static void
5199save_se_one(savep, pp)
5200 save_se_T *savep;
5201 char_u **pp;
5202{
5203 savep->se_u.ptr = *pp;
5204 *pp = reginput;
5205}
5206
5207/*
5208 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5209 */
5210 static int
5211re_num_cmp(val, scan)
5212 long_u val;
5213 char_u *scan;
5214{
5215 long_u n = OPERAND_MIN(scan);
5216
5217 if (OPERAND_CMP(scan) == '>')
5218 return val > n;
5219 if (OPERAND_CMP(scan) == '<')
5220 return val < n;
5221 return val == n;
5222}
5223
5224
5225#ifdef DEBUG
5226
5227/*
5228 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5229 */
5230 static void
5231regdump(pattern, r)
5232 char_u *pattern;
5233 regprog_T *r;
5234{
5235 char_u *s;
5236 int op = EXACTLY; /* Arbitrary non-END op. */
5237 char_u *next;
5238 char_u *end = NULL;
5239
5240 printf("\r\nregcomp(%s):\r\n", pattern);
5241
5242 s = r->program + 1;
5243 /*
5244 * Loop until we find the END that isn't before a referred next (an END
5245 * can also appear in a NOMATCH operand).
5246 */
5247 while (op != END || s <= end)
5248 {
5249 op = OP(s);
5250 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
5251 next = regnext(s);
5252 if (next == NULL) /* Next ptr. */
5253 printf("(0)");
5254 else
5255 printf("(%d)", (int)((s - r->program) + (next - s)));
5256 if (end < next)
5257 end = next;
5258 if (op == BRACE_LIMITS)
5259 {
5260 /* Two short ints */
5261 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5262 s += 8;
5263 }
5264 s += 3;
5265 if (op == ANYOF || op == ANYOF + ADD_NL
5266 || op == ANYBUT || op == ANYBUT + ADD_NL
5267 || op == EXACTLY)
5268 {
5269 /* Literal string, where present. */
5270 while (*s != NUL)
5271 printf("%c", *s++);
5272 s++;
5273 }
5274 printf("\r\n");
5275 }
5276
5277 /* Header fields of interest. */
5278 if (r->regstart != NUL)
5279 printf("start `%s' 0x%x; ", r->regstart < 256
5280 ? (char *)transchar(r->regstart)
5281 : "multibyte", r->regstart);
5282 if (r->reganch)
5283 printf("anchored; ");
5284 if (r->regmust != NULL)
5285 printf("must have \"%s\"", r->regmust);
5286 printf("\r\n");
5287}
5288
5289/*
5290 * regprop - printable representation of opcode
5291 */
5292 static char_u *
5293regprop(op)
5294 char_u *op;
5295{
5296 char_u *p;
5297 static char_u buf[50];
5298
5299 (void) strcpy(buf, ":");
5300
5301 switch (OP(op))
5302 {
5303 case BOL:
5304 p = "BOL";
5305 break;
5306 case EOL:
5307 p = "EOL";
5308 break;
5309 case RE_BOF:
5310 p = "BOF";
5311 break;
5312 case RE_EOF:
5313 p = "EOF";
5314 break;
5315 case CURSOR:
5316 p = "CURSOR";
5317 break;
5318 case RE_LNUM:
5319 p = "RE_LNUM";
5320 break;
5321 case RE_COL:
5322 p = "RE_COL";
5323 break;
5324 case RE_VCOL:
5325 p = "RE_VCOL";
5326 break;
5327 case BOW:
5328 p = "BOW";
5329 break;
5330 case EOW:
5331 p = "EOW";
5332 break;
5333 case ANY:
5334 p = "ANY";
5335 break;
5336 case ANY + ADD_NL:
5337 p = "ANY+NL";
5338 break;
5339 case ANYOF:
5340 p = "ANYOF";
5341 break;
5342 case ANYOF + ADD_NL:
5343 p = "ANYOF+NL";
5344 break;
5345 case ANYBUT:
5346 p = "ANYBUT";
5347 break;
5348 case ANYBUT + ADD_NL:
5349 p = "ANYBUT+NL";
5350 break;
5351 case IDENT:
5352 p = "IDENT";
5353 break;
5354 case IDENT + ADD_NL:
5355 p = "IDENT+NL";
5356 break;
5357 case SIDENT:
5358 p = "SIDENT";
5359 break;
5360 case SIDENT + ADD_NL:
5361 p = "SIDENT+NL";
5362 break;
5363 case KWORD:
5364 p = "KWORD";
5365 break;
5366 case KWORD + ADD_NL:
5367 p = "KWORD+NL";
5368 break;
5369 case SKWORD:
5370 p = "SKWORD";
5371 break;
5372 case SKWORD + ADD_NL:
5373 p = "SKWORD+NL";
5374 break;
5375 case FNAME:
5376 p = "FNAME";
5377 break;
5378 case FNAME + ADD_NL:
5379 p = "FNAME+NL";
5380 break;
5381 case SFNAME:
5382 p = "SFNAME";
5383 break;
5384 case SFNAME + ADD_NL:
5385 p = "SFNAME+NL";
5386 break;
5387 case PRINT:
5388 p = "PRINT";
5389 break;
5390 case PRINT + ADD_NL:
5391 p = "PRINT+NL";
5392 break;
5393 case SPRINT:
5394 p = "SPRINT";
5395 break;
5396 case SPRINT + ADD_NL:
5397 p = "SPRINT+NL";
5398 break;
5399 case WHITE:
5400 p = "WHITE";
5401 break;
5402 case WHITE + ADD_NL:
5403 p = "WHITE+NL";
5404 break;
5405 case NWHITE:
5406 p = "NWHITE";
5407 break;
5408 case NWHITE + ADD_NL:
5409 p = "NWHITE+NL";
5410 break;
5411 case DIGIT:
5412 p = "DIGIT";
5413 break;
5414 case DIGIT + ADD_NL:
5415 p = "DIGIT+NL";
5416 break;
5417 case NDIGIT:
5418 p = "NDIGIT";
5419 break;
5420 case NDIGIT + ADD_NL:
5421 p = "NDIGIT+NL";
5422 break;
5423 case HEX:
5424 p = "HEX";
5425 break;
5426 case HEX + ADD_NL:
5427 p = "HEX+NL";
5428 break;
5429 case NHEX:
5430 p = "NHEX";
5431 break;
5432 case NHEX + ADD_NL:
5433 p = "NHEX+NL";
5434 break;
5435 case OCTAL:
5436 p = "OCTAL";
5437 break;
5438 case OCTAL + ADD_NL:
5439 p = "OCTAL+NL";
5440 break;
5441 case NOCTAL:
5442 p = "NOCTAL";
5443 break;
5444 case NOCTAL + ADD_NL:
5445 p = "NOCTAL+NL";
5446 break;
5447 case WORD:
5448 p = "WORD";
5449 break;
5450 case WORD + ADD_NL:
5451 p = "WORD+NL";
5452 break;
5453 case NWORD:
5454 p = "NWORD";
5455 break;
5456 case NWORD + ADD_NL:
5457 p = "NWORD+NL";
5458 break;
5459 case HEAD:
5460 p = "HEAD";
5461 break;
5462 case HEAD + ADD_NL:
5463 p = "HEAD+NL";
5464 break;
5465 case NHEAD:
5466 p = "NHEAD";
5467 break;
5468 case NHEAD + ADD_NL:
5469 p = "NHEAD+NL";
5470 break;
5471 case ALPHA:
5472 p = "ALPHA";
5473 break;
5474 case ALPHA + ADD_NL:
5475 p = "ALPHA+NL";
5476 break;
5477 case NALPHA:
5478 p = "NALPHA";
5479 break;
5480 case NALPHA + ADD_NL:
5481 p = "NALPHA+NL";
5482 break;
5483 case LOWER:
5484 p = "LOWER";
5485 break;
5486 case LOWER + ADD_NL:
5487 p = "LOWER+NL";
5488 break;
5489 case NLOWER:
5490 p = "NLOWER";
5491 break;
5492 case NLOWER + ADD_NL:
5493 p = "NLOWER+NL";
5494 break;
5495 case UPPER:
5496 p = "UPPER";
5497 break;
5498 case UPPER + ADD_NL:
5499 p = "UPPER+NL";
5500 break;
5501 case NUPPER:
5502 p = "NUPPER";
5503 break;
5504 case NUPPER + ADD_NL:
5505 p = "NUPPER+NL";
5506 break;
5507 case BRANCH:
5508 p = "BRANCH";
5509 break;
5510 case EXACTLY:
5511 p = "EXACTLY";
5512 break;
5513 case NOTHING:
5514 p = "NOTHING";
5515 break;
5516 case BACK:
5517 p = "BACK";
5518 break;
5519 case END:
5520 p = "END";
5521 break;
5522 case MOPEN + 0:
5523 p = "MATCH START";
5524 break;
5525 case MOPEN + 1:
5526 case MOPEN + 2:
5527 case MOPEN + 3:
5528 case MOPEN + 4:
5529 case MOPEN + 5:
5530 case MOPEN + 6:
5531 case MOPEN + 7:
5532 case MOPEN + 8:
5533 case MOPEN + 9:
5534 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5535 p = NULL;
5536 break;
5537 case MCLOSE + 0:
5538 p = "MATCH END";
5539 break;
5540 case MCLOSE + 1:
5541 case MCLOSE + 2:
5542 case MCLOSE + 3:
5543 case MCLOSE + 4:
5544 case MCLOSE + 5:
5545 case MCLOSE + 6:
5546 case MCLOSE + 7:
5547 case MCLOSE + 8:
5548 case MCLOSE + 9:
5549 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5550 p = NULL;
5551 break;
5552 case BACKREF + 1:
5553 case BACKREF + 2:
5554 case BACKREF + 3:
5555 case BACKREF + 4:
5556 case BACKREF + 5:
5557 case BACKREF + 6:
5558 case BACKREF + 7:
5559 case BACKREF + 8:
5560 case BACKREF + 9:
5561 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5562 p = NULL;
5563 break;
5564 case NOPEN:
5565 p = "NOPEN";
5566 break;
5567 case NCLOSE:
5568 p = "NCLOSE";
5569 break;
5570#ifdef FEAT_SYN_HL
5571 case ZOPEN + 1:
5572 case ZOPEN + 2:
5573 case ZOPEN + 3:
5574 case ZOPEN + 4:
5575 case ZOPEN + 5:
5576 case ZOPEN + 6:
5577 case ZOPEN + 7:
5578 case ZOPEN + 8:
5579 case ZOPEN + 9:
5580 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5581 p = NULL;
5582 break;
5583 case ZCLOSE + 1:
5584 case ZCLOSE + 2:
5585 case ZCLOSE + 3:
5586 case ZCLOSE + 4:
5587 case ZCLOSE + 5:
5588 case ZCLOSE + 6:
5589 case ZCLOSE + 7:
5590 case ZCLOSE + 8:
5591 case ZCLOSE + 9:
5592 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5593 p = NULL;
5594 break;
5595 case ZREF + 1:
5596 case ZREF + 2:
5597 case ZREF + 3:
5598 case ZREF + 4:
5599 case ZREF + 5:
5600 case ZREF + 6:
5601 case ZREF + 7:
5602 case ZREF + 8:
5603 case ZREF + 9:
5604 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5605 p = NULL;
5606 break;
5607#endif
5608 case STAR:
5609 p = "STAR";
5610 break;
5611 case PLUS:
5612 p = "PLUS";
5613 break;
5614 case NOMATCH:
5615 p = "NOMATCH";
5616 break;
5617 case MATCH:
5618 p = "MATCH";
5619 break;
5620 case BEHIND:
5621 p = "BEHIND";
5622 break;
5623 case NOBEHIND:
5624 p = "NOBEHIND";
5625 break;
5626 case SUBPAT:
5627 p = "SUBPAT";
5628 break;
5629 case BRACE_LIMITS:
5630 p = "BRACE_LIMITS";
5631 break;
5632 case BRACE_SIMPLE:
5633 p = "BRACE_SIMPLE";
5634 break;
5635 case BRACE_COMPLEX + 0:
5636 case BRACE_COMPLEX + 1:
5637 case BRACE_COMPLEX + 2:
5638 case BRACE_COMPLEX + 3:
5639 case BRACE_COMPLEX + 4:
5640 case BRACE_COMPLEX + 5:
5641 case BRACE_COMPLEX + 6:
5642 case BRACE_COMPLEX + 7:
5643 case BRACE_COMPLEX + 8:
5644 case BRACE_COMPLEX + 9:
5645 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5646 p = NULL;
5647 break;
5648#ifdef FEAT_MBYTE
5649 case MULTIBYTECODE:
5650 p = "MULTIBYTECODE";
5651 break;
5652#endif
5653 case NEWL:
5654 p = "NEWL";
5655 break;
5656 default:
5657 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5658 p = NULL;
5659 break;
5660 }
5661 if (p != NULL)
5662 (void) strcat(buf, p);
5663 return buf;
5664}
5665#endif
5666
5667#ifdef FEAT_MBYTE
5668static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
5669
/* One table entry: up to three characters, unused slots are zero. */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decomposition of the characters 0xfb20 - 0xfb4f.
 * The entry for character "c" is at index "c - 0xfb20".  Entries marked
 * UNUSED map the character to itself.
 */
decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},                /* 0xfb20       alt ayin */
    {0x5d0,0,0},                /* 0xfb21       alt alef */
    {0x5d3,0,0},                /* 0xfb22       alt dalet */
    {0x5d4,0,0},                /* 0xfb23       alt he */
    {0x5db,0,0},                /* 0xfb24       alt kaf */
    {0x5dc,0,0},                /* 0xfb25       alt lamed */
    {0x5dd,0,0},                /* 0xfb26       alt mem-sofit */
    {0x5e8,0,0},                /* 0xfb27       alt resh */
    {0x5ea,0,0},                /* 0xfb28       alt tav */
    {'+', 0, 0},                /* 0xfb29       alt plus */
    {0x5e9, 0x5c1, 0},          /* 0xfb2a       shin+shin-dot */
    {0x5e9, 0x5c2, 0},          /* 0xfb2b       shin+sin-dot */
    {0x5e9, 0x5c1, 0x5bc},      /* 0xfb2c       shin+shin-dot+dagesh */
    {0x5e9, 0x5c2, 0x5bc},      /* 0xfb2d       shin+sin-dot+dagesh */
    {0x5d0, 0x5b7, 0},          /* 0xfb2e       alef+patah */
    {0x5d0, 0x5b8, 0},          /* 0xfb2f       alef+qamats */
    {0x5d0, 0x5b4, 0},          /* 0xfb30       alef+hiriq */
    {0x5d1, 0x5bc, 0},          /* 0xfb31       bet+dagesh */
    {0x5d2, 0x5bc, 0},          /* 0xfb32       gimel+dagesh */
    {0x5d3, 0x5bc, 0},          /* 0xfb33       dalet+dagesh */
    {0x5d4, 0x5bc, 0},          /* 0xfb34       he+dagesh */
    {0x5d5, 0x5bc, 0},          /* 0xfb35       vav+dagesh */
    {0x5d6, 0x5bc, 0},          /* 0xfb36       zayin+dagesh */
    {0xfb37, 0, 0},             /* 0xfb37 -- UNUSED */
    {0x5d8, 0x5bc, 0},          /* 0xfb38       tet+dagesh */
    {0x5d9, 0x5bc, 0},          /* 0xfb39       yud+dagesh */
    {0x5da, 0x5bc, 0},          /* 0xfb3a       kaf sofit+dagesh */
    {0x5db, 0x5bc, 0},          /* 0xfb3b       kaf+dagesh */
    {0x5dc, 0x5bc, 0},          /* 0xfb3c       lamed+dagesh */
    {0xfb3d, 0, 0},             /* 0xfb3d -- UNUSED */
    {0x5de, 0x5bc, 0},          /* 0xfb3e       mem+dagesh */
    {0xfb3f, 0, 0},             /* 0xfb3f -- UNUSED */
    {0x5e0, 0x5bc, 0},          /* 0xfb40       nun+dagesh */
    {0x5e1, 0x5bc, 0},          /* 0xfb41       samech+dagesh */
    {0xfb42, 0, 0},             /* 0xfb42 -- UNUSED */
    {0x5e3, 0x5bc, 0},          /* 0xfb43       pe sofit+dagesh */
    {0x5e4, 0x5bc,0},           /* 0xfb44       pe+dagesh */
    {0xfb45, 0, 0},             /* 0xfb45 -- UNUSED */
    {0x5e6, 0x5bc, 0},          /* 0xfb46       tsadi+dagesh */
    {0x5e7, 0x5bc, 0},          /* 0xfb47       qof+dagesh */
    {0x5e8, 0x5bc, 0},          /* 0xfb48       resh+dagesh */
    {0x5e9, 0x5bc, 0},          /* 0xfb49       shin+dagesh */
    {0x5ea, 0x5bc, 0},          /* 0xfb4a       tav+dagesh */
    {0x5d5, 0x5b9, 0},          /* 0xfb4b       vav+holam */
    {0x5d1, 0x5bf, 0},          /* 0xfb4c       bet+rafe */
    {0x5db, 0x5bf, 0},          /* 0xfb4d       kaf+rafe */
    {0x5e4, 0x5bf, 0},          /* 0xfb4e       pe+rafe */
    {0x5d0, 0x5dc, 0}           /* 0xfb4f       alef-lamed */
};

/*
 * mb_decompose - split character "c" into up to three characters.
 *
 * For "c" in the range 0xfb20 - 0xfb4f the parts come from decomp_table[];
 * "*c1" gets the first one, "*c2" and "*c3" the following ones or zero.
 * Any other character is returned unchanged in "*c1", with "*c2" and "*c3"
 * set to zero.
 */
    static void
mb_decompose(c, c1, c2, c3)
    int         c, *c1, *c2, *c3;
{
    decomp_T    d;

    /* The lower bound must be the first character of the table: a smaller
     * value would make the index below negative and read outside of
     * decomp_table[]. */
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
        d = decomp_table[c - 0xfb20];
        *c1 = d.a;
        *c2 = d.b;
        *c3 = d.c;
    }
    else
    {
        *c1 = c;
        *c2 = *c3 = 0;
    }
}
5748#endif
5749
5750/*
5751 * Compare two strings, ignore case if ireg_ic set.
5752 * Return 0 if strings match, non-zero otherwise.
5753 * Correct the length "*n" when composing characters are ignored.
5754 */
5755 static int
5756cstrncmp(s1, s2, n)
5757 char_u *s1, *s2;
5758 int *n;
5759{
5760 int result;
5761
5762 if (!ireg_ic)
5763 result = STRNCMP(s1, s2, *n);
5764 else
5765 result = MB_STRNICMP(s1, s2, *n);
5766
5767#ifdef FEAT_MBYTE
5768 /* if it failed and it's utf8 and we want to combineignore: */
5769 if (result != 0 && enc_utf8 && ireg_icombine)
5770 {
5771 char_u *str1, *str2;
5772 int c1, c2, c11, c12;
5773 int ix;
5774 int junk;
5775
5776 /* we have to handle the strcmp ourselves, since it is necessary to
5777 * deal with the composing characters by ignoring them: */
5778 str1 = s1;
5779 str2 = s2;
5780 c1 = c2 = 0;
5781 for (ix = 0; ix < *n; )
5782 {
5783 c1 = mb_ptr2char_adv(&str1);
5784 c2 = mb_ptr2char_adv(&str2);
5785 ix += utf_char2len(c1);
5786
5787 /* decompose the character if necessary, into 'base' characters
5788 * because I don't care about Arabic, I will hard-code the Hebrew
5789 * which I *do* care about! So sue me... */
5790 if (c1 != c2 && (!ireg_ic || utf_fold(c1) != utf_fold(c2)))
5791 {
5792 /* decomposition necessary? */
5793 mb_decompose(c1, &c11, &junk, &junk);
5794 mb_decompose(c2, &c12, &junk, &junk);
5795 c1 = c11;
5796 c2 = c12;
5797 if (c11 != c12 && (!ireg_ic || utf_fold(c11) != utf_fold(c12)))
5798 break;
5799 }
5800 }
5801 result = c2 - c1;
5802 if (result == 0)
5803 *n = (int)(str2 - s2);
5804 }
5805#endif
5806
5807 return result;
5808}
5809
5810/*
5811 * cstrchr: This function is used a lot for simple searches, keep it fast!
5812 */
5813 static char_u *
5814cstrchr(s, c)
5815 char_u *s;
5816 int c;
5817{
5818 char_u *p;
5819 int cc;
5820
5821 if (!ireg_ic
5822#ifdef FEAT_MBYTE
5823 || (!enc_utf8 && mb_char2len(c) > 1)
5824#endif
5825 )
5826 return vim_strchr(s, c);
5827
5828 /* tolower() and toupper() can be slow, comparing twice should be a lot
5829 * faster (esp. when using MS Visual C++!).
5830 * For UTF-8 need to use folded case. */
5831#ifdef FEAT_MBYTE
5832 if (enc_utf8 && c > 0x80)
5833 cc = utf_fold(c);
5834 else
5835#endif
5836 if (isupper(c))
5837 cc = TOLOWER_LOC(c);
5838 else if (islower(c))
5839 cc = TOUPPER_LOC(c);
5840 else
5841 return vim_strchr(s, c);
5842
5843#ifdef FEAT_MBYTE
5844 if (has_mbyte)
5845 {
5846 for (p = s; *p != NUL; p += (*mb_ptr2len_check)(p))
5847 {
5848 if (enc_utf8 && c > 0x80)
5849 {
5850 if (utf_fold(utf_ptr2char(p)) == cc)
5851 return p;
5852 }
5853 else if (*p == c || *p == cc)
5854 return p;
5855 }
5856 }
5857 else
5858#endif
5859 /* Faster version for when there are no multi-byte characters. */
5860 for (p = s; *p != NUL; ++p)
5861 if (*p == c || *p == cc)
5862 return p;
5863
5864 return NULL;
5865}
5866
5867/***************************************************************
5868 * regsub stuff *
5869 ***************************************************************/
5870
5871/* This stuff below really confuses cc on an SGI -- webb */
5872#ifdef __sgi
5873# undef __ARGS
5874# define __ARGS(x) ()
5875#endif
5876
5877/*
5878 * We should define fptr as a pointer to a function returning a pointer to
5879 * a function returning a pointer to a function ...
5880 * This is impossible, so we declare a pointer to a function returning a
5881 * pointer to a function returning void. This should work for all compilers.
5882 */
5883typedef void (*(*fptr) __ARGS((char_u *, int)))();
5884
5885static fptr do_upper __ARGS((char_u *, int));
5886static fptr do_Upper __ARGS((char_u *, int));
5887static fptr do_lower __ARGS((char_u *, int));
5888static fptr do_Lower __ARGS((char_u *, int));
5889
5890static int vim_regsub_both __ARGS((char_u *source, char_u *dest, int copy, int magic, int backslash));
5891
5892 static fptr
5893do_upper(d, c)
5894 char_u *d;
5895 int c;
5896{
5897 *d = TOUPPER_LOC(c);
5898
5899 return (fptr)NULL;
5900}
5901
5902 static fptr
5903do_Upper(d, c)
5904 char_u *d;
5905 int c;
5906{
5907 *d = TOUPPER_LOC(c);
5908
5909 return (fptr)do_Upper;
5910}
5911
5912 static fptr
5913do_lower(d, c)
5914 char_u *d;
5915 int c;
5916{
5917 *d = TOLOWER_LOC(c);
5918
5919 return (fptr)NULL;
5920}
5921
5922 static fptr
5923do_Lower(d, c)
5924 char_u *d;
5925 int c;
5926{
5927 *d = TOLOWER_LOC(c);
5928
5929 return (fptr)do_Lower;
5930}
5931
5932/*
5933 * regtilde(): Replace tildes in the pattern by the old pattern.
5934 *
5935 * Short explanation of the tilde: It stands for the previous replacement
5936 * pattern. If that previous pattern also contains a ~ we should go back a
5937 * step further... But we insert the previous pattern into the current one
5938 * and remember that.
5939 * This still does not handle the case where "magic" changes. TODO?
5940 *
5941 * The tildes are parsed once before the first call to vim_regsub().
5942 */
5943 char_u *
5944regtilde(source, magic)
5945 char_u *source;
5946 int magic;
5947{
5948 char_u *newsub = source;
5949 char_u *tmpsub;
5950 char_u *p;
5951 int len;
5952 int prevlen;
5953
5954 for (p = newsub; *p; ++p)
5955 {
5956 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
5957 {
5958 if (reg_prev_sub != NULL)
5959 {
5960 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
5961 prevlen = (int)STRLEN(reg_prev_sub);
5962 tmpsub = alloc((unsigned)(STRLEN(newsub) + prevlen));
5963 if (tmpsub != NULL)
5964 {
5965 /* copy prefix */
5966 len = (int)(p - newsub); /* not including ~ */
5967 mch_memmove(tmpsub, newsub, (size_t)len);
5968 /* interpretate tilde */
5969 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
5970 /* copy postfix */
5971 if (!magic)
5972 ++p; /* back off \ */
5973 STRCPY(tmpsub + len + prevlen, p + 1);
5974
5975 if (newsub != source) /* already allocated newsub */
5976 vim_free(newsub);
5977 newsub = tmpsub;
5978 p = newsub + len + prevlen;
5979 }
5980 }
5981 else if (magic)
5982 STRCPY(p, p + 1); /* remove '~' */
5983 else
5984 STRCPY(p, p + 2); /* remove '\~' */
5985 --p;
5986 }
5987 else
5988 {
5989 if (*p == '\\' && p[1]) /* skip escaped characters */
5990 ++p;
5991#ifdef FEAT_MBYTE
5992 if (has_mbyte)
5993 p += (*mb_ptr2len_check)(p) - 1;
5994#endif
5995 }
5996 }
5997
5998 vim_free(reg_prev_sub);
5999 if (newsub != source) /* newsub was allocated, just keep it */
6000 reg_prev_sub = newsub;
6001 else /* no ~ found, need to save newsub */
6002 reg_prev_sub = vim_strsave(newsub);
6003 return newsub;
6004}
6005
6006#ifdef FEAT_EVAL
6007static int can_f_submatch = FALSE; /* TRUE when submatch() can be used */
6008
6009/* These pointers are used instead of reg_match and reg_mmatch for
6010 * reg_submatch(). Needed for when the substitution string is an expression
6011 * that contains a call to substitute() and submatch(). */
6012static regmatch_T *submatch_match;
6013static regmmatch_T *submatch_mmatch;
6014#endif
6015
6016#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
6017/*
6018 * vim_regsub() - perform substitutions after a vim_regexec() or
6019 * vim_regexec_multi() match.
6020 *
6021 * If "copy" is TRUE really copy into "dest".
6022 * If "copy" is FALSE nothing is copied, this is just to find out the length
6023 * of the result.
6024 *
6025 * If "backslash" is TRUE, a backslash will be removed later, need to double
6026 * them to keep them, and insert a backslash before a CR to avoid it being
6027 * replaced with a line break later.
6028 *
6029 * Note: The matched text must not change between the call of
6030 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
6031 * references invalid!
6032 *
6033 * Returns the size of the replacement, including terminating NUL.
6034 */
6035 int
6036vim_regsub(rmp, source, dest, copy, magic, backslash)
6037 regmatch_T *rmp;
6038 char_u *source;
6039 char_u *dest;
6040 int copy;
6041 int magic;
6042 int backslash;
6043{
6044 reg_match = rmp;
6045 reg_mmatch = NULL;
6046 reg_maxline = 0;
6047 return vim_regsub_both(source, dest, copy, magic, backslash);
6048}
6049#endif
6050
6051 int
6052vim_regsub_multi(rmp, lnum, source, dest, copy, magic, backslash)
6053 regmmatch_T *rmp;
6054 linenr_T lnum;
6055 char_u *source;
6056 char_u *dest;
6057 int copy;
6058 int magic;
6059 int backslash;
6060{
6061 reg_match = NULL;
6062 reg_mmatch = rmp;
6063 reg_buf = curbuf; /* always works on the current buffer! */
6064 reg_firstlnum = lnum;
6065 reg_maxline = curbuf->b_ml.ml_line_count - lnum;
6066 return vim_regsub_both(source, dest, copy, magic, backslash);
6067}
6068
6069 static int
6070vim_regsub_both(source, dest, copy, magic, backslash)
6071 char_u *source;
6072 char_u *dest;
6073 int copy;
6074 int magic;
6075 int backslash;
6076{
6077 char_u *src;
6078 char_u *dst;
6079 char_u *s;
6080 int c;
6081 int no = -1;
6082 fptr func = (fptr)NULL;
6083 linenr_T clnum = 0; /* init for GCC */
6084 int len = 0; /* init for GCC */
6085#ifdef FEAT_EVAL
6086 static char_u *eval_result = NULL;
6087#endif
6088#ifdef FEAT_MBYTE
6089 int l;
6090#endif
6091
6092
6093 /* Be paranoid... */
6094 if (source == NULL || dest == NULL)
6095 {
6096 EMSG(_(e_null));
6097 return 0;
6098 }
6099 if (prog_magic_wrong())
6100 return 0;
6101 src = source;
6102 dst = dest;
6103
6104 /*
6105 * When the substitute part starts with "\=" evaluate it as an expression.
6106 */
6107 if (source[0] == '\\' && source[1] == '='
6108#ifdef FEAT_EVAL
6109 && !can_f_submatch /* can't do this recursively */
6110#endif
6111 )
6112 {
6113#ifdef FEAT_EVAL
6114 /* To make sure that the length doesn't change between checking the
6115 * length and copying the string, and to speed up things, the
6116 * resulting string is saved from the call with "copy" == FALSE to the
6117 * call with "copy" == TRUE. */
6118 if (copy)
6119 {
6120 if (eval_result != NULL)
6121 {
6122 STRCPY(dest, eval_result);
6123 dst += STRLEN(eval_result);
6124 vim_free(eval_result);
6125 eval_result = NULL;
6126 }
6127 }
6128 else
6129 {
6130 linenr_T save_reg_maxline;
6131 win_T *save_reg_win;
6132 int save_ireg_ic;
6133
6134 vim_free(eval_result);
6135
6136 /* The expression may contain substitute(), which calls us
6137 * recursively. Make sure submatch() gets the text from the first
6138 * level. Don't need to save "reg_buf", because
6139 * vim_regexec_multi() can't be called recursively. */
6140 submatch_match = reg_match;
6141 submatch_mmatch = reg_mmatch;
6142 save_reg_maxline = reg_maxline;
6143 save_reg_win = reg_win;
6144 save_ireg_ic = ireg_ic;
6145 can_f_submatch = TRUE;
6146
6147 eval_result = eval_to_string(source + 2, NULL);
6148 if (eval_result != NULL)
6149 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00006150 for (s = eval_result; *s != NUL; mb_ptr_adv(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00006151 {
6152 /* Change NL to CR, so that it becomes a line break.
6153 * Skip over a backslashed character. */
6154 if (*s == NL)
6155 *s = CAR;
6156 else if (*s == '\\' && s[1] != NUL)
6157 ++s;
Bram Moolenaar071d4272004-06-13 20:20:40 +00006158 }
6159
6160 dst += STRLEN(eval_result);
6161 }
6162
6163 reg_match = submatch_match;
6164 reg_mmatch = submatch_mmatch;
6165 reg_maxline = save_reg_maxline;
6166 reg_win = save_reg_win;
6167 ireg_ic = save_ireg_ic;
6168 can_f_submatch = FALSE;
6169 }
6170#endif
6171 }
6172 else
6173 while ((c = *src++) != NUL)
6174 {
6175 if (c == '&' && magic)
6176 no = 0;
6177 else if (c == '\\' && *src != NUL)
6178 {
6179 if (*src == '&' && !magic)
6180 {
6181 ++src;
6182 no = 0;
6183 }
6184 else if ('0' <= *src && *src <= '9')
6185 {
6186 no = *src++ - '0';
6187 }
6188 else if (vim_strchr((char_u *)"uUlLeE", *src))
6189 {
6190 switch (*src++)
6191 {
6192 case 'u': func = (fptr)do_upper;
6193 continue;
6194 case 'U': func = (fptr)do_Upper;
6195 continue;
6196 case 'l': func = (fptr)do_lower;
6197 continue;
6198 case 'L': func = (fptr)do_Lower;
6199 continue;
6200 case 'e':
6201 case 'E': func = (fptr)NULL;
6202 continue;
6203 }
6204 }
6205 }
6206 if (no < 0) /* Ordinary character. */
6207 {
6208 if (c == '\\' && *src != NUL)
6209 {
6210 /* Check for abbreviations -- webb */
6211 switch (*src)
6212 {
6213 case 'r': c = CAR; ++src; break;
6214 case 'n': c = NL; ++src; break;
6215 case 't': c = TAB; ++src; break;
6216 /* Oh no! \e already has meaning in subst pat :-( */
6217 /* case 'e': c = ESC; ++src; break; */
6218 case 'b': c = Ctrl_H; ++src; break;
6219
6220 /* If "backslash" is TRUE the backslash will be removed
6221 * later. Used to insert a literal CR. */
6222 default: if (backslash)
6223 {
6224 if (copy)
6225 *dst = '\\';
6226 ++dst;
6227 }
6228 c = *src++;
6229 }
6230 }
6231
6232 /* Write to buffer, if copy is set. */
6233#ifdef FEAT_MBYTE
6234 if (has_mbyte && (l = (*mb_ptr2len_check)(src - 1)) > 1)
6235 {
6236 /* TODO: should use "func" here. */
6237 if (copy)
6238 mch_memmove(dst, src - 1, l);
6239 dst += l - 1;
6240 src += l - 1;
6241 }
6242 else
6243 {
6244#endif
6245 if (copy)
6246 {
6247 if (func == (fptr)NULL) /* just copy */
6248 *dst = c;
6249 else /* change case */
6250 func = (fptr)(func(dst, c));
6251 /* Turbo C complains without the typecast */
6252 }
6253#ifdef FEAT_MBYTE
6254 }
6255#endif
6256 dst++;
6257 }
6258 else
6259 {
6260 if (REG_MULTI)
6261 {
6262 clnum = reg_mmatch->startpos[no].lnum;
6263 if (clnum < 0 || reg_mmatch->endpos[no].lnum < 0)
6264 s = NULL;
6265 else
6266 {
6267 s = reg_getline(clnum) + reg_mmatch->startpos[no].col;
6268 if (reg_mmatch->endpos[no].lnum == clnum)
6269 len = reg_mmatch->endpos[no].col
6270 - reg_mmatch->startpos[no].col;
6271 else
6272 len = (int)STRLEN(s);
6273 }
6274 }
6275 else
6276 {
6277 s = reg_match->startp[no];
6278 if (reg_match->endp[no] == NULL)
6279 s = NULL;
6280 else
6281 len = (int)(reg_match->endp[no] - s);
6282 }
6283 if (s != NULL)
6284 {
6285 for (;;)
6286 {
6287 if (len == 0)
6288 {
6289 if (REG_MULTI)
6290 {
6291 if (reg_mmatch->endpos[no].lnum == clnum)
6292 break;
6293 if (copy)
6294 *dst = CAR;
6295 ++dst;
6296 s = reg_getline(++clnum);
6297 if (reg_mmatch->endpos[no].lnum == clnum)
6298 len = reg_mmatch->endpos[no].col;
6299 else
6300 len = (int)STRLEN(s);
6301 }
6302 else
6303 break;
6304 }
6305 else if (*s == NUL) /* we hit NUL. */
6306 {
6307 if (copy)
6308 EMSG(_(e_re_damg));
6309 goto exit;
6310 }
6311 else
6312 {
6313 if (backslash && (*s == CAR || *s == '\\'))
6314 {
6315 /*
6316 * Insert a backslash in front of a CR, otherwise
6317 * it will be replaced by a line break.
6318 * Number of backslashes will be halved later,
6319 * double them here.
6320 */
6321 if (copy)
6322 {
6323 dst[0] = '\\';
6324 dst[1] = *s;
6325 }
6326 dst += 2;
6327 }
6328#ifdef FEAT_MBYTE
6329 else if (has_mbyte && (l = (*mb_ptr2len_check)(s)) > 1)
6330 {
6331 /* TODO: should use "func" here. */
6332 if (copy)
6333 mch_memmove(dst, s, l);
6334 dst += l;
6335 s += l - 1;
6336 len -= l - 1;
6337 }
6338#endif
6339 else
6340 {
6341 if (copy)
6342 {
6343 if (func == (fptr)NULL) /* just copy */
6344 *dst = *s;
6345 else /* change case */
6346 func = (fptr)(func(dst, *s));
6347 /* Turbo C complains without the typecast */
6348 }
6349 ++dst;
6350 }
6351 ++s;
6352 --len;
6353 }
6354 }
6355 }
6356 no = -1;
6357 }
6358 }
6359 if (copy)
6360 *dst = NUL;
6361
6362exit:
6363 return (int)((dst - dest) + 1);
6364}
6365
6366#ifdef FEAT_EVAL
6367/*
6368 * Used for the submatch() function: get the string from tne n'th submatch in
6369 * allocated memory.
6370 * Returns NULL when not in a ":s" command and for a non-existing submatch.
6371 */
6372 char_u *
6373reg_submatch(no)
6374 int no;
6375{
6376 char_u *retval = NULL;
6377 char_u *s;
6378 int len;
6379 int round;
6380 linenr_T lnum;
6381
6382 if (!can_f_submatch)
6383 return NULL;
6384
6385 if (submatch_match == NULL)
6386 {
6387 /*
6388 * First round: compute the length and allocate memory.
6389 * Second round: copy the text.
6390 */
6391 for (round = 1; round <= 2; ++round)
6392 {
6393 lnum = submatch_mmatch->startpos[no].lnum;
6394 if (lnum < 0 || submatch_mmatch->endpos[no].lnum < 0)
6395 return NULL;
6396
6397 s = reg_getline(lnum) + submatch_mmatch->startpos[no].col;
6398 if (s == NULL) /* anti-crash check, cannot happen? */
6399 break;
6400 if (submatch_mmatch->endpos[no].lnum == lnum)
6401 {
6402 /* Within one line: take form start to end col. */
6403 len = submatch_mmatch->endpos[no].col
6404 - submatch_mmatch->startpos[no].col;
6405 if (round == 2)
6406 {
6407 STRNCPY(retval, s, len);
6408 retval[len] = NUL;
6409 }
6410 ++len;
6411 }
6412 else
6413 {
6414 /* Multiple lines: take start line from start col, middle
6415 * lines completely and end line up to end col. */
6416 len = (int)STRLEN(s);
6417 if (round == 2)
6418 {
6419 STRCPY(retval, s);
6420 retval[len] = '\n';
6421 }
6422 ++len;
6423 ++lnum;
6424 while (lnum < submatch_mmatch->endpos[no].lnum)
6425 {
6426 s = reg_getline(lnum++);
6427 if (round == 2)
6428 STRCPY(retval + len, s);
6429 len += (int)STRLEN(s);
6430 if (round == 2)
6431 retval[len] = '\n';
6432 ++len;
6433 }
6434 if (round == 2)
6435 STRNCPY(retval + len, reg_getline(lnum),
6436 submatch_mmatch->endpos[no].col);
6437 len += submatch_mmatch->endpos[no].col;
6438 if (round == 2)
6439 retval[len] = NUL;
6440 ++len;
6441 }
6442
6443 if (round == 1)
6444 {
6445 retval = lalloc((long_u)len, TRUE);
6446 if (s == NULL)
6447 return NULL;
6448 }
6449 }
6450 }
6451 else
6452 {
6453 if (submatch_match->endp[no] == NULL)
6454 retval = NULL;
6455 else
6456 {
6457 s = submatch_match->startp[no];
6458 retval = vim_strnsave(s, (int)(submatch_match->endp[no] - s));
6459 }
6460 }
6461
6462 return retval;
6463}
6464#endif