blob: 6f15824dfb55c0dd78fde1b3ffcfe05f38955798 [file] [log] [blame]
Bram Moolenaar071d4272004-06-13 20:20:40 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 *
5 * NOTICE:
6 *
7 * This is NOT the original regular expression code as written by Henry
8 * Spencer. This code has been modified specifically for use with the VIM
9 * editor, and should not be used separately from Vim. If you want a good
10 * regular expression library, get the original code. The copyright notice
11 * that follows is from the original.
12 *
13 * END NOTICE
14 *
15 * Copyright (c) 1986 by University of Toronto.
16 * Written by Henry Spencer. Not derived from licensed software.
17 *
18 * Permission is granted to anyone to use this software for any
19 * purpose on any computer system, and to redistribute it freely,
20 * subject to the following restrictions:
21 *
22 * 1. The author is not responsible for the consequences of use of
23 * this software, no matter how awful, even if they arise
24 * from defects in it.
25 *
26 * 2. The origin of this software must not be misrepresented, either
27 * by explicit claim or by omission.
28 *
29 * 3. Altered versions must be plainly marked as such, and must not
30 * be misrepresented as being the original software.
31 *
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
35 *
Bram Moolenaarc0197e22004-09-13 20:26:32 +000036 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * Webb, Ciaran McCreesh and Bram Moolenaar.
Bram Moolenaar071d4272004-06-13 20:20:40 +000038 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */
40
41#include "vim.h"
42
43#undef DEBUG
44
45/*
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
49 *
50 * regstart char that must begin a match; NUL if none obvious; Can be a
51 * multi-byte character.
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
55 * regflags RF_ values or'ed together
56 *
57 * Regstart and reganch permit very fast decisions on suitable starting points
58 * for a match, cutting down the work a lot. Regmust permits fast rejection
59 * of lines that cannot possibly match. The regmust tests are costly enough
60 * that vim_regcomp() supplies a regmust only if the r.e. contains something
61 * potentially expensive (at present, the only such thing detected is * or +
62 * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
64 * computing it anyway.
65 */
66
67/*
68 * Structure for regexp "program". This is essentially a linear encoding
69 * of a nondeterministic finite-state machine (aka syntax charts or
70 * "railroad normal form" in parsing technology). Each node is an opcode
71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
72 * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
73 * pointer with a BRANCH on both ends of it is connecting two alternatives.
74 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
75 * (as opposed to a collection of them) is never concatenated with anything
76 * because of operator precedence). The "next" pointer of a BRACES_COMPLEX
Bram Moolenaardf177f62005-02-22 08:39:57 +000077 * node points to the node after the stuff to be repeated.
78 * The operand of some types of node is a literal string; for others, it is a
79 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
80 * is the first node of the branch.
81 * (NB this is *not* a tree structure: the tail of the branch connects to the
82 * thing following the set of BRANCHes.)
Bram Moolenaar071d4272004-06-13 20:20:40 +000083 *
84 * pattern is coded like:
85 *
86 * +-----------------+
87 * | V
88 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
89 * | ^ | ^
90 * +------+ +----------+
91 *
92 *
93 * +------------------+
94 * V |
95 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
96 * | | ^ ^
97 * | +---------------+ |
98 * +---------------------------------------------+
99 *
100 *
Bram Moolenaardf177f62005-02-22 08:39:57 +0000101 * +----------------------+
102 * V |
103 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
104 * | | ^ ^
105 * | +----------+ |
106 * +-------------------------------------------------+
107 *
108 *
Bram Moolenaar071d4272004-06-13 20:20:40 +0000109 * +-------------------------+
110 * V |
111 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
112 * | | ^
113 * | +----------------+
114 * +-----------------------------------------------+
115 *
116 *
117 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
118 * | | ^ ^
119 * | +----------------+ |
120 * +--------------------------------+
121 *
122 * +---------+
123 * | V
124 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
125 * | | | | ^ ^
126 * | | | +-----+ |
127 * | | +----------------+ |
128 * | +---------------------------+ |
129 * +------------------------------------------------------+
130 *
 * They all start with a BRANCH for "\|" alternatives, even when there is only
 * one alternative.
133 */
134
135/*
136 * The opcodes are:
137 */
138
139/* definition number opnd? meaning */
140#define END 0 /* End of program or NOMATCH operand. */
141#define BOL 1 /* Match "" at beginning of line. */
142#define EOL 2 /* Match "" at end of line. */
143#define BRANCH 3 /* node Match this alternative, or the
144 * next... */
145#define BACK 4 /* Match "", "next" ptr points backward. */
146#define EXACTLY 5 /* str Match this string. */
147#define NOTHING 6 /* Match empty string. */
148#define STAR 7 /* node Match this (simple) thing 0 or more
149 * times. */
150#define PLUS 8 /* node Match this (simple) thing 1 or more
151 * times. */
152#define MATCH 9 /* node match the operand zero-width */
153#define NOMATCH 10 /* node check for no match with operand */
154#define BEHIND 11 /* node look behind for a match with operand */
155#define NOBEHIND 12 /* node look behind for no match with operand */
156#define SUBPAT 13 /* node match the operand here */
157#define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and
158 * n times (\{m,n\}). */
159#define BOW 15 /* Match "" after [^a-zA-Z0-9_] */
160#define EOW 16 /* Match "" at [^a-zA-Z0-9_] */
161#define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE
162 * and BRACE_COMPLEX. */
163#define NEWL 18 /* Match line-break */
164#define BHPOS 19 /* End position for BEHIND or NOBEHIND */
165
166
167/* character classes: 20-48 normal, 50-78 include a line-break */
/* Offset added to a character class opcode to get its "also matches a
 * line-break" variant.  FIRST_NL is parenthesized so that it expands safely
 * inside any larger expression. */
#define ADD_NL          30
#define FIRST_NL        (ANY + ADD_NL)
170#define ANY 20 /* Match any one character. */
171#define ANYOF 21 /* str Match any character in this string. */
172#define ANYBUT 22 /* str Match any character not in this
173 * string. */
174#define IDENT 23 /* Match identifier char */
175#define SIDENT 24 /* Match identifier char but no digit */
176#define KWORD 25 /* Match keyword char */
177#define SKWORD 26 /* Match word char but no digit */
178#define FNAME 27 /* Match file name char */
179#define SFNAME 28 /* Match file name char but no digit */
180#define PRINT 29 /* Match printable char */
181#define SPRINT 30 /* Match printable char but no digit */
182#define WHITE 31 /* Match whitespace char */
183#define NWHITE 32 /* Match non-whitespace char */
184#define DIGIT 33 /* Match digit char */
185#define NDIGIT 34 /* Match non-digit char */
186#define HEX 35 /* Match hex char */
187#define NHEX 36 /* Match non-hex char */
188#define OCTAL 37 /* Match octal char */
189#define NOCTAL 38 /* Match non-octal char */
190#define WORD 39 /* Match word char */
191#define NWORD 40 /* Match non-word char */
192#define HEAD 41 /* Match head char */
193#define NHEAD 42 /* Match non-head char */
194#define ALPHA 43 /* Match alpha char */
195#define NALPHA 44 /* Match non-alpha char */
196#define LOWER 45 /* Match lowercase char */
197#define NLOWER 46 /* Match non-lowercase char */
198#define UPPER 47 /* Match uppercase char */
199#define NUPPER 48 /* Match non-uppercase char */
/* Last opcode of the "with line-break" character class range.  Parenthesized
 * so that it expands safely inside any larger expression. */
#define LAST_NL         (NUPPER + ADD_NL)
/* TRUE when opcode "op" is a character class that also matches a line-break. */
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
202
203#define MOPEN 80 /* -89 Mark this point in input as start of
204 * \( subexpr. MOPEN + 0 marks start of
205 * match. */
206#define MCLOSE 90 /* -99 Analogous to MOPEN. MCLOSE + 0 marks
207 * end of match. */
208#define BACKREF 100 /* -109 node Match same string again \1-\9 */
209
210#ifdef FEAT_SYN_HL
211# define ZOPEN 110 /* -119 Mark this point in input as start of
212 * \z( subexpr. */
213# define ZCLOSE 120 /* -129 Analogous to ZOPEN. */
214# define ZREF 130 /* -139 node Match external submatch \z1-\z9 */
215#endif
216
217#define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */
218
219#define NOPEN 150 /* Mark this point in input as start of
220 \%( subexpr. */
221#define NCLOSE 151 /* Analogous to NOPEN. */
222
223#define MULTIBYTECODE 200 /* mbc Match one multi-byte character */
224#define RE_BOF 201 /* Match "" at beginning of file. */
225#define RE_EOF 202 /* Match "" at end of file. */
226#define CURSOR 203 /* Match location of cursor. */
227
228#define RE_LNUM 204 /* nr cmp Match line number */
229#define RE_COL 205 /* nr cmp Match column number */
230#define RE_VCOL 206 /* nr cmp Match virtual column number */
231
232/*
233 * Magic characters have a special meaning, they don't match literally.
234 * Magic characters are negative. This separates them from literal characters
235 * (possibly multi-byte). Only ASCII characters can be Magic.
236 */
237#define Magic(x) ((int)(x) - 256)
238#define un_Magic(x) ((x) + 256)
239#define is_Magic(x) ((x) < 0)
240
241static int no_Magic __ARGS((int x));
242static int toggle_Magic __ARGS((int x));
243
/*
 * Return the non-Magic value of "x": when "x" is Magic the Magic offset is
 * removed, otherwise "x" is returned unchanged.
 */
    static int
no_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : x;
}
252
/*
 * Flip the Magic-ness of "x": a Magic value becomes its plain value, a plain
 * value becomes Magic.
 */
    static int
toggle_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
261
262/*
263 * The first byte of the regexp internal "program" is actually this magic
264 * number; the start node begins in the second byte. It's used to catch the
265 * most severe mutilation of the program by the caller.
266 */
267
268#define REGMAGIC 0234
269
270/*
271 * Opcode notes:
272 *
273 * BRANCH The set of branches constituting a single choice are hooked
274 * together with their "next" pointers, since precedence prevents
275 * anything being concatenated to any individual branch. The
276 * "next" pointer of the last BRANCH in a choice points to the
277 * thing following the whole choice. This is also where the
278 * final "next" pointer of each individual branch points; each
279 * branch starts with the operand node of a BRANCH node.
280 *
281 * BACK Normal "next" pointers all implicitly point forward; BACK
282 * exists to make loop structures possible.
283 *
284 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
285 * BRANCH structures using BACK. Simple cases (one character
286 * per match) are implemented with STAR and PLUS for speed
287 * and to minimize recursive plunges.
288 *
289 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
290 * node, and defines the min and max limits to be used for that
291 * node.
292 *
293 * MOPEN,MCLOSE ...are numbered at compile time.
294 * ZOPEN,ZCLOSE ...ditto
295 */
296
297/*
298 * A node is one char of opcode followed by two chars of "next" pointer.
299 * "Next" pointers are stored as two 8-bit bytes, high order first. The
300 * value is a positive offset from the opcode of the node containing it.
301 * An operand, if any, simply follows the node. (Note that much of the
302 * code generation knows about this implicit relationship.)
303 *
304 * Using two bytes for the "next" pointer is vast overkill for most things,
305 * but allows patterns to get big without disasters.
306 */
307#define OP(p) ((int)*(p))
308#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
309#define OPERAND(p) ((p) + 3)
310/* Obtain an operand that was stored as four bytes, MSB first. */
311#define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
312 + ((long)(p)[5] << 8) + (long)(p)[6])
313/* Obtain a second operand stored as four bytes. */
314#define OPERAND_MAX(p) OPERAND_MIN((p) + 4)
315/* Obtain a second single-byte operand stored after a four bytes operand. */
316#define OPERAND_CMP(p) (p)[7]
317
318/*
319 * Utility definitions.
320 */
321#define UCHARAT(p) ((int)*(char_u *)(p))
322
323/* Used for an error (down from) vim_regcomp(): give the error message, set
324 * rc_did_emsg and return NULL */
325#define EMSG_RET_NULL(m) { EMSG(m); rc_did_emsg = TRUE; return NULL; }
326#define EMSG_M_RET_NULL(m, c) { EMSG2(m, c ? "" : "\\"); rc_did_emsg = TRUE; return NULL; }
327#define EMSG_RET_FAIL(m) { EMSG(m); rc_did_emsg = TRUE; return FAIL; }
328#define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
329
330#define MAX_LIMIT (32767L << 16L)
331
332static int re_multi_type __ARGS((int));
333static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
334static char_u *cstrchr __ARGS((char_u *, int));
335
336#ifdef DEBUG
337static void regdump __ARGS((char_u *, regprog_T *));
338static char_u *regprop __ARGS((char_u *));
339#endif
340
341#define NOT_MULTI 0
342#define MULTI_ONE 1
343#define MULTI_MULT 2
344/*
345 * Return NOT_MULTI if c is not a "multi" operator.
346 * Return MULTI_ONE if c is a single "multi" operator.
347 * Return MULTI_MULT if c is a multi "multi" operator.
348 */
349 static int
350re_multi_type(c)
351 int c;
352{
353 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
354 return MULTI_ONE;
355 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
356 return MULTI_MULT;
357 return NOT_MULTI;
358}
359
360/*
361 * Flags to be passed up and down.
362 */
363#define HASWIDTH 0x1 /* Known never to match null string. */
364#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
365#define SPSTART 0x4 /* Starts with * or +. */
366#define HASNL 0x8 /* Contains some \n. */
367#define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */
368#define WORST 0 /* Worst case. */
369
370/*
371 * When regcode is set to this value, code is not emitted and size is computed
372 * instead.
373 */
374#define JUST_CALC_SIZE ((char_u *) -1)
375
376static char_u *reg_prev_sub;
377
378/*
379 * REGEXP_INRANGE contains all characters which are always special in a []
380 * range after '\'.
381 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
382 * These are:
383 * \n - New line (NL).
384 * \r - Carriage Return (CR).
385 * \t - Tab (TAB).
386 * \e - Escape (ESC).
387 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000388 * \d - Character code in decimal, eg \d123
 *  \o  - Character code in octal, eg \o40
390 * \x - Character code in hex, eg \x4a
391 * \u - Multibyte character code, eg \u20ac
392 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000393 */
394static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000395static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000396
397static int backslash_trans __ARGS((int c));
Bram Moolenaardf177f62005-02-22 08:39:57 +0000398static int get_char_class __ARGS((char_u **pp));
399static int get_equi_class __ARGS((char_u **pp));
400static void reg_equi_class __ARGS((int c));
401static int get_coll_element __ARGS((char_u **pp));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000402static char_u *skip_anyof __ARGS((char_u *p));
403static void init_class_tab __ARGS((void));
404
405/*
406 * Translate '\x' to its control character, except "\n", which is Magic.
407 */
408 static int
409backslash_trans(c)
410 int c;
411{
412 switch (c)
413 {
414 case 'r': return CAR;
415 case 't': return TAB;
416 case 'e': return ESC;
417 case 'b': return BS;
418 }
419 return c;
420}
421
422/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000423 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000424 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
425 * recognized. Otherwise "pp" is advanced to after the item.
426 */
427 static int
Bram Moolenaardf177f62005-02-22 08:39:57 +0000428get_char_class(pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000429 char_u **pp;
430{
431 static const char *(class_names[]) =
432 {
433 "alnum:]",
434#define CLASS_ALNUM 0
435 "alpha:]",
436#define CLASS_ALPHA 1
437 "blank:]",
438#define CLASS_BLANK 2
439 "cntrl:]",
440#define CLASS_CNTRL 3
441 "digit:]",
442#define CLASS_DIGIT 4
443 "graph:]",
444#define CLASS_GRAPH 5
445 "lower:]",
446#define CLASS_LOWER 6
447 "print:]",
448#define CLASS_PRINT 7
449 "punct:]",
450#define CLASS_PUNCT 8
451 "space:]",
452#define CLASS_SPACE 9
453 "upper:]",
454#define CLASS_UPPER 10
455 "xdigit:]",
456#define CLASS_XDIGIT 11
457 "tab:]",
458#define CLASS_TAB 12
459 "return:]",
460#define CLASS_RETURN 13
461 "backspace:]",
462#define CLASS_BACKSPACE 14
463 "escape:]",
464#define CLASS_ESCAPE 15
465 };
466#define CLASS_NONE 99
467 int i;
468
469 if ((*pp)[1] == ':')
470 {
471 for (i = 0; i < sizeof(class_names) / sizeof(*class_names); ++i)
472 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
473 {
474 *pp += STRLEN(class_names[i]) + 2;
475 return i;
476 }
477 }
478 return CLASS_NONE;
479}
480
481/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000482 * Specific version of character class functions.
483 * Using a table to keep this fast.
484 */
485static short class_tab[256];
486
487#define RI_DIGIT 0x01
488#define RI_HEX 0x02
489#define RI_OCTAL 0x04
490#define RI_WORD 0x08
491#define RI_HEAD 0x10
492#define RI_ALPHA 0x20
493#define RI_LOWER 0x40
494#define RI_UPPER 0x80
495#define RI_WHITE 0x100
496
497 static void
498init_class_tab()
499{
500 int i;
501 static int done = FALSE;
502
503 if (done)
504 return;
505
506 for (i = 0; i < 256; ++i)
507 {
508 if (i >= '0' && i <= '7')
509 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
510 else if (i >= '8' && i <= '9')
511 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
512 else if (i >= 'a' && i <= 'f')
513 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
514#ifdef EBCDIC
515 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
516 || (i >= 's' && i <= 'z'))
517#else
518 else if (i >= 'g' && i <= 'z')
519#endif
520 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
521 else if (i >= 'A' && i <= 'F')
522 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
523#ifdef EBCDIC
524 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
525 || (i >= 'S' && i <= 'Z'))
526#else
527 else if (i >= 'G' && i <= 'Z')
528#endif
529 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
530 else if (i == '_')
531 class_tab[i] = RI_WORD + RI_HEAD;
532 else
533 class_tab[i] = 0;
534 }
535 class_tab[' '] |= RI_WHITE;
536 class_tab['\t'] |= RI_WHITE;
537 done = TRUE;
538}
539
540#ifdef FEAT_MBYTE
541# define ri_digit(c) (c < 0x100 && (class_tab[c] & RI_DIGIT))
542# define ri_hex(c) (c < 0x100 && (class_tab[c] & RI_HEX))
543# define ri_octal(c) (c < 0x100 && (class_tab[c] & RI_OCTAL))
544# define ri_word(c) (c < 0x100 && (class_tab[c] & RI_WORD))
545# define ri_head(c) (c < 0x100 && (class_tab[c] & RI_HEAD))
546# define ri_alpha(c) (c < 0x100 && (class_tab[c] & RI_ALPHA))
547# define ri_lower(c) (c < 0x100 && (class_tab[c] & RI_LOWER))
548# define ri_upper(c) (c < 0x100 && (class_tab[c] & RI_UPPER))
549# define ri_white(c) (c < 0x100 && (class_tab[c] & RI_WHITE))
550#else
551# define ri_digit(c) (class_tab[c] & RI_DIGIT)
552# define ri_hex(c) (class_tab[c] & RI_HEX)
553# define ri_octal(c) (class_tab[c] & RI_OCTAL)
554# define ri_word(c) (class_tab[c] & RI_WORD)
555# define ri_head(c) (class_tab[c] & RI_HEAD)
556# define ri_alpha(c) (class_tab[c] & RI_ALPHA)
557# define ri_lower(c) (class_tab[c] & RI_LOWER)
558# define ri_upper(c) (class_tab[c] & RI_UPPER)
559# define ri_white(c) (class_tab[c] & RI_WHITE)
560#endif
561
562/* flags for regflags */
563#define RF_ICASE 1 /* ignore case */
564#define RF_NOICASE 2 /* don't ignore case */
565#define RF_HASNL 4 /* can match a NL */
566#define RF_ICOMBINE 8 /* ignore combining characters */
567#define RF_LOOKBH 16 /* uses "\@<=" or "\@<!" */
568
569/*
570 * Global work variables for vim_regcomp().
571 */
572
573static char_u *regparse; /* Input-scan pointer. */
574static int prevchr_len; /* byte length of previous char */
575static int num_complex_braces; /* Complex \{...} count */
576static int regnpar; /* () count. */
577#ifdef FEAT_SYN_HL
578static int regnzpar; /* \z() count. */
579static int re_has_z; /* \z item detected */
580#endif
581static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
582static long regsize; /* Code size. */
583static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */
584static unsigned regflags; /* RF_ flags for prog */
585static long brace_min[10]; /* Minimums for complex brace repeats */
586static long brace_max[10]; /* Maximums for complex brace repeats */
587static int brace_count[10]; /* Current counts for complex brace repeats */
588#if defined(FEAT_SYN_HL) || defined(PROTO)
589static int had_eol; /* TRUE when EOL found by vim_regcomp() */
590#endif
591static int one_exactly = FALSE; /* only do one char for EXACTLY */
592
593static int reg_magic; /* magicness of the pattern: */
594#define MAGIC_NONE 1 /* "\V" very unmagic */
595#define MAGIC_OFF 2 /* "\M" or 'magic' off */
596#define MAGIC_ON 3 /* "\m" or 'magic' */
597#define MAGIC_ALL 4 /* "\v" very magic */
598
599static int reg_string; /* matching with a string instead of a buffer
600 line */
601
602/*
603 * META contains all characters that may be magic, except '^' and '$'.
604 */
605
606#ifdef EBCDIC
607static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
608#else
609/* META[] is used often enough to justify turning it into a table. */
610static char_u META_flags[] = {
611 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
612 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
613/* % & ( ) * + . */
614 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
615/* 1 2 3 4 5 6 7 8 9 < = > ? */
616 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
617/* @ A C D F H I K L M O */
618 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
619/* P S U V W X Z [ _ */
620 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
621/* a c d f h i k l m n o */
622 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
623/* p s u v w x z { | ~ */
624 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
625};
626#endif
627
628static int curchr;
629
630/* arguments for reg() */
631#define REG_NOPAREN 0 /* toplevel reg() */
632#define REG_PAREN 1 /* \(\) */
633#define REG_ZPAREN 2 /* \z(\) */
634#define REG_NPAREN 3 /* \%(\) */
635
636/*
637 * Forward declarations for vim_regcomp()'s friends.
638 */
639static void initchr __ARGS((char_u *));
640static int getchr __ARGS((void));
641static void skipchr_keepstart __ARGS((void));
642static int peekchr __ARGS((void));
643static void skipchr __ARGS((void));
644static void ungetchr __ARGS((void));
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000645static int gethexchrs __ARGS((int maxinputlen));
646static int getoctchrs __ARGS((void));
647static int getdecchrs __ARGS((void));
648static int coll_get_char __ARGS((void));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000649static void regcomp_start __ARGS((char_u *expr, int flags));
650static char_u *reg __ARGS((int, int *));
651static char_u *regbranch __ARGS((int *flagp));
652static char_u *regconcat __ARGS((int *flagp));
653static char_u *regpiece __ARGS((int *));
654static char_u *regatom __ARGS((int *));
655static char_u *regnode __ARGS((int));
656static int prog_magic_wrong __ARGS((void));
657static char_u *regnext __ARGS((char_u *));
658static void regc __ARGS((int b));
659#ifdef FEAT_MBYTE
660static void regmbc __ARGS((int c));
Bram Moolenaardf177f62005-02-22 08:39:57 +0000661#else
662# define regmbc(c) regc(c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000663#endif
664static void reginsert __ARGS((int, char_u *));
665static void reginsert_limits __ARGS((int, long, long, char_u *));
666static char_u *re_put_long __ARGS((char_u *pr, long_u val));
667static int read_limits __ARGS((long *, long *));
668static void regtail __ARGS((char_u *, char_u *));
669static void regoptail __ARGS((char_u *, char_u *));
670
671/*
672 * Return TRUE if compiled regular expression "prog" can match a line break.
673 */
674 int
675re_multiline(prog)
676 regprog_T *prog;
677{
678 return (prog->regflags & RF_HASNL);
679}
680
681/*
682 * Return TRUE if compiled regular expression "prog" looks before the start
683 * position (pattern contains "\@<=" or "\@<!").
684 */
685 int
686re_lookbehind(prog)
687 regprog_T *prog;
688{
689 return (prog->regflags & RF_LOOKBH);
690}
691
692/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000693 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
694 * Returns a character representing the class. Zero means that no item was
695 * recognized. Otherwise "pp" is advanced to after the item.
696 */
697 static int
698get_equi_class(pp)
699 char_u **pp;
700{
701 int c;
702 int l = 1;
703 char_u *p = *pp;
704
705 if (p[1] == '=')
706 {
707#ifdef FEAT_MBYTE
708 if (has_mbyte)
709 l = mb_ptr2len_check(p + 2);
710#endif
711 if (p[l + 2] == '=' && p[l + 3] == ']')
712 {
713#ifdef FEAT_MBYTE
714 if (has_mbyte)
715 c = mb_ptr2char(p + 2);
716 else
717#endif
718 c = p[2];
719 *pp += l + 4;
720 return c;
721 }
722 }
723 return 0;
724}
725
/*
 * Produce the bytes for equivalence class "c": emit with regmbc() every
 * character that is considered equivalent to "c".
 * Currently only handles latin1, latin9 and utf-8.  When "c" is not in any
 * known class, or another encoding is used, only "c" itself is emitted.
 *
 * The accented characters are written as numeric Latin-1 code points instead
 * of character literals, so that matching does not depend on the encoding of
 * this source file nor on "char" being signed or unsigned.
 */
    static void
reg_equi_class(c)
    int         c;
{
    /* One row per equivalence class: the base letter followed by its
     * accented variants, terminated by zero. */
    static const int equi_classes[][8] =
    {
        {'A', 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0},
        {'C', 0xc7, 0},
        {'E', 0xc8, 0xc9, 0xca, 0xcb, 0},
        {'I', 0xcc, 0xcd, 0xce, 0xcf, 0},
        {'N', 0xd1, 0},
        {'O', 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0},
        {'U', 0xd9, 0xda, 0xdb, 0xdc, 0},
        {'Y', 0xdd, 0},
        {'a', 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0},
        {'c', 0xe7, 0},
        {'e', 0xe8, 0xe9, 0xea, 0xeb, 0},
        {'i', 0xec, 0xed, 0xee, 0xef, 0},
        {'n', 0xf1, 0},
        {'o', 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0},
        {'u', 0xf9, 0xfa, 0xfb, 0xfc, 0},
        {'y', 0xfd, 0xff, 0},
    };
    int         row;
    int         col;
    int         k;

#ifdef FEAT_MBYTE
    if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
                                         || STRCMP(p_enc, "latin9") == 0)
#endif
    {
        for (row = 0; row < (int)(sizeof(equi_classes)
                                      / sizeof(equi_classes[0])); ++row)
            for (col = 0; equi_classes[row][col] != 0; ++col)
                if (equi_classes[row][col] == c)
                {
                    /* Found the class of "c": emit all its members. */
                    for (k = 0; equi_classes[row][k] != 0; ++k)
                        regmbc(equi_classes[row][k]);
                    return;
                }
    }
    regmbc(c);
}
805
806/*
807 * Check for a collating element "[.a.]". "pp" points to the '['.
808 * Returns a character. Zero means that no item was recognized. Otherwise
809 * "pp" is advanced to after the item.
810 * Currently only single characters are recognized!
811 */
812 static int
813get_coll_element(pp)
814 char_u **pp;
815{
816 int c;
817 int l = 1;
818 char_u *p = *pp;
819
820 if (p[1] == '.')
821 {
822#ifdef FEAT_MBYTE
823 if (has_mbyte)
824 l = mb_ptr2len_check(p + 2);
825#endif
826 if (p[l + 2] == '.' && p[l + 3] == ']')
827 {
828#ifdef FEAT_MBYTE
829 if (has_mbyte)
830 c = mb_ptr2char(p + 2);
831 else
832#endif
833 c = p[2];
834 *pp += l + 4;
835 return c;
836 }
837 }
838 return 0;
839}
840
841
842/*
843 * Skip over a "[]" range.
844 * "p" must point to the character after the '['.
845 * The returned pointer is on the matching ']', or the terminating NUL.
846 */
847 static char_u *
848skip_anyof(p)
849 char_u *p;
850{
851 int cpo_lit; /* 'cpoptions' contains 'l' flag */
852 int cpo_bsl; /* 'cpoptions' contains '\' flag */
853#ifdef FEAT_MBYTE
854 int l;
855#endif
856
857 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
858 cpo_bsl = (!reg_syn && vim_strchr(p_cpo, CPO_BACKSL) != NULL);
859
860 if (*p == '^') /* Complement of range. */
861 ++p;
862 if (*p == ']' || *p == '-')
863 ++p;
864 while (*p != NUL && *p != ']')
865 {
866#ifdef FEAT_MBYTE
867 if (has_mbyte && (l = (*mb_ptr2len_check)(p)) > 1)
868 p += l;
869 else
870#endif
871 if (*p == '-')
872 {
873 ++p;
874 if (*p != ']' && *p != NUL)
875 mb_ptr_adv(p);
876 }
877 else if (*p == '\\'
878 && !cpo_bsl
879 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
880 || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
881 p += 2;
882 else if (*p == '[')
883 {
884 if (get_char_class(&p) == CLASS_NONE
885 && get_equi_class(&p) == 0
886 && get_coll_element(&p) == 0)
887 ++p; /* It was not a class name */
888 }
889 else
890 ++p;
891 }
892
893 return p;
894}
895
896/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000897 * Skip past regular expression.
Bram Moolenaar748bf032005-02-02 23:04:36 +0000898 * Stop at end of "startp" or where "dirc" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000899 * Take care of characters with a backslash in front of it.
900 * Skip strings inside [ and ].
901 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
902 * expression and change "\?" to "?". If "*newp" is not NULL the expression
903 * is changed in-place.
904 */
905 char_u *
906skip_regexp(startp, dirc, magic, newp)
907 char_u *startp;
908 int dirc;
909 int magic;
910 char_u **newp;
911{
912 int mymagic;
913 char_u *p = startp;
914
915 if (magic)
916 mymagic = MAGIC_ON;
917 else
918 mymagic = MAGIC_OFF;
919
Bram Moolenaar1cd871b2004-12-19 22:46:22 +0000920 for (; p[0] != NUL; mb_ptr_adv(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000921 {
922 if (p[0] == dirc) /* found end of regexp */
923 break;
924 if ((p[0] == '[' && mymagic >= MAGIC_ON)
925 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
926 {
927 p = skip_anyof(p + 1);
928 if (p[0] == NUL)
929 break;
930 }
931 else if (p[0] == '\\' && p[1] != NUL)
932 {
933 if (dirc == '?' && newp != NULL && p[1] == '?')
934 {
935 /* change "\?" to "?", make a copy first. */
936 if (*newp == NULL)
937 {
938 *newp = vim_strsave(startp);
939 if (*newp != NULL)
940 p = *newp + (p - startp);
941 }
942 if (*newp != NULL)
943 mch_memmove(p, p + 1, STRLEN(p));
944 else
945 ++p;
946 }
947 else
948 ++p; /* skip next character */
949 if (*p == 'v')
950 mymagic = MAGIC_ALL;
951 else if (*p == 'V')
952 mymagic = MAGIC_NONE;
953 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000954 }
955 return p;
956}
957
958/*
Bram Moolenaar86b68352004-12-27 21:59:20 +0000959 * vim_regcomp() - compile a regular expression into internal code
960 * Returns the program in allocated space. Returns NULL for an error.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000961 *
962 * We can't allocate space until we know how big the compiled form will be,
963 * but we can't compile it (and thus know how big it is) until we've got a
964 * place to put the code. So we cheat: we compile it twice, once with code
965 * generation turned off and size counting turned on, and once "for real".
966 * This also means that we don't allocate space until we are sure that the
967 * thing really will compile successfully, and we never have to move the
968 * code and thus invalidate pointers into it. (Note that it has to be in
969 * one piece because vim_free() must be able to free it all.)
970 *
971 * Whether upper/lower case is to be ignored is decided when executing the
972 * program, it does not matter here.
973 *
974 * Beware that the optimization-preparation code in here knows about some
975 * of the structure of the compiled regexp.
976 * "re_flags": RE_MAGIC and/or RE_STRING.
977 */
978 regprog_T *
979vim_regcomp(expr, re_flags)
980 char_u *expr;
981 int re_flags;
982{
983 regprog_T *r;
984 char_u *scan;
985 char_u *longest;
986 int len;
987 int flags;
988
989 if (expr == NULL)
990 EMSG_RET_NULL(_(e_null));
991
992 init_class_tab();
993
994 /*
995 * First pass: determine size, legality.
996 */
997 regcomp_start(expr, re_flags);
998 regcode = JUST_CALC_SIZE;
999 regc(REGMAGIC);
1000 if (reg(REG_NOPAREN, &flags) == NULL)
1001 return NULL;
1002
1003 /* Small enough for pointer-storage convention? */
1004#ifdef SMALL_MALLOC /* 16 bit storage allocation */
1005 if (regsize >= 65536L - 256L)
1006 EMSG_RET_NULL(_("E339: Pattern too long"));
1007#endif
1008
1009 /* Allocate space. */
1010 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE);
1011 if (r == NULL)
1012 return NULL;
1013
1014 /*
1015 * Second pass: emit code.
1016 */
1017 regcomp_start(expr, re_flags);
1018 regcode = r->program;
1019 regc(REGMAGIC);
1020 if (reg(REG_NOPAREN, &flags) == NULL)
1021 {
1022 vim_free(r);
1023 return NULL;
1024 }
1025
1026 /* Dig out information for optimizations. */
1027 r->regstart = NUL; /* Worst-case defaults. */
1028 r->reganch = 0;
1029 r->regmust = NULL;
1030 r->regmlen = 0;
1031 r->regflags = regflags;
1032 if (flags & HASNL)
1033 r->regflags |= RF_HASNL;
1034 if (flags & HASLOOKBH)
1035 r->regflags |= RF_LOOKBH;
1036#ifdef FEAT_SYN_HL
1037 /* Remember whether this pattern has any \z specials in it. */
1038 r->reghasz = re_has_z;
1039#endif
1040 scan = r->program + 1; /* First BRANCH. */
1041 if (OP(regnext(scan)) == END) /* Only one top-level choice. */
1042 {
1043 scan = OPERAND(scan);
1044
1045 /* Starting-point info. */
1046 if (OP(scan) == BOL || OP(scan) == RE_BOF)
1047 {
1048 r->reganch++;
1049 scan = regnext(scan);
1050 }
1051
1052 if (OP(scan) == EXACTLY)
1053 {
1054#ifdef FEAT_MBYTE
1055 if (has_mbyte)
1056 r->regstart = (*mb_ptr2char)(OPERAND(scan));
1057 else
1058#endif
1059 r->regstart = *OPERAND(scan);
1060 }
1061 else if ((OP(scan) == BOW
1062 || OP(scan) == EOW
1063 || OP(scan) == NOTHING
1064 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
1065 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
1066 && OP(regnext(scan)) == EXACTLY)
1067 {
1068#ifdef FEAT_MBYTE
1069 if (has_mbyte)
1070 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
1071 else
1072#endif
1073 r->regstart = *OPERAND(regnext(scan));
1074 }
1075
1076 /*
1077 * If there's something expensive in the r.e., find the longest
1078 * literal string that must appear and make it the regmust. Resolve
1079 * ties in favor of later strings, since the regstart check works
1080 * with the beginning of the r.e. and avoiding duplication
1081 * strengthens checking. Not a strong reason, but sufficient in the
1082 * absence of others.
1083 */
1084 /*
1085 * When the r.e. starts with BOW, it is faster to look for a regmust
1086 * first. Used a lot for "#" and "*" commands. (Added by mool).
1087 */
1088 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
1089 && !(flags & HASNL))
1090 {
1091 longest = NULL;
1092 len = 0;
1093 for (; scan != NULL; scan = regnext(scan))
1094 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
1095 {
1096 longest = OPERAND(scan);
1097 len = (int)STRLEN(OPERAND(scan));
1098 }
1099 r->regmust = longest;
1100 r->regmlen = len;
1101 }
1102 }
1103#ifdef DEBUG
1104 regdump(expr, r);
1105#endif
1106 return r;
1107}
1108
1109/*
1110 * Setup to parse the regexp. Used once to get the length and once to do it.
1111 */
1112 static void
1113regcomp_start(expr, re_flags)
1114 char_u *expr;
1115 int re_flags; /* see vim_regcomp() */
1116{
1117 initchr(expr);
1118 if (re_flags & RE_MAGIC)
1119 reg_magic = MAGIC_ON;
1120 else
1121 reg_magic = MAGIC_OFF;
1122 reg_string = (re_flags & RE_STRING);
1123
1124 num_complex_braces = 0;
1125 regnpar = 1;
1126 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
1127#ifdef FEAT_SYN_HL
1128 regnzpar = 1;
1129 re_has_z = 0;
1130#endif
1131 regsize = 0L;
1132 regflags = 0;
1133#if defined(FEAT_SYN_HL) || defined(PROTO)
1134 had_eol = FALSE;
1135#endif
1136}
1137
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Return the "had_eol" flag: TRUE when the EOL item "$" was seen since
 * regcomp_start() last cleared it, i.e. during the previous call to
 * vim_regcomp().  Passing this through a global is messy, but it works.
 */
    int
vim_regcomp_had_eol()
{
    return had_eol;
}
#endif
1149
1150/*
1151 * reg - regular expression, i.e. main body or parenthesized thing
1152 *
1153 * Caller must absorb opening parenthesis.
1154 *
1155 * Combining parenthesis handling with the base level of regular expression
1156 * is a trifle forced, but the need to tie the tails of the branches to what
1157 * follows makes it hard to avoid.
1158 */
1159 static char_u *
1160reg(paren, flagp)
1161 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
1162 int *flagp;
1163{
1164 char_u *ret;
1165 char_u *br;
1166 char_u *ender;
1167 int parno = 0;
1168 int flags;
1169
1170 *flagp = HASWIDTH; /* Tentatively. */
1171
1172#ifdef FEAT_SYN_HL
1173 if (paren == REG_ZPAREN)
1174 {
1175 /* Make a ZOPEN node. */
1176 if (regnzpar >= NSUBEXP)
1177 EMSG_RET_NULL(_("E50: Too many \\z("));
1178 parno = regnzpar;
1179 regnzpar++;
1180 ret = regnode(ZOPEN + parno);
1181 }
1182 else
1183#endif
1184 if (paren == REG_PAREN)
1185 {
1186 /* Make a MOPEN node. */
1187 if (regnpar >= NSUBEXP)
1188 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1189 parno = regnpar;
1190 ++regnpar;
1191 ret = regnode(MOPEN + parno);
1192 }
1193 else if (paren == REG_NPAREN)
1194 {
1195 /* Make a NOPEN node. */
1196 ret = regnode(NOPEN);
1197 }
1198 else
1199 ret = NULL;
1200
1201 /* Pick up the branches, linking them together. */
1202 br = regbranch(&flags);
1203 if (br == NULL)
1204 return NULL;
1205 if (ret != NULL)
1206 regtail(ret, br); /* [MZ]OPEN -> first. */
1207 else
1208 ret = br;
1209 /* If one of the branches can be zero-width, the whole thing can.
1210 * If one of the branches has * at start or matches a line-break, the
1211 * whole thing can. */
1212 if (!(flags & HASWIDTH))
1213 *flagp &= ~HASWIDTH;
1214 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1215 while (peekchr() == Magic('|'))
1216 {
1217 skipchr();
1218 br = regbranch(&flags);
1219 if (br == NULL)
1220 return NULL;
1221 regtail(ret, br); /* BRANCH -> BRANCH. */
1222 if (!(flags & HASWIDTH))
1223 *flagp &= ~HASWIDTH;
1224 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1225 }
1226
1227 /* Make a closing node, and hook it on the end. */
1228 ender = regnode(
1229#ifdef FEAT_SYN_HL
1230 paren == REG_ZPAREN ? ZCLOSE + parno :
1231#endif
1232 paren == REG_PAREN ? MCLOSE + parno :
1233 paren == REG_NPAREN ? NCLOSE : END);
1234 regtail(ret, ender);
1235
1236 /* Hook the tails of the branches to the closing node. */
1237 for (br = ret; br != NULL; br = regnext(br))
1238 regoptail(br, ender);
1239
1240 /* Check for proper termination. */
1241 if (paren != REG_NOPAREN && getchr() != Magic(')'))
1242 {
1243#ifdef FEAT_SYN_HL
1244 if (paren == REG_ZPAREN)
1245 EMSG_RET_NULL(_("E52: Unmatched \\z("))
1246 else
1247#endif
1248 if (paren == REG_NPAREN)
1249 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL)
1250 else
1251 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL)
1252 }
1253 else if (paren == REG_NOPAREN && peekchr() != NUL)
1254 {
1255 if (curchr == Magic(')'))
1256 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL)
1257 else
1258 EMSG_RET_NULL(_(e_trailing)) /* "Can't happen". */
1259 /* NOTREACHED */
1260 }
1261 /*
1262 * Here we set the flag allowing back references to this set of
1263 * parentheses.
1264 */
1265 if (paren == REG_PAREN)
1266 had_endbrace[parno] = TRUE; /* have seen the close paren */
1267 return ret;
1268}
1269
1270/*
1271 * regbranch - one alternative of an | operator
1272 *
1273 * Implements the & operator.
1274 */
1275 static char_u *
1276regbranch(flagp)
1277 int *flagp;
1278{
1279 char_u *ret;
1280 char_u *chain = NULL;
1281 char_u *latest;
1282 int flags;
1283
1284 *flagp = WORST | HASNL; /* Tentatively. */
1285
1286 ret = regnode(BRANCH);
1287 for (;;)
1288 {
1289 latest = regconcat(&flags);
1290 if (latest == NULL)
1291 return NULL;
1292 /* If one of the branches has width, the whole thing has. If one of
1293 * the branches anchors at start-of-line, the whole thing does.
1294 * If one of the branches uses look-behind, the whole thing does. */
1295 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1296 /* If one of the branches doesn't match a line-break, the whole thing
1297 * doesn't. */
1298 *flagp &= ~HASNL | (flags & HASNL);
1299 if (chain != NULL)
1300 regtail(chain, latest);
1301 if (peekchr() != Magic('&'))
1302 break;
1303 skipchr();
1304 regtail(latest, regnode(END)); /* operand ends */
1305 reginsert(MATCH, latest);
1306 chain = latest;
1307 }
1308
1309 return ret;
1310}
1311
1312/*
1313 * regbranch - one alternative of an | or & operator
1314 *
1315 * Implements the concatenation operator.
1316 */
1317 static char_u *
1318regconcat(flagp)
1319 int *flagp;
1320{
1321 char_u *first = NULL;
1322 char_u *chain = NULL;
1323 char_u *latest;
1324 int flags;
1325 int cont = TRUE;
1326
1327 *flagp = WORST; /* Tentatively. */
1328
1329 while (cont)
1330 {
1331 switch (peekchr())
1332 {
1333 case NUL:
1334 case Magic('|'):
1335 case Magic('&'):
1336 case Magic(')'):
1337 cont = FALSE;
1338 break;
1339 case Magic('Z'):
1340#ifdef FEAT_MBYTE
1341 regflags |= RF_ICOMBINE;
1342#endif
1343 skipchr_keepstart();
1344 break;
1345 case Magic('c'):
1346 regflags |= RF_ICASE;
1347 skipchr_keepstart();
1348 break;
1349 case Magic('C'):
1350 regflags |= RF_NOICASE;
1351 skipchr_keepstart();
1352 break;
1353 case Magic('v'):
1354 reg_magic = MAGIC_ALL;
1355 skipchr_keepstart();
1356 curchr = -1;
1357 break;
1358 case Magic('m'):
1359 reg_magic = MAGIC_ON;
1360 skipchr_keepstart();
1361 curchr = -1;
1362 break;
1363 case Magic('M'):
1364 reg_magic = MAGIC_OFF;
1365 skipchr_keepstart();
1366 curchr = -1;
1367 break;
1368 case Magic('V'):
1369 reg_magic = MAGIC_NONE;
1370 skipchr_keepstart();
1371 curchr = -1;
1372 break;
1373 default:
1374 latest = regpiece(&flags);
1375 if (latest == NULL)
1376 return NULL;
1377 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1378 if (chain == NULL) /* First piece. */
1379 *flagp |= flags & SPSTART;
1380 else
1381 regtail(chain, latest);
1382 chain = latest;
1383 if (first == NULL)
1384 first = latest;
1385 break;
1386 }
1387 }
1388 if (first == NULL) /* Loop ran zero times. */
1389 first = regnode(NOTHING);
1390 return first;
1391}
1392
1393/*
1394 * regpiece - something followed by possible [*+=]
1395 *
1396 * Note that the branching code sequences used for = and the general cases
1397 * of * and + are somewhat optimized: they use the same NOTHING node as
1398 * both the endmarker for their branch list and the body of the last branch.
1399 * It might seem that this node could be dispensed with entirely, but the
1400 * endmarker role is not redundant.
1401 */
1402 static char_u *
1403regpiece(flagp)
1404 int *flagp;
1405{
1406 char_u *ret;
1407 int op;
1408 char_u *next;
1409 int flags;
1410 long minval;
1411 long maxval;
1412
1413 ret = regatom(&flags);
1414 if (ret == NULL)
1415 return NULL;
1416
1417 op = peekchr();
1418 if (re_multi_type(op) == NOT_MULTI)
1419 {
1420 *flagp = flags;
1421 return ret;
1422 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001423 /* default flags */
1424 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1425
1426 skipchr();
1427 switch (op)
1428 {
1429 case Magic('*'):
1430 if (flags & SIMPLE)
1431 reginsert(STAR, ret);
1432 else
1433 {
1434 /* Emit x* as (x&|), where & means "self". */
1435 reginsert(BRANCH, ret); /* Either x */
1436 regoptail(ret, regnode(BACK)); /* and loop */
1437 regoptail(ret, ret); /* back */
1438 regtail(ret, regnode(BRANCH)); /* or */
1439 regtail(ret, regnode(NOTHING)); /* null. */
1440 }
1441 break;
1442
1443 case Magic('+'):
1444 if (flags & SIMPLE)
1445 reginsert(PLUS, ret);
1446 else
1447 {
1448 /* Emit x+ as x(&|), where & means "self". */
1449 next = regnode(BRANCH); /* Either */
1450 regtail(ret, next);
1451 regtail(regnode(BACK), ret); /* loop back */
1452 regtail(next, regnode(BRANCH)); /* or */
1453 regtail(ret, regnode(NOTHING)); /* null. */
1454 }
1455 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1456 break;
1457
1458 case Magic('@'):
1459 {
1460 int lop = END;
1461
1462 switch (no_Magic(getchr()))
1463 {
1464 case '=': lop = MATCH; break; /* \@= */
1465 case '!': lop = NOMATCH; break; /* \@! */
1466 case '>': lop = SUBPAT; break; /* \@> */
1467 case '<': switch (no_Magic(getchr()))
1468 {
1469 case '=': lop = BEHIND; break; /* \@<= */
1470 case '!': lop = NOBEHIND; break; /* \@<! */
1471 }
1472 }
1473 if (lop == END)
1474 EMSG_M_RET_NULL(_("E59: invalid character after %s@"),
1475 reg_magic == MAGIC_ALL);
1476 /* Look behind must match with behind_pos. */
1477 if (lop == BEHIND || lop == NOBEHIND)
1478 {
1479 regtail(ret, regnode(BHPOS));
1480 *flagp |= HASLOOKBH;
1481 }
1482 regtail(ret, regnode(END)); /* operand ends */
1483 reginsert(lop, ret);
1484 break;
1485 }
1486
1487 case Magic('?'):
1488 case Magic('='):
1489 /* Emit x= as (x|) */
1490 reginsert(BRANCH, ret); /* Either x */
1491 regtail(ret, regnode(BRANCH)); /* or */
1492 next = regnode(NOTHING); /* null. */
1493 regtail(ret, next);
1494 regoptail(ret, next);
1495 break;
1496
1497 case Magic('{'):
1498 if (!read_limits(&minval, &maxval))
1499 return NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001500 if (flags & SIMPLE)
1501 {
1502 reginsert(BRACE_SIMPLE, ret);
1503 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1504 }
1505 else
1506 {
1507 if (num_complex_braces >= 10)
1508 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"),
1509 reg_magic == MAGIC_ALL);
1510 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1511 regoptail(ret, regnode(BACK));
1512 regoptail(ret, ret);
1513 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1514 ++num_complex_braces;
1515 }
1516 if (minval > 0 && maxval > 0)
1517 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1518 break;
1519 }
1520 if (re_multi_type(peekchr()) != NOT_MULTI)
1521 {
1522 /* Can't have a multi follow a multi. */
1523 if (peekchr() == Magic('*'))
1524 sprintf((char *)IObuff, _("E61: Nested %s*"),
1525 reg_magic >= MAGIC_ON ? "" : "\\");
1526 else
1527 sprintf((char *)IObuff, _("E62: Nested %s%c"),
1528 reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1529 EMSG_RET_NULL(IObuff);
1530 }
1531
1532 return ret;
1533}
1534
1535/*
1536 * regatom - the lowest level
1537 *
1538 * Optimization: gobbles an entire sequence of ordinary characters so that
1539 * it can turn them into a single node, which is smaller to store and
1540 * faster to run. Don't do this when one_exactly is set.
1541 */
1542 static char_u *
1543regatom(flagp)
1544 int *flagp;
1545{
1546 char_u *ret;
1547 int flags;
1548 int cpo_lit; /* 'cpoptions' contains 'l' flag */
Bram Moolenaardf177f62005-02-22 08:39:57 +00001549 int cpo_bsl; /* 'cpoptions' contains '\' flag */
Bram Moolenaar071d4272004-06-13 20:20:40 +00001550 int c;
1551 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1552 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
1553 FNAME, SFNAME, PRINT, SPRINT,
1554 WHITE, NWHITE, DIGIT, NDIGIT,
1555 HEX, NHEX, OCTAL, NOCTAL,
1556 WORD, NWORD, HEAD, NHEAD,
1557 ALPHA, NALPHA, LOWER, NLOWER,
1558 UPPER, NUPPER
1559 };
1560 char_u *p;
1561 int extra = 0;
1562
1563 *flagp = WORST; /* Tentatively. */
1564 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
Bram Moolenaardf177f62005-02-22 08:39:57 +00001565 cpo_bsl = (!reg_syn && vim_strchr(p_cpo, CPO_BACKSL) != NULL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001566
1567 c = getchr();
1568 switch (c)
1569 {
1570 case Magic('^'):
1571 ret = regnode(BOL);
1572 break;
1573
1574 case Magic('$'):
1575 ret = regnode(EOL);
1576#if defined(FEAT_SYN_HL) || defined(PROTO)
1577 had_eol = TRUE;
1578#endif
1579 break;
1580
1581 case Magic('<'):
1582 ret = regnode(BOW);
1583 break;
1584
1585 case Magic('>'):
1586 ret = regnode(EOW);
1587 break;
1588
1589 case Magic('_'):
1590 c = no_Magic(getchr());
1591 if (c == '^') /* "\_^" is start-of-line */
1592 {
1593 ret = regnode(BOL);
1594 break;
1595 }
1596 if (c == '$') /* "\_$" is end-of-line */
1597 {
1598 ret = regnode(EOL);
1599#if defined(FEAT_SYN_HL) || defined(PROTO)
1600 had_eol = TRUE;
1601#endif
1602 break;
1603 }
1604
1605 extra = ADD_NL;
1606 *flagp |= HASNL;
1607
1608 /* "\_[" is character range plus newline */
1609 if (c == '[')
1610 goto collection;
1611
1612 /* "\_x" is character class plus newline */
1613 /*FALLTHROUGH*/
1614
1615 /*
1616 * Character classes.
1617 */
1618 case Magic('.'):
1619 case Magic('i'):
1620 case Magic('I'):
1621 case Magic('k'):
1622 case Magic('K'):
1623 case Magic('f'):
1624 case Magic('F'):
1625 case Magic('p'):
1626 case Magic('P'):
1627 case Magic('s'):
1628 case Magic('S'):
1629 case Magic('d'):
1630 case Magic('D'):
1631 case Magic('x'):
1632 case Magic('X'):
1633 case Magic('o'):
1634 case Magic('O'):
1635 case Magic('w'):
1636 case Magic('W'):
1637 case Magic('h'):
1638 case Magic('H'):
1639 case Magic('a'):
1640 case Magic('A'):
1641 case Magic('l'):
1642 case Magic('L'):
1643 case Magic('u'):
1644 case Magic('U'):
1645 p = vim_strchr(classchars, no_Magic(c));
1646 if (p == NULL)
1647 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1648 ret = regnode(classcodes[p - classchars] + extra);
1649 *flagp |= HASWIDTH | SIMPLE;
1650 break;
1651
1652 case Magic('n'):
1653 if (reg_string)
1654 {
1655 /* In a string "\n" matches a newline character. */
1656 ret = regnode(EXACTLY);
1657 regc(NL);
1658 regc(NUL);
1659 *flagp |= HASWIDTH | SIMPLE;
1660 }
1661 else
1662 {
1663 /* In buffer text "\n" matches the end of a line. */
1664 ret = regnode(NEWL);
1665 *flagp |= HASWIDTH | HASNL;
1666 }
1667 break;
1668
1669 case Magic('('):
1670 if (one_exactly)
1671 EMSG_ONE_RET_NULL;
1672 ret = reg(REG_PAREN, &flags);
1673 if (ret == NULL)
1674 return NULL;
1675 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1676 break;
1677
1678 case NUL:
1679 case Magic('|'):
1680 case Magic('&'):
1681 case Magic(')'):
1682 EMSG_RET_NULL(_(e_internal)); /* Supposed to be caught earlier. */
1683 /* NOTREACHED */
1684
1685 case Magic('='):
1686 case Magic('?'):
1687 case Magic('+'):
1688 case Magic('@'):
1689 case Magic('{'):
1690 case Magic('*'):
1691 c = no_Magic(c);
1692 sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1693 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1694 ? "" : "\\", c);
1695 EMSG_RET_NULL(IObuff);
1696 /* NOTREACHED */
1697
1698 case Magic('~'): /* previous substitute pattern */
1699 if (reg_prev_sub)
1700 {
1701 char_u *lp;
1702
1703 ret = regnode(EXACTLY);
1704 lp = reg_prev_sub;
1705 while (*lp != NUL)
1706 regc(*lp++);
1707 regc(NUL);
1708 if (*reg_prev_sub != NUL)
1709 {
1710 *flagp |= HASWIDTH;
1711 if ((lp - reg_prev_sub) == 1)
1712 *flagp |= SIMPLE;
1713 }
1714 }
1715 else
1716 EMSG_RET_NULL(_(e_nopresub));
1717 break;
1718
1719 case Magic('1'):
1720 case Magic('2'):
1721 case Magic('3'):
1722 case Magic('4'):
1723 case Magic('5'):
1724 case Magic('6'):
1725 case Magic('7'):
1726 case Magic('8'):
1727 case Magic('9'):
1728 {
1729 int refnum;
1730
1731 refnum = c - Magic('0');
1732 /*
1733 * Check if the back reference is legal. We must have seen the
1734 * close brace.
1735 * TODO: Should also check that we don't refer to something
1736 * that is repeated (+*=): what instance of the repetition
1737 * should we match?
1738 */
1739 if (!had_endbrace[refnum])
1740 {
1741 /* Trick: check if "@<=" or "@<!" follows, in which case
1742 * the \1 can appear before the referenced match. */
1743 for (p = regparse; *p != NUL; ++p)
1744 if (p[0] == '@' && p[1] == '<'
1745 && (p[2] == '!' || p[2] == '='))
1746 break;
1747 if (*p == NUL)
1748 EMSG_RET_NULL(_("E65: Illegal back reference"));
1749 }
1750 ret = regnode(BACKREF + refnum);
1751 }
1752 break;
1753
1754#ifdef FEAT_SYN_HL
1755 case Magic('z'):
1756 {
1757 c = no_Magic(getchr());
1758 switch (c)
1759 {
1760 case '(': if (reg_do_extmatch != REX_SET)
1761 EMSG_RET_NULL(_("E66: \\z( not allowed here"));
1762 if (one_exactly)
1763 EMSG_ONE_RET_NULL;
1764 ret = reg(REG_ZPAREN, &flags);
1765 if (ret == NULL)
1766 return NULL;
1767 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1768 re_has_z = REX_SET;
1769 break;
1770
1771 case '1':
1772 case '2':
1773 case '3':
1774 case '4':
1775 case '5':
1776 case '6':
1777 case '7':
1778 case '8':
1779 case '9': if (reg_do_extmatch != REX_USE)
1780 EMSG_RET_NULL(_("E67: \\z1 et al. not allowed here"));
1781 ret = regnode(ZREF + c - '0');
1782 re_has_z = REX_USE;
1783 break;
1784
1785 case 's': ret = regnode(MOPEN + 0);
1786 break;
1787
1788 case 'e': ret = regnode(MCLOSE + 0);
1789 break;
1790
1791 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1792 }
1793 }
1794 break;
1795#endif
1796
1797 case Magic('%'):
1798 {
1799 c = no_Magic(getchr());
1800 switch (c)
1801 {
1802 /* () without a back reference */
1803 case '(':
1804 if (one_exactly)
1805 EMSG_ONE_RET_NULL;
1806 ret = reg(REG_NPAREN, &flags);
1807 if (ret == NULL)
1808 return NULL;
1809 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1810 break;
1811
1812 /* Catch \%^ and \%$ regardless of where they appear in the
1813 * pattern -- regardless of whether or not it makes sense. */
1814 case '^':
1815 ret = regnode(RE_BOF);
1816 break;
1817
1818 case '$':
1819 ret = regnode(RE_EOF);
1820 break;
1821
1822 case '#':
1823 ret = regnode(CURSOR);
1824 break;
1825
1826 /* \%[abc]: Emit as a list of branches, all ending at the last
1827 * branch which matches nothing. */
1828 case '[':
1829 if (one_exactly) /* doesn't nest */
1830 EMSG_ONE_RET_NULL;
1831 {
1832 char_u *lastbranch;
1833 char_u *lastnode = NULL;
1834 char_u *br;
1835
1836 ret = NULL;
1837 while ((c = getchr()) != ']')
1838 {
1839 if (c == NUL)
1840 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["),
1841 reg_magic == MAGIC_ALL);
1842 br = regnode(BRANCH);
1843 if (ret == NULL)
1844 ret = br;
1845 else
1846 regtail(lastnode, br);
1847
1848 ungetchr();
1849 one_exactly = TRUE;
1850 lastnode = regatom(flagp);
1851 one_exactly = FALSE;
1852 if (lastnode == NULL)
1853 return NULL;
1854 }
1855 if (ret == NULL)
1856 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"),
1857 reg_magic == MAGIC_ALL);
1858 lastbranch = regnode(BRANCH);
1859 br = regnode(NOTHING);
1860 if (ret != JUST_CALC_SIZE)
1861 {
1862 regtail(lastnode, br);
1863 regtail(lastbranch, br);
1864 /* connect all branches to the NOTHING
1865 * branch at the end */
1866 for (br = ret; br != lastnode; )
1867 {
1868 if (OP(br) == BRANCH)
1869 {
1870 regtail(br, lastbranch);
1871 br = OPERAND(br);
1872 }
1873 else
1874 br = regnext(br);
1875 }
1876 }
1877 *flagp &= ~HASWIDTH;
1878 break;
1879 }
1880
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001881 case 'd': /* %d123 decimal */
1882 case 'o': /* %o123 octal */
1883 case 'x': /* %xab hex 2 */
1884 case 'u': /* %uabcd hex 4 */
1885 case 'U': /* %U1234abcd hex 8 */
1886 {
1887 int i;
1888
1889 switch (c)
1890 {
1891 case 'd': i = getdecchrs(); break;
1892 case 'o': i = getoctchrs(); break;
1893 case 'x': i = gethexchrs(2); break;
1894 case 'u': i = gethexchrs(4); break;
1895 case 'U': i = gethexchrs(8); break;
1896 default: i = -1; break;
1897 }
1898
1899 if (i < 0)
1900 EMSG_M_RET_NULL(
1901 _("E678: Invalid character after %s%%[dxouU]"),
1902 reg_magic == MAGIC_ALL);
1903 ret = regnode(EXACTLY);
1904 if (i == 0)
1905 regc(0x0a);
1906 else
1907#ifdef FEAT_MBYTE
1908 regmbc(i);
1909#else
1910 regc(i);
1911#endif
1912 regc(NUL);
1913 *flagp |= HASWIDTH;
1914 break;
1915 }
1916
Bram Moolenaar071d4272004-06-13 20:20:40 +00001917 default:
1918 if (VIM_ISDIGIT(c) || c == '<' || c == '>')
1919 {
1920 long_u n = 0;
1921 int cmp;
1922
1923 cmp = c;
1924 if (cmp == '<' || cmp == '>')
1925 c = getchr();
1926 while (VIM_ISDIGIT(c))
1927 {
1928 n = n * 10 + (c - '0');
1929 c = getchr();
1930 }
1931 if (c == 'l' || c == 'c' || c == 'v')
1932 {
1933 if (c == 'l')
1934 ret = regnode(RE_LNUM);
1935 else if (c == 'c')
1936 ret = regnode(RE_COL);
1937 else
1938 ret = regnode(RE_VCOL);
1939 if (ret == JUST_CALC_SIZE)
1940 regsize += 5;
1941 else
1942 {
1943 /* put the number and the optional
1944 * comparator after the opcode */
1945 regcode = re_put_long(regcode, n);
1946 *regcode++ = cmp;
1947 }
1948 break;
1949 }
1950 }
1951
1952 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"),
1953 reg_magic == MAGIC_ALL);
1954 }
1955 }
1956 break;
1957
1958 case Magic('['):
1959collection:
1960 {
1961 char_u *lp;
1962
1963 /*
1964 * If there is no matching ']', we assume the '[' is a normal
1965 * character. This makes 'incsearch' and ":help [" work.
1966 */
1967 lp = skip_anyof(regparse);
1968 if (*lp == ']') /* there is a matching ']' */
1969 {
1970 int startc = -1; /* > 0 when next '-' is a range */
1971 int endc;
1972
1973 /*
1974 * In a character class, different parsing rules apply.
1975 * Not even \ is special anymore, nothing is.
1976 */
1977 if (*regparse == '^') /* Complement of range. */
1978 {
1979 ret = regnode(ANYBUT + extra);
1980 regparse++;
1981 }
1982 else
1983 ret = regnode(ANYOF + extra);
1984
1985 /* At the start ']' and '-' mean the literal character. */
1986 if (*regparse == ']' || *regparse == '-')
Bram Moolenaardf177f62005-02-22 08:39:57 +00001987 {
1988 startc = *regparse;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001989 regc(*regparse++);
Bram Moolenaardf177f62005-02-22 08:39:57 +00001990 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001991
1992 while (*regparse != NUL && *regparse != ']')
1993 {
1994 if (*regparse == '-')
1995 {
1996 ++regparse;
1997 /* The '-' is not used for a range at the end and
1998 * after or before a '\n'. */
1999 if (*regparse == ']' || *regparse == NUL
2000 || startc == -1
2001 || (regparse[0] == '\\' && regparse[1] == 'n'))
2002 {
2003 regc('-');
2004 startc = '-'; /* [--x] is a range */
2005 }
2006 else
2007 {
Bram Moolenaardf177f62005-02-22 08:39:57 +00002008 /* Also accept "a-[.z.]" */
2009 endc = 0;
2010 if (*regparse == '[')
2011 endc = get_coll_element(&regparse);
2012 if (endc == 0)
2013 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002014#ifdef FEAT_MBYTE
Bram Moolenaardf177f62005-02-22 08:39:57 +00002015 if (has_mbyte)
2016 endc = mb_ptr2char_adv(&regparse);
2017 else
Bram Moolenaar071d4272004-06-13 20:20:40 +00002018#endif
Bram Moolenaardf177f62005-02-22 08:39:57 +00002019 endc = *regparse++;
2020 }
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002021
2022 /* Handle \o40, \x20 and \u20AC style sequences */
Bram Moolenaardf177f62005-02-22 08:39:57 +00002023 if (endc == '\\' && !cpo_lit && !cpo_bsl)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002024 endc = coll_get_char();
2025
Bram Moolenaar071d4272004-06-13 20:20:40 +00002026 if (startc > endc)
2027 EMSG_RET_NULL(_(e_invrange));
2028#ifdef FEAT_MBYTE
2029 if (has_mbyte && ((*mb_char2len)(startc) > 1
2030 || (*mb_char2len)(endc) > 1))
2031 {
2032 /* Limit to a range of 256 chars */
2033 if (endc > startc + 256)
2034 EMSG_RET_NULL(_(e_invrange));
2035 while (++startc <= endc)
2036 regmbc(startc);
2037 }
2038 else
2039#endif
2040 {
2041#ifdef EBCDIC
2042 int alpha_only = FALSE;
2043
2044 /* for alphabetical range skip the gaps
2045 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
2046 if (isalpha(startc) && isalpha(endc))
2047 alpha_only = TRUE;
2048#endif
2049 while (++startc <= endc)
2050#ifdef EBCDIC
2051 if (!alpha_only || isalpha(startc))
2052#endif
2053 regc(startc);
2054 }
2055 startc = -1;
2056 }
2057 }
2058 /*
2059 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
2060 * accepts "\t", "\e", etc., but only when the 'l' flag in
2061 * 'cpoptions' is not included.
Bram Moolenaardf177f62005-02-22 08:39:57 +00002062 * Posix doesn't recognize backslash at all.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002063 */
2064 else if (*regparse == '\\'
Bram Moolenaardf177f62005-02-22 08:39:57 +00002065 && !cpo_bsl
Bram Moolenaar071d4272004-06-13 20:20:40 +00002066 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
2067 || (!cpo_lit
2068 && vim_strchr(REGEXP_ABBR,
2069 regparse[1]) != NULL)))
2070 {
2071 regparse++;
2072 if (*regparse == 'n')
2073 {
2074 /* '\n' in range: also match NL */
2075 if (ret != JUST_CALC_SIZE)
2076 {
2077 if (*ret == ANYBUT)
2078 *ret = ANYBUT + ADD_NL;
2079 else if (*ret == ANYOF)
2080 *ret = ANYOF + ADD_NL;
2081 /* else: must have had a \n already */
2082 }
2083 *flagp |= HASNL;
2084 regparse++;
2085 startc = -1;
2086 }
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002087 else if (*regparse == 'd'
2088 || *regparse == 'o'
2089 || *regparse == 'x'
2090 || *regparse == 'u'
2091 || *regparse == 'U')
2092 {
2093 startc = coll_get_char();
2094 if (startc == 0)
2095 regc(0x0a);
2096 else
2097#ifdef FEAT_MBYTE
2098 regmbc(startc);
2099#else
2100 regc(startc);
2101#endif
2102 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002103 else
2104 {
2105 startc = backslash_trans(*regparse++);
2106 regc(startc);
2107 }
2108 }
2109 else if (*regparse == '[')
2110 {
2111 int c_class;
2112 int cu;
2113
Bram Moolenaardf177f62005-02-22 08:39:57 +00002114 c_class = get_char_class(&regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002115 startc = -1;
2116 /* Characters assumed to be 8 bits! */
2117 switch (c_class)
2118 {
2119 case CLASS_NONE:
Bram Moolenaardf177f62005-02-22 08:39:57 +00002120 c_class = get_equi_class(&regparse);
2121 if (c_class != 0)
2122 {
2123 /* produce equivalence class */
2124 reg_equi_class(c_class);
2125 }
2126 else if ((c_class =
2127 get_coll_element(&regparse)) != 0)
2128 {
2129 /* produce a collating element */
2130 regmbc(c_class);
2131 }
2132 else
2133 {
2134 /* literal '[', allow [[-x] as a range */
2135 startc = *regparse++;
2136 regc(startc);
2137 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002138 break;
2139 case CLASS_ALNUM:
2140 for (cu = 1; cu <= 255; cu++)
2141 if (isalnum(cu))
2142 regc(cu);
2143 break;
2144 case CLASS_ALPHA:
2145 for (cu = 1; cu <= 255; cu++)
2146 if (isalpha(cu))
2147 regc(cu);
2148 break;
2149 case CLASS_BLANK:
2150 regc(' ');
2151 regc('\t');
2152 break;
2153 case CLASS_CNTRL:
2154 for (cu = 1; cu <= 255; cu++)
2155 if (iscntrl(cu))
2156 regc(cu);
2157 break;
2158 case CLASS_DIGIT:
2159 for (cu = 1; cu <= 255; cu++)
2160 if (VIM_ISDIGIT(cu))
2161 regc(cu);
2162 break;
2163 case CLASS_GRAPH:
2164 for (cu = 1; cu <= 255; cu++)
2165 if (isgraph(cu))
2166 regc(cu);
2167 break;
2168 case CLASS_LOWER:
2169 for (cu = 1; cu <= 255; cu++)
2170 if (islower(cu))
2171 regc(cu);
2172 break;
2173 case CLASS_PRINT:
2174 for (cu = 1; cu <= 255; cu++)
2175 if (vim_isprintc(cu))
2176 regc(cu);
2177 break;
2178 case CLASS_PUNCT:
2179 for (cu = 1; cu <= 255; cu++)
2180 if (ispunct(cu))
2181 regc(cu);
2182 break;
2183 case CLASS_SPACE:
2184 for (cu = 9; cu <= 13; cu++)
2185 regc(cu);
2186 regc(' ');
2187 break;
2188 case CLASS_UPPER:
2189 for (cu = 1; cu <= 255; cu++)
2190 if (isupper(cu))
2191 regc(cu);
2192 break;
2193 case CLASS_XDIGIT:
2194 for (cu = 1; cu <= 255; cu++)
2195 if (vim_isxdigit(cu))
2196 regc(cu);
2197 break;
2198 case CLASS_TAB:
2199 regc('\t');
2200 break;
2201 case CLASS_RETURN:
2202 regc('\r');
2203 break;
2204 case CLASS_BACKSPACE:
2205 regc('\b');
2206 break;
2207 case CLASS_ESCAPE:
2208 regc('\033');
2209 break;
2210 }
2211 }
2212 else
2213 {
2214#ifdef FEAT_MBYTE
2215 if (has_mbyte)
2216 {
2217 int len;
2218
2219 /* produce a multibyte character, including any
2220 * following composing characters */
2221 startc = mb_ptr2char(regparse);
2222 len = (*mb_ptr2len_check)(regparse);
2223 if (enc_utf8 && utf_char2len(startc) != len)
2224 startc = -1; /* composing chars */
2225 while (--len >= 0)
2226 regc(*regparse++);
2227 }
2228 else
2229#endif
2230 {
2231 startc = *regparse++;
2232 regc(startc);
2233 }
2234 }
2235 }
2236 regc(NUL);
2237 prevchr_len = 1; /* last char was the ']' */
2238 if (*regparse != ']')
2239 EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */
2240 skipchr(); /* let's be friends with the lexer again */
2241 *flagp |= HASWIDTH | SIMPLE;
2242 break;
2243 }
2244 }
2245 /* FALLTHROUGH */
2246
2247 default:
2248 {
2249 int len;
2250
2251#ifdef FEAT_MBYTE
2252 /* A multi-byte character is handled as a separate atom if it's
2253 * before a multi. */
2254 if (has_mbyte && (*mb_char2len)(c) > 1
2255 && re_multi_type(peekchr()) != NOT_MULTI)
2256 {
2257 ret = regnode(MULTIBYTECODE);
2258 regmbc(c);
2259 *flagp |= HASWIDTH | SIMPLE;
2260 break;
2261 }
2262#endif
2263
2264 ret = regnode(EXACTLY);
2265
2266 /*
2267 * Append characters as long as:
2268 * - there is no following multi, we then need the character in
2269 * front of it as a single character operand
2270 * - not running into a Magic character
2271 * - "one_exactly" is not set
2272 * But always emit at least one character. Might be a Multi,
2273 * e.g., a "[" without matching "]".
2274 */
2275 for (len = 0; c != NUL && (len == 0
2276 || (re_multi_type(peekchr()) == NOT_MULTI
2277 && !one_exactly
2278 && !is_Magic(c))); ++len)
2279 {
2280 c = no_Magic(c);
2281#ifdef FEAT_MBYTE
2282 if (has_mbyte)
2283 {
2284 regmbc(c);
2285 if (enc_utf8)
2286 {
2287 int off;
2288 int l;
2289
2290 /* Need to get composing character too, directly
2291 * access regparse for that, because skipchr() skips
2292 * over composing chars. */
2293 ungetchr();
2294 if (*regparse == '\\' && regparse[1] != NUL)
2295 off = 1;
2296 else
2297 off = 0;
2298 for (;;)
2299 {
2300 l = utf_ptr2len_check(regparse + off);
2301 if (!UTF_COMPOSINGLIKE(regparse + off,
2302 regparse + off + l))
2303 break;
2304 off += l;
2305 regmbc(utf_ptr2char(regparse + off));
2306 }
2307 skipchr();
2308 }
2309 }
2310 else
2311#endif
2312 regc(c);
2313 c = getchr();
2314 }
2315 ungetchr();
2316
2317 regc(NUL);
2318 *flagp |= HASWIDTH;
2319 if (len == 1)
2320 *flagp |= SIMPLE;
2321 }
2322 break;
2323 }
2324
2325 return ret;
2326}
2327
2328/*
2329 * emit a node
2330 * Return pointer to generated code.
2331 */
2332 static char_u *
2333regnode(op)
2334 int op;
2335{
2336 char_u *ret;
2337
2338 ret = regcode;
2339 if (ret == JUST_CALC_SIZE)
2340 regsize += 3;
2341 else
2342 {
2343 *regcode++ = op;
2344 *regcode++ = NUL; /* Null "next" pointer. */
2345 *regcode++ = NUL;
2346 }
2347 return ret;
2348}
2349
2350/*
2351 * Emit (if appropriate) a byte of code
2352 */
2353 static void
2354regc(b)
2355 int b;
2356{
2357 if (regcode == JUST_CALC_SIZE)
2358 regsize++;
2359 else
2360 *regcode++ = b;
2361}
2362
#ifdef FEAT_MBYTE
/*
 * Emit the bytes of multi-byte character "c", or just add their number to
 * "regsize" when only the program size is being calculated.
 */
    static void
regmbc(c)
    int         c;
{
    if (regcode != JUST_CALC_SIZE)
        regcode += (*mb_char2bytes)(c, regcode);
    else
        regsize += (*mb_char2len)(c);
}
#endif
2377
2378/*
2379 * reginsert - insert an operator in front of already-emitted operand
2380 *
2381 * Means relocating the operand.
2382 */
2383 static void
2384reginsert(op, opnd)
2385 int op;
2386 char_u *opnd;
2387{
2388 char_u *src;
2389 char_u *dst;
2390 char_u *place;
2391
2392 if (regcode == JUST_CALC_SIZE)
2393 {
2394 regsize += 3;
2395 return;
2396 }
2397 src = regcode;
2398 regcode += 3;
2399 dst = regcode;
2400 while (src > opnd)
2401 *--dst = *--src;
2402
2403 place = opnd; /* Op node, where operand used to be. */
2404 *place++ = op;
2405 *place++ = NUL;
2406 *place = NUL;
2407}
2408
2409/*
2410 * reginsert_limits - insert an operator in front of already-emitted operand.
2411 * The operator has the given limit values as operands. Also set next pointer.
2412 *
2413 * Means relocating the operand.
2414 */
2415 static void
2416reginsert_limits(op, minval, maxval, opnd)
2417 int op;
2418 long minval;
2419 long maxval;
2420 char_u *opnd;
2421{
2422 char_u *src;
2423 char_u *dst;
2424 char_u *place;
2425
2426 if (regcode == JUST_CALC_SIZE)
2427 {
2428 regsize += 11;
2429 return;
2430 }
2431 src = regcode;
2432 regcode += 11;
2433 dst = regcode;
2434 while (src > opnd)
2435 *--dst = *--src;
2436
2437 place = opnd; /* Op node, where operand used to be. */
2438 *place++ = op;
2439 *place++ = NUL;
2440 *place++ = NUL;
2441 place = re_put_long(place, (long_u)minval);
2442 place = re_put_long(place, (long_u)maxval);
2443 regtail(opnd, place);
2444}
2445
2446/*
2447 * Write a long as four bytes at "p" and return pointer to the next char.
2448 */
2449 static char_u *
2450re_put_long(p, val)
2451 char_u *p;
2452 long_u val;
2453{
2454 *p++ = (char_u) ((val >> 24) & 0377);
2455 *p++ = (char_u) ((val >> 16) & 0377);
2456 *p++ = (char_u) ((val >> 8) & 0377);
2457 *p++ = (char_u) (val & 0377);
2458 return p;
2459}
2460
2461/*
2462 * regtail - set the next-pointer at the end of a node chain
2463 */
2464 static void
2465regtail(p, val)
2466 char_u *p;
2467 char_u *val;
2468{
2469 char_u *scan;
2470 char_u *temp;
2471 int offset;
2472
2473 if (p == JUST_CALC_SIZE)
2474 return;
2475
2476 /* Find last node. */
2477 scan = p;
2478 for (;;)
2479 {
2480 temp = regnext(scan);
2481 if (temp == NULL)
2482 break;
2483 scan = temp;
2484 }
2485
2486 if (OP(scan) == BACK)
2487 offset = (int)(scan - val);
2488 else
2489 offset = (int)(val - scan);
2490 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
2491 *(scan + 2) = (char_u) (offset & 0377);
2492}
2493
2494/*
2495 * regoptail - regtail on item after a BRANCH; nop if none
2496 */
2497 static void
2498regoptail(p, val)
2499 char_u *p;
2500 char_u *val;
2501{
2502 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2503 if (p == NULL || p == JUST_CALC_SIZE
2504 || (OP(p) != BRANCH
2505 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2506 return;
2507 regtail(OPERAND(p), val);
2508}
2509
2510/*
2511 * getchr() - get the next character from the pattern. We know about
2512 * magic and such, so therefore we need a lexical analyzer.
2513 */
2514
2515/* static int curchr; */
2516static int prevprevchr;
2517static int prevchr;
2518static int nextchr; /* used for ungetchr() */
2519/*
2520 * Note: prevchr is sometimes -1 when we are not at the start,
2521 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
2522 * taken to be magic -- webb
2523 */
2524static int at_start; /* True when on the first character */
2525static int prev_at_start; /* True when on the second character */
2526
2527 static void
2528initchr(str)
2529 char_u *str;
2530{
2531 regparse = str;
2532 prevchr_len = 0;
2533 curchr = prevprevchr = prevchr = nextchr = -1;
2534 at_start = TRUE;
2535 prev_at_start = FALSE;
2536}
2537
2538 static int
2539peekchr()
2540{
Bram Moolenaardf177f62005-02-22 08:39:57 +00002541 static int after_slash = FALSE;
2542
Bram Moolenaar071d4272004-06-13 20:20:40 +00002543 if (curchr == -1)
2544 {
2545 switch (curchr = regparse[0])
2546 {
2547 case '.':
2548 case '[':
2549 case '~':
2550 /* magic when 'magic' is on */
2551 if (reg_magic >= MAGIC_ON)
2552 curchr = Magic(curchr);
2553 break;
2554 case '(':
2555 case ')':
2556 case '{':
2557 case '%':
2558 case '+':
2559 case '=':
2560 case '?':
2561 case '@':
2562 case '!':
2563 case '&':
2564 case '|':
2565 case '<':
2566 case '>':
2567 case '#': /* future ext. */
2568 case '"': /* future ext. */
2569 case '\'': /* future ext. */
2570 case ',': /* future ext. */
2571 case '-': /* future ext. */
2572 case ':': /* future ext. */
2573 case ';': /* future ext. */
2574 case '`': /* future ext. */
2575 case '/': /* Can't be used in / command */
2576 /* magic only after "\v" */
2577 if (reg_magic == MAGIC_ALL)
2578 curchr = Magic(curchr);
2579 break;
2580 case '*':
Bram Moolenaardf177f62005-02-22 08:39:57 +00002581 /* * is not magic as the very first character, eg "?*ptr", when
2582 * after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
2583 * "\(\*" is not magic, thus must be magic if "after_slash" */
2584 if (reg_magic >= MAGIC_ON
2585 && !at_start
2586 && !(prev_at_start && prevchr == Magic('^'))
2587 && (after_slash
2588 || (prevchr != Magic('(')
2589 && prevchr != Magic('&')
2590 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002591 curchr = Magic('*');
2592 break;
2593 case '^':
2594 /* '^' is only magic as the very first character and if it's after
2595 * "\(", "\|", "\&' or "\n" */
2596 if (reg_magic >= MAGIC_OFF
2597 && (at_start
2598 || reg_magic == MAGIC_ALL
2599 || prevchr == Magic('(')
2600 || prevchr == Magic('|')
2601 || prevchr == Magic('&')
2602 || prevchr == Magic('n')
2603 || (no_Magic(prevchr) == '('
2604 && prevprevchr == Magic('%'))))
2605 {
2606 curchr = Magic('^');
2607 at_start = TRUE;
2608 prev_at_start = FALSE;
2609 }
2610 break;
2611 case '$':
2612 /* '$' is only magic as the very last char and if it's in front of
2613 * either "\|", "\)", "\&", or "\n" */
2614 if (reg_magic >= MAGIC_OFF)
2615 {
2616 char_u *p = regparse + 1;
2617
2618 /* ignore \c \C \m and \M after '$' */
2619 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
2620 || p[1] == 'm' || p[1] == 'M' || p[1] == 'Z'))
2621 p += 2;
2622 if (p[0] == NUL
2623 || (p[0] == '\\'
2624 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
2625 || p[1] == 'n'))
2626 || reg_magic == MAGIC_ALL)
2627 curchr = Magic('$');
2628 }
2629 break;
2630 case '\\':
2631 {
2632 int c = regparse[1];
2633
2634 if (c == NUL)
2635 curchr = '\\'; /* trailing '\' */
2636 else if (
2637#ifdef EBCDIC
2638 vim_strchr(META, c)
2639#else
2640 c <= '~' && META_flags[c]
2641#endif
2642 )
2643 {
2644 /*
2645 * META contains everything that may be magic sometimes,
2646 * except ^ and $ ("\^" and "\$" are only magic after
2647 * "\v"). We now fetch the next character and toggle its
2648 * magicness. Therefore, \ is so meta-magic that it is
2649 * not in META.
2650 */
2651 curchr = -1;
2652 prev_at_start = at_start;
2653 at_start = FALSE; /* be able to say "/\*ptr" */
2654 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +00002655 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002656 peekchr();
2657 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +00002658 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002659 curchr = toggle_Magic(curchr);
2660 }
2661 else if (vim_strchr(REGEXP_ABBR, c))
2662 {
2663 /*
2664 * Handle abbreviations, like "\t" for TAB -- webb
2665 */
2666 curchr = backslash_trans(c);
2667 }
2668 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
2669 curchr = toggle_Magic(c);
2670 else
2671 {
2672 /*
2673 * Next character can never be (made) magic?
2674 * Then backslashing it won't do anything.
2675 */
2676#ifdef FEAT_MBYTE
2677 if (has_mbyte)
2678 curchr = (*mb_ptr2char)(regparse + 1);
2679 else
2680#endif
2681 curchr = c;
2682 }
2683 break;
2684 }
2685
2686#ifdef FEAT_MBYTE
2687 default:
2688 if (has_mbyte)
2689 curchr = (*mb_ptr2char)(regparse);
2690#endif
2691 }
2692 }
2693
2694 return curchr;
2695}
2696
2697/*
2698 * Eat one lexed character. Do this in a way that we can undo it.
2699 */
2700 static void
2701skipchr()
2702{
2703 /* peekchr() eats a backslash, do the same here */
2704 if (*regparse == '\\')
2705 prevchr_len = 1;
2706 else
2707 prevchr_len = 0;
2708 if (regparse[prevchr_len] != NUL)
2709 {
2710#ifdef FEAT_MBYTE
2711 if (has_mbyte)
2712 prevchr_len += (*mb_ptr2len_check)(regparse + prevchr_len);
2713 else
2714#endif
2715 ++prevchr_len;
2716 }
2717 regparse += prevchr_len;
2718 prev_at_start = at_start;
2719 at_start = FALSE;
2720 prevprevchr = prevchr;
2721 prevchr = curchr;
2722 curchr = nextchr; /* use previously unget char, or -1 */
2723 nextchr = -1;
2724}
2725
2726/*
2727 * Skip a character while keeping the value of prev_at_start for at_start.
2728 * prevchr and prevprevchr are also kept.
2729 */
2730 static void
2731skipchr_keepstart()
2732{
2733 int as = prev_at_start;
2734 int pr = prevchr;
2735 int prpr = prevprevchr;
2736
2737 skipchr();
2738 at_start = as;
2739 prevchr = pr;
2740 prevprevchr = prpr;
2741}
2742
/*
 * Return the next lexed character, as peekchr() does, and consume it with
 * skipchr().
 */
    static int
getchr()
{
    int         lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
2751
2752/*
2753 * put character back. Works only once!
2754 */
2755 static void
2756ungetchr()
2757{
2758 nextchr = curchr;
2759 curchr = prevchr;
2760 prevchr = prevprevchr;
2761 at_start = prev_at_start;
2762 prev_at_start = FALSE;
2763
2764 /* Backup regparse, so that it's at the same position as before the
2765 * getchr(). */
2766 regparse -= prevchr_len;
2767}
2768
2769/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +00002770 * Get and return the value of the hex string at the current position.
2771 * Return -1 if there is no valid hex number.
2772 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002773 * blahblah\%x20asdf
2774 * before-^ ^-after
2775 * The parameter controls the maximum number of input characters. This will be
2776 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
2777 */
2778 static int
2779gethexchrs(maxinputlen)
2780 int maxinputlen;
2781{
2782 int nr = 0;
2783 int c;
2784 int i;
2785
2786 for (i = 0; i < maxinputlen; ++i)
2787 {
2788 c = regparse[0];
2789 if (!vim_isxdigit(c))
2790 break;
2791 nr <<= 4;
2792 nr |= hex2nr(c);
2793 ++regparse;
2794 }
2795
2796 if (i == 0)
2797 return -1;
2798 return nr;
2799}
2800
2801/*
2802 * get and return the value of the decimal string immediately after the
2803 * current position. Return -1 for invalid. Consumes all digits.
2804 */
2805 static int
2806getdecchrs()
2807{
2808 int nr = 0;
2809 int c;
2810 int i;
2811
2812 for (i = 0; ; ++i)
2813 {
2814 c = regparse[0];
2815 if (c < '0' || c > '9')
2816 break;
2817 nr *= 10;
2818 nr += c - '0';
2819 ++regparse;
2820 }
2821
2822 if (i == 0)
2823 return -1;
2824 return nr;
2825}
2826
2827/*
2828 * get and return the value of the octal string immediately after the current
2829 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
2830 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
2831 * treat 8 or 9 as recognised characters. Position is updated:
2832 * blahblah\%o210asdf
2833 * before-^ ^-after
2834 */
2835 static int
2836getoctchrs()
2837{
2838 int nr = 0;
2839 int c;
2840 int i;
2841
2842 for (i = 0; i < 3 && nr < 040; ++i)
2843 {
2844 c = regparse[0];
2845 if (c < '0' || c > '7')
2846 break;
2847 nr <<= 3;
2848 nr |= hex2nr(c);
2849 ++regparse;
2850 }
2851
2852 if (i == 0)
2853 return -1;
2854 return nr;
2855}
2856
2857/*
2858 * Get a number after a backslash that is inside [].
2859 * When nothing is recognized return a backslash.
2860 */
2861 static int
2862coll_get_char()
2863{
2864 int nr = -1;
2865
2866 switch (*regparse++)
2867 {
2868 case 'd': nr = getdecchrs(); break;
2869 case 'o': nr = getoctchrs(); break;
2870 case 'x': nr = gethexchrs(2); break;
2871 case 'u': nr = gethexchrs(4); break;
2872 case 'U': nr = gethexchrs(8); break;
2873 }
2874 if (nr < 0)
2875 {
2876 /* If getting the number fails be backwards compatible: the character
2877 * is a backslash. */
2878 --regparse;
2879 nr = '\\';
2880 }
2881 return nr;
2882}
2883
2884/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00002885 * read_limits - Read two integers to be taken as a minimum and maximum.
2886 * If the first character is '-', then the range is reversed.
2887 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2888 * missing, a very big number is the default.
2889 */
2890 static int
2891read_limits(minval, maxval)
2892 long *minval;
2893 long *maxval;
2894{
2895 int reverse = FALSE;
2896 char_u *first_char;
2897 long tmp;
2898
2899 if (*regparse == '-')
2900 {
2901 /* Starts with '-', so reverse the range later */
2902 regparse++;
2903 reverse = TRUE;
2904 }
2905 first_char = regparse;
2906 *minval = getdigits(&regparse);
2907 if (*regparse == ',') /* There is a comma */
2908 {
2909 if (vim_isdigit(*++regparse))
2910 *maxval = getdigits(&regparse);
2911 else
2912 *maxval = MAX_LIMIT;
2913 }
2914 else if (VIM_ISDIGIT(*first_char))
2915 *maxval = *minval; /* It was \{n} or \{-n} */
2916 else
2917 *maxval = MAX_LIMIT; /* It was \{} or \{-} */
2918 if (*regparse == '\\')
2919 regparse++; /* Allow either \{...} or \{...\} */
Bram Moolenaardf177f62005-02-22 08:39:57 +00002920 if (*regparse != '}')
Bram Moolenaar071d4272004-06-13 20:20:40 +00002921 {
2922 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
2923 reg_magic == MAGIC_ALL ? "" : "\\");
2924 EMSG_RET_FAIL(IObuff);
2925 }
2926
2927 /*
2928 * Reverse the range if there was a '-', or make sure it is in the right
2929 * order otherwise.
2930 */
2931 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
2932 {
2933 tmp = *minval;
2934 *minval = *maxval;
2935 *maxval = tmp;
2936 }
2937 skipchr(); /* let's be friends with the lexer again */
2938 return OK;
2939}
2940
2941/*
2942 * vim_regexec and friends
2943 */
2944
2945/*
2946 * Global work variables for vim_regexec().
2947 */
2948
2949/* The current match-position is remembered with these variables: */
2950static linenr_T reglnum; /* line number, relative to first line */
2951static char_u *regline; /* start of current line */
2952static char_u *reginput; /* current input, points into "regline" */
2953
2954static int need_clear_subexpr; /* subexpressions still need to be
2955 * cleared */
2956#ifdef FEAT_SYN_HL
2957static int need_clear_zsubexpr = FALSE; /* extmatch subexpressions
2958 * still need to be cleared */
2959#endif
2960
2961static int out_of_stack; /* TRUE when ran out of stack space */
2962
2963/*
2964 * Structure used to save the current input state, when it needs to be
2965 * restored after trying a match. Used by reg_save() and reg_restore().
2966 */
2967typedef struct
2968{
2969 union
2970 {
2971 char_u *ptr; /* reginput pointer, for single-line regexp */
2972 lpos_T pos; /* reginput pos, for multi-line regexp */
2973 } rs_u;
2974} regsave_T;
2975
2976/* struct to save start/end pointer/position in for \(\) */
2977typedef struct
2978{
2979 union
2980 {
2981 char_u *ptr;
2982 lpos_T pos;
2983 } se_u;
2984} save_se_T;
2985
2986static char_u *reg_getline __ARGS((linenr_T lnum));
2987static long vim_regexec_both __ARGS((char_u *line, colnr_T col));
2988static long regtry __ARGS((regprog_T *prog, colnr_T col));
2989static void cleanup_subexpr __ARGS((void));
2990#ifdef FEAT_SYN_HL
2991static void cleanup_zsubexpr __ARGS((void));
2992#endif
2993static void reg_nextline __ARGS((void));
2994static void reg_save __ARGS((regsave_T *save));
2995static void reg_restore __ARGS((regsave_T *save));
2996static int reg_save_equal __ARGS((regsave_T *save));
2997static void save_se_multi __ARGS((save_se_T *savep, lpos_T *posp));
2998static void save_se_one __ARGS((save_se_T *savep, char_u **pp));
2999
/*
 * Save the sub-expressions before attempting a match: uses "posp" for a
 * multi-line match and "pp" otherwise.
 * The expansion is wrapped in parentheses so that the "?:" cannot combine
 * with operators next to the macro call.
 */
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))

/*
 * After a failed match restore the sub-expressions.
 * Note that this expands to a { } block, not to an expression.
 */
#define restore_se(savep, posp, pp) { \
    if (REG_MULTI) \
        *(posp) = (savep)->se_u.pos; \
    else \
        *(pp) = (savep)->se_u.ptr; }
3010
3011static int re_num_cmp __ARGS((long_u val, char_u *scan));
Bram Moolenaardf177f62005-02-22 08:39:57 +00003012static int regmatch __ARGS((char_u *prog, regsave_T *startp));
Bram Moolenaar071d4272004-06-13 20:20:40 +00003013static int regrepeat __ARGS((char_u *p, long maxcount));
3014
3015#ifdef DEBUG
3016int regnarrate = 0;
3017#endif
3018
3019/*
3020 * Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
3021 * Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
3022 * contains '\c' or '\C' the value is overruled.
3023 */
3024static int ireg_ic;
3025
3026#ifdef FEAT_MBYTE
3027/*
3028 * Similar to ireg_ic, but only for 'combining' characters. Set with \Z flag
3029 * in the regexp. Defaults to false, always.
3030 */
3031static int ireg_icombine;
3032#endif
3033
3034/*
3035 * Sometimes need to save a copy of a line. Since alloc()/free() is very
3036 * slow, we keep one allocated piece of memory and only re-allocate it when
3037 * it's too small. It's freed in vim_regexec_both() when finished.
3038 */
3039static char_u *reg_tofree;
3040static unsigned reg_tofreelen;
3041
3042/*
3043 * These variables are set when executing a regexp to speed up the execution.
3044 * Which ones are set depends on whether a single-line or multi-line match is
3045 * done:
3046 * single-line multi-line
3047 * reg_match &regmatch_T NULL
3048 * reg_mmatch NULL &regmmatch_T
3049 * reg_startp reg_match->startp <invalid>
3050 * reg_endp reg_match->endp <invalid>
3051 * reg_startpos <invalid> reg_mmatch->startpos
3052 * reg_endpos <invalid> reg_mmatch->endpos
3053 * reg_win NULL window in which to search
3054 * reg_buf <invalid> buffer in which to search
3055 * reg_firstlnum <invalid> first line in which to search
3056 * reg_maxline 0 last line nr
3057 * reg_line_lbr FALSE or TRUE FALSE
3058 */
3059static regmatch_T *reg_match;
3060static regmmatch_T *reg_mmatch;
3061static char_u **reg_startp = NULL;
3062static char_u **reg_endp = NULL;
3063static lpos_T *reg_startpos = NULL;
3064static lpos_T *reg_endpos = NULL;
3065static win_T *reg_win;
3066static buf_T *reg_buf;
3067static linenr_T reg_firstlnum;
3068static linenr_T reg_maxline;
3069static int reg_line_lbr; /* "\n" in string is line break */
3070
3071/*
3072 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
3073 */
3074 static char_u *
3075reg_getline(lnum)
3076 linenr_T lnum;
3077{
3078 /* when looking behind for a match/no-match lnum is negative. But we
3079 * can't go before line 1 */
3080 if (reg_firstlnum + lnum < 1)
3081 return NULL;
3082 return ml_get_buf(reg_buf, reg_firstlnum + lnum, FALSE);
3083}
3084
3085static regsave_T behind_pos;
3086
3087#ifdef FEAT_SYN_HL
3088static char_u *reg_startzp[NSUBEXP]; /* Workspace to mark beginning */
3089static char_u *reg_endzp[NSUBEXP]; /* and end of \z(...\) matches */
3090static lpos_T reg_startzpos[NSUBEXP]; /* idem, beginning pos */
3091static lpos_T reg_endzpos[NSUBEXP]; /* idem, end pos */
3092#endif
3093
3094/* TRUE if using multi-line regexp. */
3095#define REG_MULTI (reg_match == NULL)
3096
3097/*
3098 * Match a regexp against a string.
3099 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3100 * Uses curbuf for line count and 'iskeyword'.
3101 *
3102 * Return TRUE if there is a match, FALSE if not.
3103 */
3104 int
3105vim_regexec(rmp, line, col)
3106 regmatch_T *rmp;
3107 char_u *line; /* string to match against */
3108 colnr_T col; /* column to start looking for match */
3109{
3110 reg_match = rmp;
3111 reg_mmatch = NULL;
3112 reg_maxline = 0;
3113 reg_line_lbr = FALSE;
3114 reg_win = NULL;
3115 ireg_ic = rmp->rm_ic;
3116#ifdef FEAT_MBYTE
3117 ireg_icombine = FALSE;
3118#endif
3119 return (vim_regexec_both(line, col) != 0);
3120}
3121
#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
        || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
/*
 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
 *
 * Return TRUE if there is a match, FALSE if not.
 */
    int
vim_regexec_nl(rmp, line, col)
    regmatch_T  *rmp;
    char_u      *line;  /* string to match against */
    colnr_T     col;    /* column to start looking for match */
{
    long        nlines;

    /* Set up for a single-line match in which "\n" is a line break. */
    reg_match = rmp;
    reg_mmatch = NULL;
    reg_win = NULL;
    reg_maxline = 0;
    reg_line_lbr = TRUE;

    ireg_ic = rmp->rm_ic;
#ifdef FEAT_MBYTE
    ireg_icombine = FALSE;
#endif

    nlines = vim_regexec_both(line, col);
    return (nlines != 0);
}
#endif
3145
3146/*
3147 * Match a regexp against multiple lines.
3148 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3149 * Uses curbuf for line count and 'iskeyword'.
3150 *
3151 * Return zero if there is no match. Return number of lines contained in the
3152 * match otherwise.
3153 */
3154 long
3155vim_regexec_multi(rmp, win, buf, lnum, col)
3156 regmmatch_T *rmp;
3157 win_T *win; /* window in which to search or NULL */
3158 buf_T *buf; /* buffer in which to search */
3159 linenr_T lnum; /* nr of line to start looking for match */
3160 colnr_T col; /* column to start looking for match */
3161{
3162 long r;
3163 buf_T *save_curbuf = curbuf;
3164
3165 reg_match = NULL;
3166 reg_mmatch = rmp;
3167 reg_buf = buf;
3168 reg_win = win;
3169 reg_firstlnum = lnum;
3170 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
3171 reg_line_lbr = FALSE;
3172 ireg_ic = rmp->rmm_ic;
3173#ifdef FEAT_MBYTE
3174 ireg_icombine = FALSE;
3175#endif
3176
3177 /* Need to switch to buffer "buf" to make vim_iswordc() work. */
3178 curbuf = buf;
3179 r = vim_regexec_both(NULL, col);
3180 curbuf = save_curbuf;
3181
3182 return r;
3183}
3184
3185/*
3186 * Match a regexp against a string ("line" points to the string) or multiple
3187 * lines ("line" is NULL, use reg_getline()).
3188 */
3189#ifdef HAVE_SETJMP_H
3190 static long
3191vim_regexec_both(line_arg, col_arg)
3192 char_u *line_arg;
3193 colnr_T col_arg; /* column to start looking for match */
3194#else
3195 static long
3196vim_regexec_both(line, col)
3197 char_u *line;
3198 colnr_T col; /* column to start looking for match */
3199#endif
3200{
3201 regprog_T *prog;
3202 char_u *s;
3203 long retval;
3204#ifdef HAVE_SETJMP_H
3205 char_u *line;
3206 colnr_T col;
Bram Moolenaar748bf032005-02-02 23:04:36 +00003207 int did_mch_startjmp = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00003208#endif
3209
3210 reg_tofree = NULL;
3211
Bram Moolenaar071d4272004-06-13 20:20:40 +00003212#ifdef HAVE_SETJMP_H
Bram Moolenaar071d4272004-06-13 20:20:40 +00003213 /* Trick to avoid "might be clobbered by `longjmp'" warning from gcc. */
3214 line = line_arg;
3215 col = col_arg;
3216#endif
3217 retval = 0L;
3218
3219 if (REG_MULTI)
3220 {
3221 prog = reg_mmatch->regprog;
3222 line = reg_getline((linenr_T)0);
3223 reg_startpos = reg_mmatch->startpos;
3224 reg_endpos = reg_mmatch->endpos;
3225 }
3226 else
3227 {
3228 prog = reg_match->regprog;
3229 reg_startp = reg_match->startp;
3230 reg_endp = reg_match->endp;
3231 }
3232
3233 /* Be paranoid... */
3234 if (prog == NULL || line == NULL)
3235 {
3236 EMSG(_(e_null));
3237 goto theend;
3238 }
3239
3240 /* Check validity of program. */
3241 if (prog_magic_wrong())
3242 goto theend;
3243
3244 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
3245 if (prog->regflags & RF_ICASE)
3246 ireg_ic = TRUE;
3247 else if (prog->regflags & RF_NOICASE)
3248 ireg_ic = FALSE;
3249
3250#ifdef FEAT_MBYTE
3251 /* If pattern contains "\Z" overrule value of ireg_icombine */
3252 if (prog->regflags & RF_ICOMBINE)
3253 ireg_icombine = TRUE;
3254#endif
3255
3256 /* If there is a "must appear" string, look for it. */
3257 if (prog->regmust != NULL)
3258 {
3259 int c;
3260
3261#ifdef FEAT_MBYTE
3262 if (has_mbyte)
3263 c = (*mb_ptr2char)(prog->regmust);
3264 else
3265#endif
3266 c = *prog->regmust;
3267 s = line + col;
Bram Moolenaar05159a02005-02-26 23:04:13 +00003268
3269 /*
3270 * This is used very often, esp. for ":global". Use three versions of
3271 * the loop to avoid overhead of conditions.
3272 */
3273 if (!ireg_ic
3274#ifdef FEAT_MBYTE
3275 && !has_mbyte
3276#endif
3277 )
3278 while ((s = vim_strbyte(s, c)) != NULL)
3279 {
3280 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
3281 break; /* Found it. */
3282 ++s;
3283 }
3284#ifdef FEAT_MBYTE
3285 else if (!ireg_ic || (!enc_utf8 && mb_char2len(c) > 1))
3286 while ((s = vim_strchr(s, c)) != NULL)
3287 {
3288 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
3289 break; /* Found it. */
3290 mb_ptr_adv(s);
3291 }
3292#endif
3293 else
3294 while ((s = cstrchr(s, c)) != NULL)
3295 {
3296 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
3297 break; /* Found it. */
3298 mb_ptr_adv(s);
3299 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00003300 if (s == NULL) /* Not present. */
3301 goto theend;
3302 }
3303
Bram Moolenaar748bf032005-02-02 23:04:36 +00003304#ifdef HAVE_TRY_EXCEPT
3305 __try
3306 {
3307#endif
3308
3309#ifdef HAVE_SETJMP_H
3310 /*
3311 * Matching with a regexp may cause a very deep recursive call of
3312 * regmatch(). Vim will crash when running out of stack space. Catch
3313 * this here if the system supports it.
3314 * It's a bit slow, do it after the check for "regmust".
3315 * Don't do it if the caller already set it up.
3316 */
3317 if (!lc_active)
3318 {
3319 did_mch_startjmp = TRUE;
3320 mch_startjmp();
3321 if (SETJMP(lc_jump_env) != 0)
3322 {
3323 mch_didjmp();
3324# ifdef SIGHASARG
3325 if (lc_signal != SIGINT)
3326# endif
3327 EMSG(_(e_complex));
3328 retval = 0L;
3329 goto inner_end;
3330 }
3331 }
3332#endif
3333
Bram Moolenaar071d4272004-06-13 20:20:40 +00003334 regline = line;
3335 reglnum = 0;
3336 out_of_stack = FALSE;
3337
3338 /* Simplest case: Anchored match need be tried only once. */
3339 if (prog->reganch)
3340 {
3341 int c;
3342
3343#ifdef FEAT_MBYTE
3344 if (has_mbyte)
3345 c = (*mb_ptr2char)(regline + col);
3346 else
3347#endif
3348 c = regline[col];
3349 if (prog->regstart == NUL
3350 || prog->regstart == c
3351 || (ireg_ic && ((
3352#ifdef FEAT_MBYTE
3353 (enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
3354 || (c < 255 && prog->regstart < 255 &&
3355#endif
3356 TOLOWER_LOC(prog->regstart) == TOLOWER_LOC(c)))))
3357 retval = regtry(prog, col);
3358 else
3359 retval = 0;
3360 }
3361 else
3362 {
3363 /* Messy cases: unanchored match. */
3364 while (!got_int && !out_of_stack)
3365 {
3366 if (prog->regstart != NUL)
3367 {
Bram Moolenaar05159a02005-02-26 23:04:13 +00003368 /* Skip until the char we know it must start with.
3369 * Used often, do some work to avoid call overhead. */
3370 if (!ireg_ic
3371#ifdef FEAT_MBYTE
3372 && !has_mbyte
3373#endif
3374 )
3375 s = vim_strbyte(regline + col, prog->regstart);
3376 else
3377 s = cstrchr(regline + col, prog->regstart);
Bram Moolenaar071d4272004-06-13 20:20:40 +00003378 if (s == NULL)
3379 {
3380 retval = 0;
3381 break;
3382 }
3383 col = (int)(s - regline);
3384 }
3385
3386 retval = regtry(prog, col);
3387 if (retval > 0)
3388 break;
3389
3390 /* if not currently on the first line, get it again */
3391 if (reglnum != 0)
3392 {
3393 regline = reg_getline((linenr_T)0);
3394 reglnum = 0;
3395 }
3396 if (regline[col] == NUL)
3397 break;
3398#ifdef FEAT_MBYTE
3399 if (has_mbyte)
3400 col += (*mb_ptr2len_check)(regline + col);
3401 else
3402#endif
3403 ++col;
3404 }
3405 }
3406
3407 if (out_of_stack)
Bram Moolenaar748bf032005-02-02 23:04:36 +00003408 EMSG(_(e_outofstack));
Bram Moolenaar071d4272004-06-13 20:20:40 +00003409
Bram Moolenaar748bf032005-02-02 23:04:36 +00003410#ifdef HAVE_SETJMP_H
3411inner_end:
Bram Moolenaar05159a02005-02-26 23:04:13 +00003412 if (did_mch_startjmp)
3413 mch_endjmp();
Bram Moolenaar748bf032005-02-02 23:04:36 +00003414#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00003415#ifdef HAVE_TRY_EXCEPT
3416 }
3417 __except(EXCEPTION_EXECUTE_HANDLER)
3418 {
3419 if (GetExceptionCode() == EXCEPTION_STACK_OVERFLOW)
3420 {
3421 RESETSTKOFLW();
Bram Moolenaar748bf032005-02-02 23:04:36 +00003422 EMSG(_(e_outofstack));
Bram Moolenaar071d4272004-06-13 20:20:40 +00003423 }
3424 else
Bram Moolenaar748bf032005-02-02 23:04:36 +00003425 EMSG(_(e_complex));
Bram Moolenaar071d4272004-06-13 20:20:40 +00003426 retval = 0L;
3427 }
3428#endif
3429
3430theend:
3431 /* Didn't find a match. */
3432 vim_free(reg_tofree);
Bram Moolenaar071d4272004-06-13 20:20:40 +00003433 return retval;
3434}
3435
3436#ifdef FEAT_SYN_HL
3437static reg_extmatch_T *make_extmatch __ARGS((void));
3438
3439/*
3440 * Create a new extmatch and mark it as referenced once.
3441 */
3442 static reg_extmatch_T *
3443make_extmatch()
3444{
3445 reg_extmatch_T *em;
3446
3447 em = (reg_extmatch_T *)alloc_clear((unsigned)sizeof(reg_extmatch_T));
3448 if (em != NULL)
3449 em->refcnt = 1;
3450 return em;
3451}
3452
3453/*
3454 * Add a reference to an extmatch.
3455 */
3456 reg_extmatch_T *
3457ref_extmatch(em)
3458 reg_extmatch_T *em;
3459{
3460 if (em != NULL)
3461 em->refcnt++;
3462 return em;
3463}
3464
3465/*
3466 * Remove a reference to an extmatch. If there are no references left, free
3467 * the info.
3468 */
3469 void
3470unref_extmatch(em)
3471 reg_extmatch_T *em;
3472{
3473 int i;
3474
3475 if (em != NULL && --em->refcnt <= 0)
3476 {
3477 for (i = 0; i < NSUBEXP; ++i)
3478 vim_free(em->matches[i]);
3479 vim_free(em);
3480 }
3481}
3482#endif
3483
3484/*
3485 * regtry - try match of "prog" with at regline["col"].
3486 * Returns 0 for failure, number of lines contained in the match otherwise.
3487 */
3488 static long
3489regtry(prog, col)
3490 regprog_T *prog;
3491 colnr_T col;
3492{
3493 reginput = regline + col;
3494 need_clear_subexpr = TRUE;
3495#ifdef FEAT_SYN_HL
3496 /* Clear the external match subpointers if necessary. */
3497 if (prog->reghasz == REX_SET)
3498 need_clear_zsubexpr = TRUE;
3499#endif
3500
Bram Moolenaardf177f62005-02-22 08:39:57 +00003501 if (regmatch(prog->program + 1, NULL))
Bram Moolenaar071d4272004-06-13 20:20:40 +00003502 {
3503 cleanup_subexpr();
3504 if (REG_MULTI)
3505 {
3506 if (reg_startpos[0].lnum < 0)
3507 {
3508 reg_startpos[0].lnum = 0;
3509 reg_startpos[0].col = col;
3510 }
3511 if (reg_endpos[0].lnum < 0)
3512 {
3513 reg_endpos[0].lnum = reglnum;
3514 reg_endpos[0].col = (int)(reginput - regline);
3515 }
3516 else
3517 /* Use line number of "\ze". */
3518 reglnum = reg_endpos[0].lnum;
3519 }
3520 else
3521 {
3522 if (reg_startp[0] == NULL)
3523 reg_startp[0] = regline + col;
3524 if (reg_endp[0] == NULL)
3525 reg_endp[0] = reginput;
3526 }
3527#ifdef FEAT_SYN_HL
3528 /* Package any found \z(...\) matches for export. Default is none. */
3529 unref_extmatch(re_extmatch_out);
3530 re_extmatch_out = NULL;
3531
3532 if (prog->reghasz == REX_SET)
3533 {
3534 int i;
3535
3536 cleanup_zsubexpr();
3537 re_extmatch_out = make_extmatch();
3538 for (i = 0; i < NSUBEXP; i++)
3539 {
3540 if (REG_MULTI)
3541 {
3542 /* Only accept single line matches. */
3543 if (reg_startzpos[i].lnum >= 0
3544 && reg_endzpos[i].lnum == reg_startzpos[i].lnum)
3545 re_extmatch_out->matches[i] =
3546 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3547 + reg_startzpos[i].col,
3548 reg_endzpos[i].col - reg_startzpos[i].col);
3549 }
3550 else
3551 {
3552 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3553 re_extmatch_out->matches[i] =
3554 vim_strnsave(reg_startzp[i],
3555 (int)(reg_endzp[i] - reg_startzp[i]));
3556 }
3557 }
3558 }
3559#endif
3560 return 1 + reglnum;
3561 }
3562 return 0;
3563}
3564
3565#ifdef FEAT_MBYTE
Bram Moolenaar071d4272004-06-13 20:20:40 +00003566static int reg_prev_class __ARGS((void));
3567
Bram Moolenaar071d4272004-06-13 20:20:40 +00003568/*
3569 * Get class of previous character.
3570 */
3571 static int
3572reg_prev_class()
3573{
3574 if (reginput > regline)
3575 return mb_get_class(reginput - 1
3576 - (*mb_head_off)(regline, reginput - 1));
3577 return -1;
3578}
3579
Bram Moolenaar071d4272004-06-13 20:20:40 +00003580#endif
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00003581#define ADVANCE_REGINPUT() mb_ptr_adv(reginput)
Bram Moolenaar071d4272004-06-13 20:20:40 +00003582
3583/*
3584 * The arguments from BRACE_LIMITS are stored here. They are actually local
3585 * to regmatch(), but they are here to reduce the amount of stack space used
3586 * (it can be called recursively many times).
3587 */
3588static long bl_minval;
3589static long bl_maxval;
3590
3591/*
3592 * regmatch - main matching routine
3593 *
3594 * Conceptually the strategy is simple: Check to see whether the current
3595 * node matches, call self recursively to see whether the rest matches,
3596 * and then act accordingly. In practice we make some effort to avoid
3597 * recursion, in particular by going through "ordinary" nodes (that don't
3598 * need to know whether the rest of the match failed) by a loop instead of
3599 * by recursion.
3600 *
3601 * Returns TRUE when there is a match. Leaves reginput and reglnum just after
3602 * the last matched character.
3603 * Returns FALSE when there is no match. Leaves reginput and reglnum in an
3604 * undefined state!
 *
 * "scan" is the node of the compiled program where matching starts.
 * "startp" is either NULL or a saved input position; the BACK node fails
 * when the current position equals it, i.e. when nothing non-empty was
 * matched since that position was saved.
 * Also returns FALSE when interrupted (got_int) or out of stack space
 * (out_of_stack).
3605 */
3606 static int
Bram Moolenaardf177f62005-02-22 08:39:57 +00003607regmatch(scan, startp)
Bram Moolenaar071d4272004-06-13 20:20:40 +00003608 char_u *scan; /* Current node. */
Bram Moolenaardf177f62005-02-22 08:39:57 +00003609 regsave_T *startp; /* start position for BACK */
Bram Moolenaar071d4272004-06-13 20:20:40 +00003610{
3611 char_u *next; /* Next node. */
3612 int op;
3613 int c;
3614
3615#ifdef HAVE_GETRLIMIT
3616 /* Check if we are running out of stack space. Could be caused by
3617 * recursively calling ourselves. */
3618 if (out_of_stack || mch_stackcheck((char *)&op) == FAIL)
3619 {
3620 out_of_stack = TRUE;
3621 return FALSE;
3622 }
3623#endif
3624
3625 /* Some patterns may cause a long time to match, even though they are not
3626 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */
3627 fast_breakcheck();
3628
3629#ifdef DEBUG
3630 if (scan != NULL && regnarrate)
3631 {
3632 mch_errmsg(regprop(scan));
3633 mch_errmsg("(\n");
3634 }
3635#endif
3636 while (scan != NULL)
3637 {
3638 if (got_int || out_of_stack)
3639 return FALSE;
3640#ifdef DEBUG
3641 if (regnarrate)
3642 {
3643 mch_errmsg(regprop(scan));
3644 mch_errmsg("...\n");
3645# ifdef FEAT_SYN_HL
3646 if (re_extmatch_in != NULL)
3647 {
3648 int i;
3649
3650 mch_errmsg(_("External submatches:\n"));
3651 for (i = 0; i < NSUBEXP; i++)
3652 {
3653 mch_errmsg(" \"");
3654 if (re_extmatch_in->matches[i] != NULL)
3655 mch_errmsg(re_extmatch_in->matches[i]);
3656 mch_errmsg("\"\n");
3657 }
3658 }
3659# endif
3660 }
3661#endif
3662 next = regnext(scan);
3663
3664 op = OP(scan);
3665 /* Check for character class with NL added. */
3666 if (WITH_NL(op) && *reginput == NUL && reglnum < reg_maxline)
3667 {
3668 reg_nextline();
3669 }
3670 else if (reg_line_lbr && WITH_NL(op) && *reginput == '\n')
3671 {
3672 ADVANCE_REGINPUT();
3673 }
3674 else
3675 {
3676 if (WITH_NL(op))
3677 op -= ADD_NL;
3678#ifdef FEAT_MBYTE
3679 if (has_mbyte)
3680 c = (*mb_ptr2char)(reginput);
3681 else
3682#endif
3683 c = *reginput;
3684 switch (op)
3685 {
3686 case BOL:
3687 if (reginput != regline)
3688 return FALSE;
3689 break;
3690
3691 case EOL:
3692 if (c != NUL)
3693 return FALSE;
3694 break;
3695
3696 case RE_BOF:
3697 /* Passing -1 to the getline() function provided for the search
3698 * should always return NULL if the current line is the first
3699 * line of the file. */
3700 if (reglnum != 0 || reginput != regline
3701 || (REG_MULTI && reg_getline((linenr_T)-1) != NULL))
3702 return FALSE;
3703 break;
3704
3705 case RE_EOF:
3706 if (reglnum != reg_maxline || c != NUL)
3707 return FALSE;
3708 break;
3709
3710 case CURSOR:
3711 /* Check if the buffer is in a window and compare the
3712 * reg_win->w_cursor position to the match position. */
3713 if (reg_win == NULL
3714 || (reglnum + reg_firstlnum != reg_win->w_cursor.lnum)
3715 || ((colnr_T)(reginput - regline) != reg_win->w_cursor.col))
3716 return FALSE;
3717 break;
3718
3719 case RE_LNUM:
3720 if (!REG_MULTI || !re_num_cmp((long_u)(reglnum + reg_firstlnum),
3721 scan))
3722 return FALSE;
3723 break;
3724
3725 case RE_COL:
3726 if (!re_num_cmp((long_u)(reginput - regline) + 1, scan))
3727 return FALSE;
3728 break;
3729
3730 case RE_VCOL:
3731 if (!re_num_cmp((long_u)win_linetabsize(
3732 reg_win == NULL ? curwin : reg_win,
3733 regline, (colnr_T)(reginput - regline)) + 1, scan))
3734 return FALSE;
3735 break;
3736
3737 case BOW: /* \<word; reginput points to w */
3738 if (c == NUL) /* Can't match at end of line */
3739 return FALSE;
3740#ifdef FEAT_MBYTE
3741 if (has_mbyte)
3742 {
3743 int this_class;
3744
3745 /* Get class of current and previous char (if it exists). */
3746 this_class = mb_get_class(reginput);
3747 if (this_class <= 1)
3748 return FALSE; /* not on a word at all */
3749 if (reg_prev_class() == this_class)
3750 return FALSE; /* previous char is in same word */
3751 }
3752#endif
 /* NOTE: without FEAT_MBYTE this "else" pairs with the "c == NUL" check
 * above; that check returns, so the block below still runs whenever
 * c is not NUL. */
3753 else
3754 {
3755 if (!vim_iswordc(c)
3756 || (reginput > regline && vim_iswordc(reginput[-1])))
3757 return FALSE;
3758 }
3759 break;
3760
3761 case EOW: /* word\>; reginput points after d */
3762 if (reginput == regline) /* Can't match at start of line */
3763 return FALSE;
3764#ifdef FEAT_MBYTE
3765 if (has_mbyte)
3766 {
3767 int this_class, prev_class;
3768
3769 /* Get class of current and previous char (if it exists). */
3770 this_class = mb_get_class(reginput);
3771 prev_class = reg_prev_class();
3772 if (this_class == prev_class)
3773 return FALSE;
3774 if (prev_class == 0 || prev_class == 1)
3775 return FALSE;
3776 }
3777 else
3778#endif
3779 {
3780 if (!vim_iswordc(reginput[-1]))
3781 return FALSE;
3782 if (reginput[0] != NUL && vim_iswordc(c))
3783 return FALSE;
3784 }
3785 break; /* Matched with EOW */
3786
3787 case ANY:
3788 if (c == NUL)
3789 return FALSE;
3790 ADVANCE_REGINPUT();
3791 break;
3792
3793 case IDENT:
3794 if (!vim_isIDc(c))
3795 return FALSE;
3796 ADVANCE_REGINPUT();
3797 break;
3798
3799 case SIDENT:
3800 if (VIM_ISDIGIT(*reginput) || !vim_isIDc(c))
3801 return FALSE;
3802 ADVANCE_REGINPUT();
3803 break;
3804
3805 case KWORD:
3806 if (!vim_iswordp(reginput))
3807 return FALSE;
3808 ADVANCE_REGINPUT();
3809 break;
3810
3811 case SKWORD:
3812 if (VIM_ISDIGIT(*reginput) || !vim_iswordp(reginput))
3813 return FALSE;
3814 ADVANCE_REGINPUT();
3815 break;
3816
3817 case FNAME:
3818 if (!vim_isfilec(c))
3819 return FALSE;
3820 ADVANCE_REGINPUT();
3821 break;
3822
3823 case SFNAME:
3824 if (VIM_ISDIGIT(*reginput) || !vim_isfilec(c))
3825 return FALSE;
3826 ADVANCE_REGINPUT();
3827 break;
3828
3829 case PRINT:
3830 if (ptr2cells(reginput) != 1)
3831 return FALSE;
3832 ADVANCE_REGINPUT();
3833 break;
3834
3835 case SPRINT:
3836 if (VIM_ISDIGIT(*reginput) || ptr2cells(reginput) != 1)
3837 return FALSE;
3838 ADVANCE_REGINPUT();
3839 break;
3840
3841 case WHITE:
3842 if (!vim_iswhite(c))
3843 return FALSE;
3844 ADVANCE_REGINPUT();
3845 break;
3846
3847 case NWHITE:
3848 if (c == NUL || vim_iswhite(c))
3849 return FALSE;
3850 ADVANCE_REGINPUT();
3851 break;
3852
3853 case DIGIT:
3854 if (!ri_digit(c))
3855 return FALSE;
3856 ADVANCE_REGINPUT();
3857 break;
3858
3859 case NDIGIT:
3860 if (c == NUL || ri_digit(c))
3861 return FALSE;
3862 ADVANCE_REGINPUT();
3863 break;
3864
3865 case HEX:
3866 if (!ri_hex(c))
3867 return FALSE;
3868 ADVANCE_REGINPUT();
3869 break;
3870
3871 case NHEX:
3872 if (c == NUL || ri_hex(c))
3873 return FALSE;
3874 ADVANCE_REGINPUT();
3875 break;
3876
3877 case OCTAL:
3878 if (!ri_octal(c))
3879 return FALSE;
3880 ADVANCE_REGINPUT();
3881 break;
3882
3883 case NOCTAL:
3884 if (c == NUL || ri_octal(c))
3885 return FALSE;
3886 ADVANCE_REGINPUT();
3887 break;
3888
3889 case WORD:
3890 if (!ri_word(c))
3891 return FALSE;
3892 ADVANCE_REGINPUT();
3893 break;
3894
3895 case NWORD:
3896 if (c == NUL || ri_word(c))
3897 return FALSE;
3898 ADVANCE_REGINPUT();
3899 break;
3900
3901 case HEAD:
3902 if (!ri_head(c))
3903 return FALSE;
3904 ADVANCE_REGINPUT();
3905 break;
3906
3907 case NHEAD:
3908 if (c == NUL || ri_head(c))
3909 return FALSE;
3910 ADVANCE_REGINPUT();
3911 break;
3912
3913 case ALPHA:
3914 if (!ri_alpha(c))
3915 return FALSE;
3916 ADVANCE_REGINPUT();
3917 break;
3918
3919 case NALPHA:
3920 if (c == NUL || ri_alpha(c))
3921 return FALSE;
3922 ADVANCE_REGINPUT();
3923 break;
3924
3925 case LOWER:
3926 if (!ri_lower(c))
3927 return FALSE;
3928 ADVANCE_REGINPUT();
3929 break;
3930
3931 case NLOWER:
3932 if (c == NUL || ri_lower(c))
3933 return FALSE;
3934 ADVANCE_REGINPUT();
3935 break;
3936
3937 case UPPER:
3938 if (!ri_upper(c))
3939 return FALSE;
3940 ADVANCE_REGINPUT();
3941 break;
3942
3943 case NUPPER:
3944 if (c == NUL || ri_upper(c))
3945 return FALSE;
3946 ADVANCE_REGINPUT();
3947 break;
3948
3949 case EXACTLY:
3950 {
3951 int len;
3952 char_u *opnd;
3953
3954 opnd = OPERAND(scan);
3955 /* Inline the first byte, for speed. */
3956 if (*opnd != *reginput
3957 && (!ireg_ic || (
3958#ifdef FEAT_MBYTE
3959 !enc_utf8 &&
3960#endif
3961 TOLOWER_LOC(*opnd) != TOLOWER_LOC(*reginput))))
3962 return FALSE;
3963 if (*opnd == NUL)
3964 {
3965 /* match empty string always works; happens when "~" is
3966 * empty. */
3967 }
3968 else if (opnd[1] == NUL
3969#ifdef FEAT_MBYTE
3970 && !(enc_utf8 && ireg_ic)
3971#endif
3972 )
3973 ++reginput; /* matched a single char */
3974 else
3975 {
3976 len = (int)STRLEN(opnd);
3977 /* Need to match first byte again for multi-byte. */
3978 if (cstrncmp(opnd, reginput, &len) != 0)
3979 return FALSE;
3980#ifdef FEAT_MBYTE
3981 /* Check for following composing character. */
3982 if (enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len))
3983 {
3984 /* raaron: This code makes a composing character get
3985 * ignored, which is the correct behavior (sometimes)
3986 * for voweled Hebrew texts. */
3987 if (!ireg_icombine)
3988 return FALSE;
3989 }
3990 else
3991#endif
3992 reginput += len;
3993 }
3994 }
3995 break;
3996
3997 case ANYOF:
3998 case ANYBUT:
3999 if (c == NUL)
4000 return FALSE;
4001 if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
4002 return FALSE;
4003 ADVANCE_REGINPUT();
4004 break;
4005
4006#ifdef FEAT_MBYTE
4007 case MULTIBYTECODE:
4008 if (has_mbyte)
4009 {
4010 int i, len;
4011 char_u *opnd;
4012
4013 opnd = OPERAND(scan);
4014 /* Safety check (just in case 'encoding' was changed since
4015 * compiling the program). */
4016 if ((len = (*mb_ptr2len_check)(opnd)) < 2)
4017 return FALSE;
4018 for (i = 0; i < len; ++i)
4019 if (opnd[i] != reginput[i])
4020 return FALSE;
4021 reginput += len;
4022 }
4023 else
4024 return FALSE;
4025 break;
4026#endif
4027
4028 case NOTHING:
4029 break;
4030
4031 case BACK:
Bram Moolenaardf177f62005-02-22 08:39:57 +00004032 /* When we run into BACK without matching something non-empty, we
4033 * fail. */
4034 if (startp != NULL && reg_save_equal(startp))
4035 return FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00004036 break;
4037
4038 case MOPEN + 0: /* Match start: \zs */
4039 case MOPEN + 1: /* \( */
4040 case MOPEN + 2:
4041 case MOPEN + 3:
4042 case MOPEN + 4:
4043 case MOPEN + 5:
4044 case MOPEN + 6:
4045 case MOPEN + 7:
4046 case MOPEN + 8:
4047 case MOPEN + 9:
4048 {
4049 int no;
4050 save_se_T save;
4051
4052 no = op - MOPEN;
4053 cleanup_subexpr();
4054 save_se(&save, &reg_startpos[no], &reg_startp[no]);
4055
Bram Moolenaardf177f62005-02-22 08:39:57 +00004056 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004057 return TRUE;
4058
4059 restore_se(&save, &reg_startpos[no], &reg_startp[no]);
4060 return FALSE;
4061 }
4062 /* break; Not Reached */
4063
4064 case NOPEN: /* \%( */
4065 case NCLOSE: /* \) after \%( */
Bram Moolenaardf177f62005-02-22 08:39:57 +00004066 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004067 return TRUE;
4068 return FALSE;
4069 /* break; Not Reached */
4070
4071#ifdef FEAT_SYN_HL
4072 case ZOPEN + 1:
4073 case ZOPEN + 2:
4074 case ZOPEN + 3:
4075 case ZOPEN + 4:
4076 case ZOPEN + 5:
4077 case ZOPEN + 6:
4078 case ZOPEN + 7:
4079 case ZOPEN + 8:
4080 case ZOPEN + 9:
4081 {
4082 int no;
4083 save_se_T save;
4084
4085 no = op - ZOPEN;
4086 cleanup_zsubexpr();
4087 save_se(&save, &reg_startzpos[no], &reg_startzp[no]);
4088
Bram Moolenaardf177f62005-02-22 08:39:57 +00004089 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004090 return TRUE;
4091
4092 restore_se(&save, &reg_startzpos[no], &reg_startzp[no]);
4093 return FALSE;
4094 }
4095 /* break; Not Reached */
4096#endif
4097
4098 case MCLOSE + 0: /* Match end: \ze */
4099 case MCLOSE + 1: /* \) */
4100 case MCLOSE + 2:
4101 case MCLOSE + 3:
4102 case MCLOSE + 4:
4103 case MCLOSE + 5:
4104 case MCLOSE + 6:
4105 case MCLOSE + 7:
4106 case MCLOSE + 8:
4107 case MCLOSE + 9:
4108 {
4109 int no;
4110 save_se_T save;
4111
4112 no = op - MCLOSE;
4113 cleanup_subexpr();
4114 save_se(&save, &reg_endpos[no], &reg_endp[no]);
4115
Bram Moolenaardf177f62005-02-22 08:39:57 +00004116 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004117 return TRUE;
4118
4119 restore_se(&save, &reg_endpos[no], &reg_endp[no]);
4120 return FALSE;
4121 }
4122 /* break; Not Reached */
4123
4124#ifdef FEAT_SYN_HL
4125 case ZCLOSE + 1: /* \) after \z( */
4126 case ZCLOSE + 2:
4127 case ZCLOSE + 3:
4128 case ZCLOSE + 4:
4129 case ZCLOSE + 5:
4130 case ZCLOSE + 6:
4131 case ZCLOSE + 7:
4132 case ZCLOSE + 8:
4133 case ZCLOSE + 9:
4134 {
4135 int no;
4136 save_se_T save;
4137
4138 no = op - ZCLOSE;
4139 cleanup_zsubexpr();
4140 save_se(&save, &reg_endzpos[no], &reg_endzp[no]);
4141
Bram Moolenaardf177f62005-02-22 08:39:57 +00004142 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004143 return TRUE;
4144
4145 restore_se(&save, &reg_endzpos[no], &reg_endzp[no]);
4146 return FALSE;
4147 }
4148 /* break; Not Reached */
4149#endif
4150
4151 case BACKREF + 1:
4152 case BACKREF + 2:
4153 case BACKREF + 3:
4154 case BACKREF + 4:
4155 case BACKREF + 5:
4156 case BACKREF + 6:
4157 case BACKREF + 7:
4158 case BACKREF + 8:
4159 case BACKREF + 9:
4160 {
4161 int no;
4162 int len;
4163 linenr_T clnum;
4164 colnr_T ccol;
4165 char_u *p;
4166
4167 no = op - BACKREF;
4168 cleanup_subexpr();
4169 if (!REG_MULTI) /* Single-line regexp */
4170 {
4171 if (reg_endp[no] == NULL)
4172 {
4173 /* Backref was not set: Match an empty string. */
4174 len = 0;
4175 }
4176 else
4177 {
4178 /* Compare current input with back-ref in the same
4179 * line. */
4180 len = (int)(reg_endp[no] - reg_startp[no]);
4181 if (cstrncmp(reg_startp[no], reginput, &len) != 0)
4182 return FALSE;
4183 }
4184 }
4185 else /* Multi-line regexp */
4186 {
4187 if (reg_endpos[no].lnum < 0)
4188 {
4189 /* Backref was not set: Match an empty string. */
4190 len = 0;
4191 }
4192 else
4193 {
4194 if (reg_startpos[no].lnum == reglnum
4195 && reg_endpos[no].lnum == reglnum)
4196 {
4197 /* Compare back-ref within the current line. */
4198 len = reg_endpos[no].col - reg_startpos[no].col;
4199 if (cstrncmp(regline + reg_startpos[no].col,
4200 reginput, &len) != 0)
4201 return FALSE;
4202 }
4203 else
4204 {
4205 /* Messy situation: Need to compare between two
4206 * lines. */
4207 ccol = reg_startpos[no].col;
4208 clnum = reg_startpos[no].lnum;
4209 for (;;)
4210 {
4211 /* Since getting one line may invalidate
4212 * the other, need to make copy. Slow! */
4213 if (regline != reg_tofree)
4214 {
4215 len = (int)STRLEN(regline);
4216 if (reg_tofree == NULL
4217 || len >= (int)reg_tofreelen)
4218 {
4219 len += 50; /* get some extra */
4220 vim_free(reg_tofree);
4221 reg_tofree = alloc(len);
4222 if (reg_tofree == NULL)
4223 return FALSE; /* out of memory! */
4224 reg_tofreelen = len;
4225 }
4226 STRCPY(reg_tofree, regline);
4227 reginput = reg_tofree
4228 + (reginput - regline);
4229 regline = reg_tofree;
4230 }
4231
4232 /* Get the line to compare with. */
4233 p = reg_getline(clnum);
4234 if (clnum == reg_endpos[no].lnum)
4235 len = reg_endpos[no].col - ccol;
4236 else
4237 len = (int)STRLEN(p + ccol);
4238
4239 if (cstrncmp(p + ccol, reginput, &len) != 0)
4240 return FALSE; /* doesn't match */
4241 if (clnum == reg_endpos[no].lnum)
4242 break; /* match and at end! */
4243 if (reglnum == reg_maxline)
4244 return FALSE; /* text too short */
4245
4246 /* Advance to next line. */
4247 reg_nextline();
4248 ++clnum;
4249 ccol = 0;
4250 if (got_int || out_of_stack)
4251 return FALSE;
4252 }
4253
4254 /* found a match! Note that regline may now point
4255 * to a copy of the line, that should not matter. */
4256 }
4257 }
4258 }
4259
4260 /* Matched the backref, skip over it. */
4261 reginput += len;
4262 }
4263 break;
4264
4265#ifdef FEAT_SYN_HL
4266 case ZREF + 1:
4267 case ZREF + 2:
4268 case ZREF + 3:
4269 case ZREF + 4:
4270 case ZREF + 5:
4271 case ZREF + 6:
4272 case ZREF + 7:
4273 case ZREF + 8:
4274 case ZREF + 9:
4275 {
4276 int no;
4277 int len;
4278
4279 cleanup_zsubexpr();
4280 no = op - ZREF;
4281 if (re_extmatch_in != NULL
4282 && re_extmatch_in->matches[no] != NULL)
4283 {
4284 len = (int)STRLEN(re_extmatch_in->matches[no]);
4285 if (cstrncmp(re_extmatch_in->matches[no],
4286 reginput, &len) != 0)
4287 return FALSE;
4288 reginput += len;
4289 }
4290 else
4291 {
4292 /* Backref was not set: Match an empty string. */
4293 }
4294 }
4295 break;
4296#endif
4297
4298 case BRANCH:
4299 {
4300 if (OP(next) != BRANCH) /* No choice. */
4301 next = OPERAND(scan); /* Avoid recursion. */
4302 else
4303 {
4304 regsave_T save;
4305
4306 do
4307 {
4308 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004309 if (regmatch(OPERAND(scan), &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004310 return TRUE;
4311 reg_restore(&save);
4312 scan = regnext(scan);
4313 } while (scan != NULL && OP(scan) == BRANCH);
4314 return FALSE;
4315 /* NOTREACHED */
4316 }
4317 }
4318 break;
4319
4320 case BRACE_LIMITS:
4321 {
4322 int no;
4323
4324 if (OP(next) == BRACE_SIMPLE)
4325 {
4326 bl_minval = OPERAND_MIN(scan);
4327 bl_maxval = OPERAND_MAX(scan);
4328 }
4329 else if (OP(next) >= BRACE_COMPLEX
4330 && OP(next) < BRACE_COMPLEX + 10)
4331 {
4332 no = OP(next) - BRACE_COMPLEX;
4333 brace_min[no] = OPERAND_MIN(scan);
4334 brace_max[no] = OPERAND_MAX(scan);
4335 brace_count[no] = 0;
4336 }
4337 else
4338 {
4339 EMSG(_(e_internal)); /* Shouldn't happen */
4340 return FALSE;
4341 }
4342 }
4343 break;
4344
4345 case BRACE_COMPLEX + 0:
4346 case BRACE_COMPLEX + 1:
4347 case BRACE_COMPLEX + 2:
4348 case BRACE_COMPLEX + 3:
4349 case BRACE_COMPLEX + 4:
4350 case BRACE_COMPLEX + 5:
4351 case BRACE_COMPLEX + 6:
4352 case BRACE_COMPLEX + 7:
4353 case BRACE_COMPLEX + 8:
4354 case BRACE_COMPLEX + 9:
4355 {
4356 int no;
4357 regsave_T save;
4358
4359 no = op - BRACE_COMPLEX;
4360 ++brace_count[no];
4361
4362 /* If not matched enough times yet, try one more */
4363 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4364 ? brace_min[no] : brace_max[no]))
4365 {
4366 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004367 if (regmatch(OPERAND(scan), &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004368 return TRUE;
4369 reg_restore(&save);
4370 --brace_count[no]; /* failed, decrement match count */
4371 return FALSE;
4372 }
4373
4374 /* If matched enough times, may try matching some more */
4375 if (brace_min[no] <= brace_max[no])
4376 {
4377 /* Range is the normal way around, use longest match */
4378 if (brace_count[no] <= brace_max[no])
4379 {
4380 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004381 if (regmatch(OPERAND(scan), &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004382 return TRUE; /* matched some more times */
4383 reg_restore(&save);
4384 --brace_count[no]; /* matched just enough times */
Bram Moolenaardf177f62005-02-22 08:39:57 +00004385 /* { continue with the items after \{} */
Bram Moolenaar071d4272004-06-13 20:20:40 +00004386 }
4387 }
4388 else
4389 {
4390 /* Range is backwards, use shortest match first */
4391 if (brace_count[no] <= brace_min[no])
4392 {
4393 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004394 if (regmatch(next, &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004395 return TRUE;
4396 reg_restore(&save);
4397 next = OPERAND(scan);
4398 /* must try to match one more item */
4399 }
4400 }
4401 }
4402 break;
4403
4404 case BRACE_SIMPLE:
4405 case STAR:
4406 case PLUS:
4407 {
4408 int nextb; /* next byte */
4409 int nextb_ic; /* next byte reverse case */
4410 long count;
4411 regsave_T save;
4412 long minval;
4413 long maxval;
4414
4415 /*
4416 * Lookahead to avoid useless match attempts when we know
4417 * what character comes next.
4418 */
4419 if (OP(next) == EXACTLY)
4420 {
4421 nextb = *OPERAND(next);
4422 if (ireg_ic)
4423 {
4424 if (isupper(nextb))
4425 nextb_ic = TOLOWER_LOC(nextb);
4426 else
4427 nextb_ic = TOUPPER_LOC(nextb);
4428 }
4429 else
4430 nextb_ic = nextb;
4431 }
4432 else
4433 {
4434 nextb = NUL;
4435 nextb_ic = NUL;
4436 }
4437 if (op != BRACE_SIMPLE)
4438 {
4439 minval = (op == STAR) ? 0 : 1;
4440 maxval = MAX_LIMIT;
4441 }
4442 else
4443 {
4444 minval = bl_minval;
4445 maxval = bl_maxval;
4446 }
4447
4448 /*
4449 * When maxval > minval, try matching as much as possible, up
4450 * to maxval. When maxval < minval, try matching at least the
4451 * minimal number (since the range is backwards, that's also
4452 * maxval!).
4453 */
4454 count = regrepeat(OPERAND(scan), maxval);
4455 if (got_int)
4456 return FALSE;
4457 if (minval <= maxval)
4458 {
4459 /* Range is the normal way around, use longest match */
4460 while (count >= minval)
4461 {
4462 /* If it could match, try it. */
4463 if (nextb == NUL || *reginput == nextb
4464 || *reginput == nextb_ic)
4465 {
4466 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004467 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004468 return TRUE;
4469 reg_restore(&save);
4470 }
4471 /* Couldn't or didn't match -- back up one char. */
4472 if (--count < minval)
4473 break;
4474 if (reginput == regline)
4475 {
4476 /* backup to last char of previous line */
4477 --reglnum;
4478 regline = reg_getline(reglnum);
4479 /* Just in case regrepeat() didn't count right. */
4480 if (regline == NULL)
4481 return FALSE;
4482 reginput = regline + STRLEN(regline);
4483 fast_breakcheck();
4484 if (got_int || out_of_stack)
4485 return FALSE;
4486 }
4487 else
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004488 mb_ptr_back(regline, reginput);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004489 }
4490 }
4491 else
4492 {
4493 /* Range is backwards, use shortest match first.
4494 * Careful: maxval and minval are exchanged! */
4495 if (count < maxval)
4496 return FALSE;
4497 for (;;)
4498 {
4499 /* If it could work, try it. */
4500 if (nextb == NUL || *reginput == nextb
4501 || *reginput == nextb_ic)
4502 {
4503 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004504 if (regmatch(next, &save))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004505 return TRUE;
4506 reg_restore(&save);
4507 }
4508 /* Couldn't or didn't match: try advancing one char. */
4509 if (count == minval
4510 || regrepeat(OPERAND(scan), 1L) == 0)
4511 break;
4512 ++count;
4513 if (got_int || out_of_stack)
4514 return FALSE;
4515 }
4516 }
4517 return FALSE;
4518 }
4519 /* break; Not Reached */
4520
4521 case NOMATCH:
4522 {
4523 regsave_T save;
4524
4525 /* If the operand matches, we fail. Otherwise backup and
4526 * continue with the next item. */
4527 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004528 if (regmatch(OPERAND(scan), startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004529 return FALSE;
4530 reg_restore(&save);
4531 }
4532 break;
4533
4534 case MATCH:
4535 case SUBPAT:
4536 {
4537 regsave_T save;
4538
4539 /* If the operand doesn't match, we fail. Otherwise backup
4540 * and continue with the next item. */
4541 reg_save(&save);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004542 if (!regmatch(OPERAND(scan), startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004543 return FALSE;
4544 if (op == MATCH) /* zero-width */
4545 reg_restore(&save);
4546 }
4547 break;
4548
4549 case BEHIND:
4550 case NOBEHIND:
4551 {
4552 regsave_T save_after, save_start;
4553 regsave_T save_behind_pos;
4554 int needmatch = (op == BEHIND);
4555
4556 /*
4557 * Look back in the input to see if the operand matches or not. This
4558 * must be done at every position in the input and checking if
4559 * the match ends at the current position.
4560 * First check if the next item matches, that's probably
4561 * faster.
4562 */
4563 reg_save(&save_start);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004564 if (regmatch(next, startp))
Bram Moolenaar071d4272004-06-13 20:20:40 +00004565 {
4566 /* save the position after the found match for next */
4567 reg_save(&save_after);
4568
4569 /* start looking for a match with operand at the current
4570 * position. Go back one character until we find the
4571 * result, hitting the start of the line or the previous
4572 * line (for multi-line matching).
4573 * Set behind_pos to where the match should end, BHPOS
4574 * will match it. */
4575 save_behind_pos = behind_pos;
4576 behind_pos = save_start;
4577 for (;;)
4578 {
4579 reg_restore(&save_start);
Bram Moolenaardf177f62005-02-22 08:39:57 +00004580 if (regmatch(OPERAND(scan), startp)
Bram Moolenaar071d4272004-06-13 20:20:40 +00004581 && reg_save_equal(&behind_pos))
4582 {
4583 behind_pos = save_behind_pos;
4584 /* found a match that ends where "next" started */
4585 if (needmatch)
4586 {
4587 reg_restore(&save_after);
4588 return TRUE;
4589 }
4590 return FALSE;
4591 }
4592 /*
4593 * No match: Go back one character. May go to
4594 * previous line once.
 * NOTE(review): the column/pointer is decremented by one,
 * which looks like one byte rather than one character;
 * confirm this is intended for multi-byte text.
4595 */
4596 if (REG_MULTI)
4597 {
4598 if (save_start.rs_u.pos.col == 0)
4599 {
4600 if (save_start.rs_u.pos.lnum
4601 < behind_pos.rs_u.pos.lnum
4602 || reg_getline(
4603 --save_start.rs_u.pos.lnum) == NULL)
4604 break;
4605 reg_restore(&save_start);
4606 save_start.rs_u.pos.col =
4607 (colnr_T)STRLEN(regline);
4608 }
4609 else
4610 --save_start.rs_u.pos.col;
4611 }
4612 else
4613 {
4614 if (save_start.rs_u.ptr == regline)
4615 break;
4616 --save_start.rs_u.ptr;
4617 }
4618 }
4619
4620 /* NOBEHIND succeeds when no match was found */
4621 behind_pos = save_behind_pos;
4622 if (!needmatch)
4623 {
4624 reg_restore(&save_after);
4625 return TRUE;
4626 }
4627 }
4628 return FALSE;
4629 }
4630
4631 case BHPOS:
4632 if (REG_MULTI)
4633 {
4634 if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline)
4635 || behind_pos.rs_u.pos.lnum != reglnum)
4636 return FALSE;
4637 }
4638 else if (behind_pos.rs_u.ptr != reginput)
4639 return FALSE;
4640 break;
4641
4642 case NEWL:
4643 if ((c != NUL || reglnum == reg_maxline)
4644 && (c != '\n' || !reg_line_lbr))
4645 return FALSE;
4646 if (reg_line_lbr)
4647 ADVANCE_REGINPUT();
4648 else
4649 reg_nextline();
4650 break;
4651
4652 case END:
4653 return TRUE; /* Success! */
4654
4655 default:
4656 EMSG(_(e_re_corr));
4657#ifdef DEBUG
4658 printf("Illegal op code %d\n", op);
4659#endif
4660 return FALSE;
4661 }
4662 }
4663
4664 scan = next;
4665 }
4666
4667 /*
4668 * We get here only if there's trouble -- normally "case END" is the
4669 * terminating point.
4670 */
4671 EMSG(_(e_re_corr));
4672#ifdef DEBUG
4673 printf("Premature EOL\n");
4674#endif
4675 return FALSE;
4676}
4677
Bram Moolenaar071d4272004-06-13 20:20:40 +00004678/*
4679 * regrepeat - repeatedly match something simple, return how many.
4680 * Advances reginput (and reglnum) to just after the matched chars.
4681 */
/*
 * Parameters:
 *   p        - node of the compiled program holding the simple item to
 *              repeat; its opcode selects one of the loops below.
 *   maxcount - upper limit for the number of matches.
 * Returns the number of matches found, starting at the global "reginput".
 * On return "reginput" is set to the position just after the last match.
 * For opcodes that accept a line break reg_nextline() may have been called,
 * which moves on to the next line.
 */
4682 static int
4683regrepeat(p, maxcount)
4684 char_u *p;
4685 long maxcount; /* maximum number of matches allowed */
4686{
4687 long count = 0;
4688 char_u *scan;
4689 char_u *opnd;
/* "mask" and "testval" are used by the do_class loop: a character matches
 * when (class_tab[c] & mask) == testval.  "testval" is also used as a plain
 * flag by the IDENT/KWORD/FNAME/PRINT and ANYOF/ANYBUT cases. */
4690 int mask;
4691 int testval = 0;
4692
4693 scan = reginput; /* Make local copy of reginput for speed. */
4694 opnd = OPERAND(p);
4695 switch (OP(p))
4696 {
4697 case ANY:
4698 case ANY + ADD_NL:
4699 while (count < maxcount)
4700 {
4701 /* Matching anything means we continue until end-of-line (or
4702 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
4703 while (*scan != NUL && count < maxcount)
4704 {
4705 ++count;
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004706 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004707 }
/* Stop at the end of the line unless this opcode accepts a line break,
 * there is a following line and maxcount was not reached yet. */
4708 if (!WITH_NL(OP(p)) || reglnum == reg_maxline || count == maxcount)
4709 break;
4710 ++count; /* count the line-break */
4711 reg_nextline();
4712 scan = reginput;
/* give up when got_int is set */
4713 if (got_int)
4714 break;
4715 }
4716 break;
4717
/* For IDENT, KWORD, FNAME and PRINT "testval" is set to TRUE, which lets
 * digits match.  The S* variants are entered with "testval" still zero, so
 * that a digit ends the repetition. */
4718 case IDENT:
4719 case IDENT + ADD_NL:
4720 testval = TRUE;
4721 /*FALLTHROUGH*/
4722 case SIDENT:
4723 case SIDENT + ADD_NL:
4724 while (count < maxcount)
4725 {
4726 if (vim_isIDc(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4727 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004728 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004729 }
4730 else if (*scan == NUL)
4731 {
/* End of line: only continue in the next line when the opcode accepts a
 * line break; the line break is counted as one match below. */
4732 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4733 break;
4734 reg_nextline();
4735 scan = reginput;
4736 if (got_int)
4737 break;
4738 }
/* With "reg_line_lbr" set a '\n' in the text is accepted as the line break */
4739 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4740 ++scan;
4741 else
4742 break;
4743 ++count;
4744 }
4745 break;
4746
4747 case KWORD:
4748 case KWORD + ADD_NL:
4749 testval = TRUE;
4750 /*FALLTHROUGH*/
4751 case SKWORD:
4752 case SKWORD + ADD_NL:
4753 while (count < maxcount)
4754 {
4755 if (vim_iswordp(scan) && (testval || !VIM_ISDIGIT(*scan)))
4756 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004757 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004758 }
4759 else if (*scan == NUL)
4760 {
4761 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4762 break;
4763 reg_nextline();
4764 scan = reginput;
4765 if (got_int)
4766 break;
4767 }
4768 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4769 ++scan;
4770 else
4771 break;
4772 ++count;
4773 }
4774 break;
4775
4776 case FNAME:
4777 case FNAME + ADD_NL:
4778 testval = TRUE;
4779 /*FALLTHROUGH*/
4780 case SFNAME:
4781 case SFNAME + ADD_NL:
4782 while (count < maxcount)
4783 {
4784 if (vim_isfilec(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4785 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004786 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004787 }
4788 else if (*scan == NUL)
4789 {
4790 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4791 break;
4792 reg_nextline();
4793 scan = reginput;
4794 if (got_int)
4795 break;
4796 }
4797 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4798 ++scan;
4799 else
4800 break;
4801 ++count;
4802 }
4803 break;
4804
4805 case PRINT:
4806 case PRINT + ADD_NL:
4807 testval = TRUE;
4808 /*FALLTHROUGH*/
4809 case SPRINT:
4810 case SPRINT + ADD_NL:
4811 while (count < maxcount)
4812 {
/* Unlike the loops above the NUL check comes first here */
4813 if (*scan == NUL)
4814 {
4815 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4816 break;
4817 reg_nextline();
4818 scan = reginput;
4819 if (got_int)
4820 break;
4821 }
/* a character matches when ptr2cells() reports exactly one cell for it */
4822 else if (ptr2cells(scan) == 1 && (testval || !VIM_ISDIGIT(*scan)))
4823 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004824 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004825 }
4826 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4827 ++scan;
4828 else
4829 break;
4830 ++count;
4831 }
4832 break;
4833
/* Character classes looked up in class_tab[].  The positive classes set
 * "testval" equal to "mask", so the class bit must be set.  The negated
 * classes (N*) only set "mask" and leave "testval" zero, so the class bit
 * must be clear.  All of them share the loop at the do_class label. */
4834 case WHITE:
4835 case WHITE + ADD_NL:
4836 testval = mask = RI_WHITE;
4837do_class:
4838 while (count < maxcount)
4839 {
4840#ifdef FEAT_MBYTE
4841 int l;
4842#endif
4843 if (*scan == NUL)
4844 {
4845 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4846 break;
4847 reg_nextline();
4848 scan = reginput;
4849 if (got_int)
4850 break;
4851 }
4852#ifdef FEAT_MBYTE
/* A multi-byte character is not looked up in class_tab[]: it ends the
 * repetition for a positive class and is skipped as one match for a
 * negated class. */
4853 else if (has_mbyte && (l = (*mb_ptr2len_check)(scan)) > 1)
4854 {
4855 if (testval != 0)
4856 break;
4857 scan += l;
4858 }
4859#endif
4860 else if ((class_tab[*scan] & mask) == testval)
4861 ++scan;
4862 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4863 ++scan;
4864 else
4865 break;
4866 ++count;
4867 }
4868 break;
4869
4870 case NWHITE:
4871 case NWHITE + ADD_NL:
4872 mask = RI_WHITE;
4873 goto do_class;
4874 case DIGIT:
4875 case DIGIT + ADD_NL:
4876 testval = mask = RI_DIGIT;
4877 goto do_class;
4878 case NDIGIT:
4879 case NDIGIT + ADD_NL:
4880 mask = RI_DIGIT;
4881 goto do_class;
4882 case HEX:
4883 case HEX + ADD_NL:
4884 testval = mask = RI_HEX;
4885 goto do_class;
4886 case NHEX:
4887 case NHEX + ADD_NL:
4888 mask = RI_HEX;
4889 goto do_class;
4890 case OCTAL:
4891 case OCTAL + ADD_NL:
4892 testval = mask = RI_OCTAL;
4893 goto do_class;
4894 case NOCTAL:
4895 case NOCTAL + ADD_NL:
4896 mask = RI_OCTAL;
4897 goto do_class;
4898 case WORD:
4899 case WORD + ADD_NL:
4900 testval = mask = RI_WORD;
4901 goto do_class;
4902 case NWORD:
4903 case NWORD + ADD_NL:
4904 mask = RI_WORD;
4905 goto do_class;
4906 case HEAD:
4907 case HEAD + ADD_NL:
4908 testval = mask = RI_HEAD;
4909 goto do_class;
4910 case NHEAD:
4911 case NHEAD + ADD_NL:
4912 mask = RI_HEAD;
4913 goto do_class;
4914 case ALPHA:
4915 case ALPHA + ADD_NL:
4916 testval = mask = RI_ALPHA;
4917 goto do_class;
4918 case NALPHA:
4919 case NALPHA + ADD_NL:
4920 mask = RI_ALPHA;
4921 goto do_class;
4922 case LOWER:
4923 case LOWER + ADD_NL:
4924 testval = mask = RI_LOWER;
4925 goto do_class;
4926 case NLOWER:
4927 case NLOWER + ADD_NL:
4928 mask = RI_LOWER;
4929 goto do_class;
4930 case UPPER:
4931 case UPPER + ADD_NL:
4932 testval = mask = RI_UPPER;
4933 goto do_class;
4934 case NUPPER:
4935 case NUPPER + ADD_NL:
4936 mask = RI_UPPER;
4937 goto do_class;
4938
/* Repeat the single byte at "opnd"; never moves to another line. */
4939 case EXACTLY:
4940 {
4941 int cu, cl;
4942
4943 /* This doesn't do a multi-byte character, because a MULTIBYTECODE
4944 * would have been used for it. */
4945 if (ireg_ic)
4946 {
/* ignoring case: accept both the upper and the lower case form */
4947 cu = TOUPPER_LOC(*opnd);
4948 cl = TOLOWER_LOC(*opnd);
4949 while (count < maxcount && (*scan == cu || *scan == cl))
4950 {
4951 count++;
4952 scan++;
4953 }
4954 }
4955 else
4956 {
4957 cu = *opnd;
4958 while (count < maxcount && *scan == cu)
4959 {
4960 count++;
4961 scan++;
4962 }
4963 }
4964 break;
4965 }
4966
4967#ifdef FEAT_MBYTE
/* Repeat the multi-byte character at "opnd". */
4968 case MULTIBYTECODE:
4969 {
4970 int i, len, cf = 0;
4971
4972 /* Safety check (just in case 'encoding' was changed since
4973 * compiling the program). */
4974 if ((len = (*mb_ptr2len_check)(opnd)) > 1)
4975 {
/* "cf" is the case-folded operand, only used for UTF-8 with ignore-case */
4976 if (ireg_ic && enc_utf8)
4977 cf = utf_fold(utf_ptr2char(opnd));
4978 while (count < maxcount)
4979 {
/* compare byte by byte first; "i" ends up equal to "len" on a full match */
4980 for (i = 0; i < len; ++i)
4981 if (opnd[i] != scan[i])
4982 break;
/* bytes differ: only a match when the case-folded characters are equal */
4983 if (i < len && (!ireg_ic || !enc_utf8
4984 || utf_fold(utf_ptr2char(scan)) != cf))
4985 break;
4986 scan += len;
4987 ++count;
4988 }
4989 }
4990 }
4991 break;
4992#endif
4993
/* Character collections: "opnd" is searched with cstrchr().  ANYOF sets
 * "testval", so a character that is not found ends the repetition; for
 * ANYBUT "testval" is zero and a character that is found ends it. */
4994 case ANYOF:
4995 case ANYOF + ADD_NL:
4996 testval = TRUE;
4997 /*FALLTHROUGH*/
4998
4999 case ANYBUT:
5000 case ANYBUT + ADD_NL:
5001 while (count < maxcount)
5002 {
5003#ifdef FEAT_MBYTE
5004 int len;
5005#endif
5006 if (*scan == NUL)
5007 {
5008 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
5009 break;
5010 reg_nextline();
5011 scan = reginput;
5012 if (got_int)
5013 break;
5014 }
5015 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
5016 ++scan;
5017#ifdef FEAT_MBYTE
5018 else if (has_mbyte && (len = (*mb_ptr2len_check)(scan)) > 1)
5019 {
5020 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
5021 break;
5022 scan += len;
5023 }
5024#endif
5025 else
5026 {
5027 if ((cstrchr(opnd, *scan) == NULL) == testval)
5028 break;
5029 ++scan;
5030 }
5031 ++count;
5032 }
5033 break;
5034
/* Repeat a line break: either the end of a line that is followed by another
 * line, or a '\n' in the text when "reg_line_lbr" is set. */
5035 case NEWL:
5036 while (count < maxcount
5037 && ((*scan == NUL && reglnum < reg_maxline)
5038 || (*scan == '\n' && reg_line_lbr)))
5039 {
5040 count++;
5041 if (reg_line_lbr)
5042 ADVANCE_REGINPUT();
5043 else
5044 reg_nextline();
5045 scan = reginput;
5046 if (got_int)
5047 break;
5048 }
5049 break;
5050
5051 default: /* Oh dear. Called inappropriately. */
5052 EMSG(_(e_re_corr));
5053#ifdef DEBUG
5054 printf("Called regrepeat with op code %d\n", OP(p));
5055#endif
5056 break;
5057 }
5058
/* publish the local position; callers read "reginput" */
5059 reginput = scan;
5060
5061 return (int)count;
5062}
5063
5064/*
5065 * regnext - dig the "next" pointer out of a node
5066 */
5067 static char_u *
5068regnext(p)
5069 char_u *p;
5070{
5071 int offset;
5072
5073 if (p == JUST_CALC_SIZE)
5074 return NULL;
5075
5076 offset = NEXT(p);
5077 if (offset == 0)
5078 return NULL;
5079
5080 if (OP(p) == BACK)
5081 return p - offset;
5082 else
5083 return p + offset;
5084}
5085
5086/*
5087 * Check the regexp program for its magic number.
5088 * Return TRUE if it's wrong.
5089 */
5090 static int
5091prog_magic_wrong()
5092{
5093 if (UCHARAT(REG_MULTI
5094 ? reg_mmatch->regprog->program
5095 : reg_match->regprog->program) != REGMAGIC)
5096 {
5097 EMSG(_(e_re_corr));
5098 return TRUE;
5099 }
5100 return FALSE;
5101}
5102
5103/*
5104 * Cleanup the subexpressions, if this wasn't done yet.
5105 * This construction is used to clear the subexpressions only when they are
5106 * used (to increase speed).
5107 */
5108 static void
5109cleanup_subexpr()
5110{
5111 if (need_clear_subexpr)
5112 {
5113 if (REG_MULTI)
5114 {
5115 /* Use 0xff to set lnum to -1 */
5116 vim_memset(reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5117 vim_memset(reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5118 }
5119 else
5120 {
5121 vim_memset(reg_startp, 0, sizeof(char_u *) * NSUBEXP);
5122 vim_memset(reg_endp, 0, sizeof(char_u *) * NSUBEXP);
5123 }
5124 need_clear_subexpr = FALSE;
5125 }
5126}
5127
5128#ifdef FEAT_SYN_HL
5129 static void
5130cleanup_zsubexpr()
5131{
5132 if (need_clear_zsubexpr)
5133 {
5134 if (REG_MULTI)
5135 {
5136 /* Use 0xff to set lnum to -1 */
5137 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5138 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5139 }
5140 else
5141 {
5142 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
5143 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
5144 }
5145 need_clear_zsubexpr = FALSE;
5146 }
5147}
5148#endif
5149
5150/*
5151 * Advance reglnum, regline and reginput to the next line.
5152 */
5153 static void
5154reg_nextline()
5155{
5156 regline = reg_getline(++reglnum);
5157 reginput = regline;
5158 fast_breakcheck();
5159}
5160
5161/*
5162 * Save the input line and position in a regsave_T.
5163 */
5164 static void
5165reg_save(save)
5166 regsave_T *save;
5167{
5168 if (REG_MULTI)
5169 {
5170 save->rs_u.pos.col = (colnr_T)(reginput - regline);
5171 save->rs_u.pos.lnum = reglnum;
5172 }
5173 else
5174 save->rs_u.ptr = reginput;
5175}
5176
5177/*
5178 * Restore the input line and position from a regsave_T.
5179 */
5180 static void
5181reg_restore(save)
5182 regsave_T *save;
5183{
5184 if (REG_MULTI)
5185 {
5186 if (reglnum != save->rs_u.pos.lnum)
5187 {
5188 /* only call reg_getline() when the line number changed to save
5189 * a bit of time */
5190 reglnum = save->rs_u.pos.lnum;
5191 regline = reg_getline(reglnum);
5192 }
5193 reginput = regline + save->rs_u.pos.col;
5194 }
5195 else
5196 reginput = save->rs_u.ptr;
5197}
5198
5199/*
5200 * Return TRUE if current position is equal to saved position.
5201 */
5202 static int
5203reg_save_equal(save)
5204 regsave_T *save;
5205{
5206 if (REG_MULTI)
5207 return reglnum == save->rs_u.pos.lnum
5208 && reginput == regline + save->rs_u.pos.col;
5209 return reginput == save->rs_u.ptr;
5210}
5211
5212/*
5213 * Tentatively set the sub-expression start to the current position (after
5214 * calling regmatch() they will have changed). Need to save the existing
5215 * values for when there is no match.
5216 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
5217 * depending on REG_MULTI.
5218 */
5219 static void
5220save_se_multi(savep, posp)
5221 save_se_T *savep;
5222 lpos_T *posp;
5223{
5224 savep->se_u.pos = *posp;
5225 posp->lnum = reglnum;
5226 posp->col = (colnr_T)(reginput - regline);
5227}
5228
5229 static void
5230save_se_one(savep, pp)
5231 save_se_T *savep;
5232 char_u **pp;
5233{
5234 savep->se_u.ptr = *pp;
5235 *pp = reginput;
5236}
5237
5238/*
5239 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5240 */
5241 static int
5242re_num_cmp(val, scan)
5243 long_u val;
5244 char_u *scan;
5245{
5246 long_u n = OPERAND_MIN(scan);
5247
5248 if (OPERAND_CMP(scan) == '>')
5249 return val > n;
5250 if (OPERAND_CMP(scan) == '<')
5251 return val < n;
5252 return val == n;
5253}
5254
5255
5256#ifdef DEBUG
5257
5258/*
5259 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5260 */
5261 static void
5262regdump(pattern, r)
5263 char_u *pattern;
5264 regprog_T *r;
5265{
5266 char_u *s;
5267 int op = EXACTLY; /* Arbitrary non-END op. */
5268 char_u *next;
5269 char_u *end = NULL;
5270
5271 printf("\r\nregcomp(%s):\r\n", pattern);
5272
5273 s = r->program + 1;
5274 /*
5275 * Loop until we find the END that isn't before a referred next (an END
5276 * can also appear in a NOMATCH operand).
5277 */
5278 while (op != END || s <= end)
5279 {
5280 op = OP(s);
5281 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
5282 next = regnext(s);
5283 if (next == NULL) /* Next ptr. */
5284 printf("(0)");
5285 else
5286 printf("(%d)", (int)((s - r->program) + (next - s)));
5287 if (end < next)
5288 end = next;
5289 if (op == BRACE_LIMITS)
5290 {
5291 /* Two short ints */
5292 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5293 s += 8;
5294 }
5295 s += 3;
5296 if (op == ANYOF || op == ANYOF + ADD_NL
5297 || op == ANYBUT || op == ANYBUT + ADD_NL
5298 || op == EXACTLY)
5299 {
5300 /* Literal string, where present. */
5301 while (*s != NUL)
5302 printf("%c", *s++);
5303 s++;
5304 }
5305 printf("\r\n");
5306 }
5307
5308 /* Header fields of interest. */
5309 if (r->regstart != NUL)
5310 printf("start `%s' 0x%x; ", r->regstart < 256
5311 ? (char *)transchar(r->regstart)
5312 : "multibyte", r->regstart);
5313 if (r->reganch)
5314 printf("anchored; ");
5315 if (r->regmust != NULL)
5316 printf("must have \"%s\"", r->regmust);
5317 printf("\r\n");
5318}
5319
5320/*
5321 * regprop - printable representation of opcode
5322 */
5323 static char_u *
5324regprop(op)
5325 char_u *op;
5326{
5327 char_u *p;
5328 static char_u buf[50];
5329
5330 (void) strcpy(buf, ":");
5331
5332 switch (OP(op))
5333 {
5334 case BOL:
5335 p = "BOL";
5336 break;
5337 case EOL:
5338 p = "EOL";
5339 break;
5340 case RE_BOF:
5341 p = "BOF";
5342 break;
5343 case RE_EOF:
5344 p = "EOF";
5345 break;
5346 case CURSOR:
5347 p = "CURSOR";
5348 break;
5349 case RE_LNUM:
5350 p = "RE_LNUM";
5351 break;
5352 case RE_COL:
5353 p = "RE_COL";
5354 break;
5355 case RE_VCOL:
5356 p = "RE_VCOL";
5357 break;
5358 case BOW:
5359 p = "BOW";
5360 break;
5361 case EOW:
5362 p = "EOW";
5363 break;
5364 case ANY:
5365 p = "ANY";
5366 break;
5367 case ANY + ADD_NL:
5368 p = "ANY+NL";
5369 break;
5370 case ANYOF:
5371 p = "ANYOF";
5372 break;
5373 case ANYOF + ADD_NL:
5374 p = "ANYOF+NL";
5375 break;
5376 case ANYBUT:
5377 p = "ANYBUT";
5378 break;
5379 case ANYBUT + ADD_NL:
5380 p = "ANYBUT+NL";
5381 break;
5382 case IDENT:
5383 p = "IDENT";
5384 break;
5385 case IDENT + ADD_NL:
5386 p = "IDENT+NL";
5387 break;
5388 case SIDENT:
5389 p = "SIDENT";
5390 break;
5391 case SIDENT + ADD_NL:
5392 p = "SIDENT+NL";
5393 break;
5394 case KWORD:
5395 p = "KWORD";
5396 break;
5397 case KWORD + ADD_NL:
5398 p = "KWORD+NL";
5399 break;
5400 case SKWORD:
5401 p = "SKWORD";
5402 break;
5403 case SKWORD + ADD_NL:
5404 p = "SKWORD+NL";
5405 break;
5406 case FNAME:
5407 p = "FNAME";
5408 break;
5409 case FNAME + ADD_NL:
5410 p = "FNAME+NL";
5411 break;
5412 case SFNAME:
5413 p = "SFNAME";
5414 break;
5415 case SFNAME + ADD_NL:
5416 p = "SFNAME+NL";
5417 break;
5418 case PRINT:
5419 p = "PRINT";
5420 break;
5421 case PRINT + ADD_NL:
5422 p = "PRINT+NL";
5423 break;
5424 case SPRINT:
5425 p = "SPRINT";
5426 break;
5427 case SPRINT + ADD_NL:
5428 p = "SPRINT+NL";
5429 break;
5430 case WHITE:
5431 p = "WHITE";
5432 break;
5433 case WHITE + ADD_NL:
5434 p = "WHITE+NL";
5435 break;
5436 case NWHITE:
5437 p = "NWHITE";
5438 break;
5439 case NWHITE + ADD_NL:
5440 p = "NWHITE+NL";
5441 break;
5442 case DIGIT:
5443 p = "DIGIT";
5444 break;
5445 case DIGIT + ADD_NL:
5446 p = "DIGIT+NL";
5447 break;
5448 case NDIGIT:
5449 p = "NDIGIT";
5450 break;
5451 case NDIGIT + ADD_NL:
5452 p = "NDIGIT+NL";
5453 break;
5454 case HEX:
5455 p = "HEX";
5456 break;
5457 case HEX + ADD_NL:
5458 p = "HEX+NL";
5459 break;
5460 case NHEX:
5461 p = "NHEX";
5462 break;
5463 case NHEX + ADD_NL:
5464 p = "NHEX+NL";
5465 break;
5466 case OCTAL:
5467 p = "OCTAL";
5468 break;
5469 case OCTAL + ADD_NL:
5470 p = "OCTAL+NL";
5471 break;
5472 case NOCTAL:
5473 p = "NOCTAL";
5474 break;
5475 case NOCTAL + ADD_NL:
5476 p = "NOCTAL+NL";
5477 break;
5478 case WORD:
5479 p = "WORD";
5480 break;
5481 case WORD + ADD_NL:
5482 p = "WORD+NL";
5483 break;
5484 case NWORD:
5485 p = "NWORD";
5486 break;
5487 case NWORD + ADD_NL:
5488 p = "NWORD+NL";
5489 break;
5490 case HEAD:
5491 p = "HEAD";
5492 break;
5493 case HEAD + ADD_NL:
5494 p = "HEAD+NL";
5495 break;
5496 case NHEAD:
5497 p = "NHEAD";
5498 break;
5499 case NHEAD + ADD_NL:
5500 p = "NHEAD+NL";
5501 break;
5502 case ALPHA:
5503 p = "ALPHA";
5504 break;
5505 case ALPHA + ADD_NL:
5506 p = "ALPHA+NL";
5507 break;
5508 case NALPHA:
5509 p = "NALPHA";
5510 break;
5511 case NALPHA + ADD_NL:
5512 p = "NALPHA+NL";
5513 break;
5514 case LOWER:
5515 p = "LOWER";
5516 break;
5517 case LOWER + ADD_NL:
5518 p = "LOWER+NL";
5519 break;
5520 case NLOWER:
5521 p = "NLOWER";
5522 break;
5523 case NLOWER + ADD_NL:
5524 p = "NLOWER+NL";
5525 break;
5526 case UPPER:
5527 p = "UPPER";
5528 break;
5529 case UPPER + ADD_NL:
5530 p = "UPPER+NL";
5531 break;
5532 case NUPPER:
5533 p = "NUPPER";
5534 break;
5535 case NUPPER + ADD_NL:
5536 p = "NUPPER+NL";
5537 break;
5538 case BRANCH:
5539 p = "BRANCH";
5540 break;
5541 case EXACTLY:
5542 p = "EXACTLY";
5543 break;
5544 case NOTHING:
5545 p = "NOTHING";
5546 break;
5547 case BACK:
5548 p = "BACK";
5549 break;
5550 case END:
5551 p = "END";
5552 break;
5553 case MOPEN + 0:
5554 p = "MATCH START";
5555 break;
5556 case MOPEN + 1:
5557 case MOPEN + 2:
5558 case MOPEN + 3:
5559 case MOPEN + 4:
5560 case MOPEN + 5:
5561 case MOPEN + 6:
5562 case MOPEN + 7:
5563 case MOPEN + 8:
5564 case MOPEN + 9:
5565 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5566 p = NULL;
5567 break;
5568 case MCLOSE + 0:
5569 p = "MATCH END";
5570 break;
5571 case MCLOSE + 1:
5572 case MCLOSE + 2:
5573 case MCLOSE + 3:
5574 case MCLOSE + 4:
5575 case MCLOSE + 5:
5576 case MCLOSE + 6:
5577 case MCLOSE + 7:
5578 case MCLOSE + 8:
5579 case MCLOSE + 9:
5580 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5581 p = NULL;
5582 break;
5583 case BACKREF + 1:
5584 case BACKREF + 2:
5585 case BACKREF + 3:
5586 case BACKREF + 4:
5587 case BACKREF + 5:
5588 case BACKREF + 6:
5589 case BACKREF + 7:
5590 case BACKREF + 8:
5591 case BACKREF + 9:
5592 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5593 p = NULL;
5594 break;
5595 case NOPEN:
5596 p = "NOPEN";
5597 break;
5598 case NCLOSE:
5599 p = "NCLOSE";
5600 break;
5601#ifdef FEAT_SYN_HL
5602 case ZOPEN + 1:
5603 case ZOPEN + 2:
5604 case ZOPEN + 3:
5605 case ZOPEN + 4:
5606 case ZOPEN + 5:
5607 case ZOPEN + 6:
5608 case ZOPEN + 7:
5609 case ZOPEN + 8:
5610 case ZOPEN + 9:
5611 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5612 p = NULL;
5613 break;
5614 case ZCLOSE + 1:
5615 case ZCLOSE + 2:
5616 case ZCLOSE + 3:
5617 case ZCLOSE + 4:
5618 case ZCLOSE + 5:
5619 case ZCLOSE + 6:
5620 case ZCLOSE + 7:
5621 case ZCLOSE + 8:
5622 case ZCLOSE + 9:
5623 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5624 p = NULL;
5625 break;
5626 case ZREF + 1:
5627 case ZREF + 2:
5628 case ZREF + 3:
5629 case ZREF + 4:
5630 case ZREF + 5:
5631 case ZREF + 6:
5632 case ZREF + 7:
5633 case ZREF + 8:
5634 case ZREF + 9:
5635 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5636 p = NULL;
5637 break;
5638#endif
5639 case STAR:
5640 p = "STAR";
5641 break;
5642 case PLUS:
5643 p = "PLUS";
5644 break;
5645 case NOMATCH:
5646 p = "NOMATCH";
5647 break;
5648 case MATCH:
5649 p = "MATCH";
5650 break;
5651 case BEHIND:
5652 p = "BEHIND";
5653 break;
5654 case NOBEHIND:
5655 p = "NOBEHIND";
5656 break;
5657 case SUBPAT:
5658 p = "SUBPAT";
5659 break;
5660 case BRACE_LIMITS:
5661 p = "BRACE_LIMITS";
5662 break;
5663 case BRACE_SIMPLE:
5664 p = "BRACE_SIMPLE";
5665 break;
5666 case BRACE_COMPLEX + 0:
5667 case BRACE_COMPLEX + 1:
5668 case BRACE_COMPLEX + 2:
5669 case BRACE_COMPLEX + 3:
5670 case BRACE_COMPLEX + 4:
5671 case BRACE_COMPLEX + 5:
5672 case BRACE_COMPLEX + 6:
5673 case BRACE_COMPLEX + 7:
5674 case BRACE_COMPLEX + 8:
5675 case BRACE_COMPLEX + 9:
5676 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5677 p = NULL;
5678 break;
5679#ifdef FEAT_MBYTE
5680 case MULTIBYTECODE:
5681 p = "MULTIBYTECODE";
5682 break;
5683#endif
5684 case NEWL:
5685 p = "NEWL";
5686 break;
5687 default:
5688 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5689 p = NULL;
5690 break;
5691 }
5692 if (p != NULL)
5693 (void) strcat(buf, p);
5694 return buf;
5695}
5696#endif
5697
5698#ifdef FEAT_MBYTE
5699static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
5700
/*
 * One entry of the decomposition table: the characters that a composed
 * character is replaced with.  Unused trailing members are zero.
 */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decomposition of the characters 0xfb20 - 0xfb4f.
 * The entry for character "c" is at index "c - 0xfb20".  A code point marked
 * UNUSED maps to itself.
 */
decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},                /* 0xfb20       alt ayin */
    {0x5d0,0,0},                /* 0xfb21       alt alef */
    {0x5d3,0,0},                /* 0xfb22       alt dalet */
    {0x5d4,0,0},                /* 0xfb23       alt he */
    {0x5db,0,0},                /* 0xfb24       alt kaf */
    {0x5dc,0,0},                /* 0xfb25       alt lamed */
    {0x5dd,0,0},                /* 0xfb26       alt mem-sofit */
    {0x5e8,0,0},                /* 0xfb27       alt resh */
    {0x5ea,0,0},                /* 0xfb28       alt tav */
    {'+', 0, 0},                /* 0xfb29       alt plus */
    {0x5e9, 0x5c1, 0},          /* 0xfb2a       shin+shin-dot */
    {0x5e9, 0x5c2, 0},          /* 0xfb2b       shin+sin-dot */
    {0x5e9, 0x5c1, 0x5bc},      /* 0xfb2c       shin+shin-dot+dagesh */
    {0x5e9, 0x5c2, 0x5bc},      /* 0xfb2d       shin+sin-dot+dagesh */
    {0x5d0, 0x5b7, 0},          /* 0xfb2e       alef+patah */
    {0x5d0, 0x5b8, 0},          /* 0xfb2f       alef+qamats */
    {0x5d0, 0x5b4, 0},          /* 0xfb30       alef+hiriq */
    {0x5d1, 0x5bc, 0},          /* 0xfb31       bet+dagesh */
    {0x5d2, 0x5bc, 0},          /* 0xfb32       gimel+dagesh */
    {0x5d3, 0x5bc, 0},          /* 0xfb33       dalet+dagesh */
    {0x5d4, 0x5bc, 0},          /* 0xfb34       he+dagesh */
    {0x5d5, 0x5bc, 0},          /* 0xfb35       vav+dagesh */
    {0x5d6, 0x5bc, 0},          /* 0xfb36       zayin+dagesh */
    {0xfb37, 0, 0},             /* 0xfb37 -- UNUSED */
    {0x5d8, 0x5bc, 0},          /* 0xfb38       tet+dagesh */
    {0x5d9, 0x5bc, 0},          /* 0xfb39       yud+dagesh */
    {0x5da, 0x5bc, 0},          /* 0xfb3a       kaf sofit+dagesh */
    {0x5db, 0x5bc, 0},          /* 0xfb3b       kaf+dagesh */
    {0x5dc, 0x5bc, 0},          /* 0xfb3c       lamed+dagesh */
    {0xfb3d, 0, 0},             /* 0xfb3d -- UNUSED */
    {0x5de, 0x5bc, 0},          /* 0xfb3e       mem+dagesh */
    {0xfb3f, 0, 0},             /* 0xfb3f -- UNUSED */
    {0x5e0, 0x5bc, 0},          /* 0xfb40       nun+dagesh */
    {0x5e1, 0x5bc, 0},          /* 0xfb41       samech+dagesh */
    {0xfb42, 0, 0},             /* 0xfb42 -- UNUSED */
    {0x5e3, 0x5bc, 0},          /* 0xfb43       pe sofit+dagesh */
    {0x5e4, 0x5bc,0},           /* 0xfb44       pe+dagesh */
    {0xfb45, 0, 0},             /* 0xfb45 -- UNUSED */
    {0x5e6, 0x5bc, 0},          /* 0xfb46       tsadi+dagesh */
    {0x5e7, 0x5bc, 0},          /* 0xfb47       qof+dagesh */
    {0x5e8, 0x5bc, 0},          /* 0xfb48       resh+dagesh */
    {0x5e9, 0x5bc, 0},          /* 0xfb49       shin+dagesh */
    {0x5ea, 0x5bc, 0},          /* 0xfb4a       tav+dagesh */
    {0x5d5, 0x5b9, 0},          /* 0xfb4b       vav+holam */
    {0x5d1, 0x5bf, 0},          /* 0xfb4c       bet+rafe */
    {0x5db, 0x5bf, 0},          /* 0xfb4d       kaf+rafe */
    {0x5e4, 0x5bf, 0},          /* 0xfb4e       pe+rafe */
    {0x5d0, 0x5dc, 0}           /* 0xfb4f       alef-lamed */
};

/*
 * Decompose character "c" into at most three characters, which are stored
 * in "*c1", "*c2" and "*c3".
 * Only the characters 0xfb20 - 0xfb4f are in decomp_table[].  Any other
 * character is returned unchanged in "*c1", with "*c2" and "*c3" set to
 * zero.
 */
    static void
mb_decompose(c, c1, c2, c3)
    int c, *c1, *c2, *c3;
{
    decomp_T d;

    /* The lower bound must be the first character of the table: a smaller
     * value would result in a negative index. */
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
        d = decomp_table[c - 0xfb20];
        *c1 = d.a;
        *c2 = d.b;
        *c3 = d.c;
    }
    else
    {
        *c1 = c;
        *c2 = *c3 = 0;
    }
}
5779#endif
5780
5781/*
5782 * Compare two strings, ignore case if ireg_ic set.
5783 * Return 0 if strings match, non-zero otherwise.
5784 * Correct the length "*n" when composing characters are ignored.
5785 */
5786 static int
5787cstrncmp(s1, s2, n)
5788 char_u *s1, *s2;
5789 int *n;
5790{
5791 int result;
5792
5793 if (!ireg_ic)
5794 result = STRNCMP(s1, s2, *n);
5795 else
5796 result = MB_STRNICMP(s1, s2, *n);
5797
5798#ifdef FEAT_MBYTE
5799 /* if it failed and it's utf8 and we want to combineignore: */
5800 if (result != 0 && enc_utf8 && ireg_icombine)
5801 {
5802 char_u *str1, *str2;
5803 int c1, c2, c11, c12;
5804 int ix;
5805 int junk;
5806
5807 /* we have to handle the strcmp ourselves, since it is necessary to
5808 * deal with the composing characters by ignoring them: */
5809 str1 = s1;
5810 str2 = s2;
5811 c1 = c2 = 0;
5812 for (ix = 0; ix < *n; )
5813 {
5814 c1 = mb_ptr2char_adv(&str1);
5815 c2 = mb_ptr2char_adv(&str2);
5816 ix += utf_char2len(c1);
5817
5818 /* decompose the character if necessary, into 'base' characters
5819 * because I don't care about Arabic, I will hard-code the Hebrew
5820 * which I *do* care about! So sue me... */
5821 if (c1 != c2 && (!ireg_ic || utf_fold(c1) != utf_fold(c2)))
5822 {
5823 /* decomposition necessary? */
5824 mb_decompose(c1, &c11, &junk, &junk);
5825 mb_decompose(c2, &c12, &junk, &junk);
5826 c1 = c11;
5827 c2 = c12;
5828 if (c11 != c12 && (!ireg_ic || utf_fold(c11) != utf_fold(c12)))
5829 break;
5830 }
5831 }
5832 result = c2 - c1;
5833 if (result == 0)
5834 *n = (int)(str2 - s2);
5835 }
5836#endif
5837
5838 return result;
5839}
5840
5841/*
5842 * cstrchr: This function is used a lot for simple searches, keep it fast!
5843 */
5844 static char_u *
5845cstrchr(s, c)
5846 char_u *s;
5847 int c;
5848{
5849 char_u *p;
5850 int cc;
5851
5852 if (!ireg_ic
5853#ifdef FEAT_MBYTE
5854 || (!enc_utf8 && mb_char2len(c) > 1)
5855#endif
5856 )
5857 return vim_strchr(s, c);
5858
5859 /* tolower() and toupper() can be slow, comparing twice should be a lot
5860 * faster (esp. when using MS Visual C++!).
5861 * For UTF-8 need to use folded case. */
5862#ifdef FEAT_MBYTE
5863 if (enc_utf8 && c > 0x80)
5864 cc = utf_fold(c);
5865 else
5866#endif
5867 if (isupper(c))
5868 cc = TOLOWER_LOC(c);
5869 else if (islower(c))
5870 cc = TOUPPER_LOC(c);
5871 else
5872 return vim_strchr(s, c);
5873
5874#ifdef FEAT_MBYTE
5875 if (has_mbyte)
5876 {
5877 for (p = s; *p != NUL; p += (*mb_ptr2len_check)(p))
5878 {
5879 if (enc_utf8 && c > 0x80)
5880 {
5881 if (utf_fold(utf_ptr2char(p)) == cc)
5882 return p;
5883 }
5884 else if (*p == c || *p == cc)
5885 return p;
5886 }
5887 }
5888 else
5889#endif
5890 /* Faster version for when there are no multi-byte characters. */
5891 for (p = s; *p != NUL; ++p)
5892 if (*p == c || *p == cc)
5893 return p;
5894
5895 return NULL;
5896}
5897
5898/***************************************************************
5899 * regsub stuff *
5900 ***************************************************************/
5901
5902/* This stuff below really confuses cc on an SGI -- webb */
5903#ifdef __sgi
5904# undef __ARGS
5905# define __ARGS(x) ()
5906#endif
5907
5908/*
5909 * We should define fptr as a pointer to a function returning a pointer to
5910 * a function returning a pointer to a function ...
5911 * This is impossible, so we declare a pointer to a function returning a
5912 * pointer to a function returning void. This should work for all compilers.
5913 */
5914typedef void (*(*fptr) __ARGS((char_u *, int)))();
5915
5916static fptr do_upper __ARGS((char_u *, int));
5917static fptr do_Upper __ARGS((char_u *, int));
5918static fptr do_lower __ARGS((char_u *, int));
5919static fptr do_Lower __ARGS((char_u *, int));
5920
5921static int vim_regsub_both __ARGS((char_u *source, char_u *dest, int copy, int magic, int backslash));
5922
5923 static fptr
5924do_upper(d, c)
5925 char_u *d;
5926 int c;
5927{
5928 *d = TOUPPER_LOC(c);
5929
5930 return (fptr)NULL;
5931}
5932
5933 static fptr
5934do_Upper(d, c)
5935 char_u *d;
5936 int c;
5937{
5938 *d = TOUPPER_LOC(c);
5939
5940 return (fptr)do_Upper;
5941}
5942
5943 static fptr
5944do_lower(d, c)
5945 char_u *d;
5946 int c;
5947{
5948 *d = TOLOWER_LOC(c);
5949
5950 return (fptr)NULL;
5951}
5952
5953 static fptr
5954do_Lower(d, c)
5955 char_u *d;
5956 int c;
5957{
5958 *d = TOLOWER_LOC(c);
5959
5960 return (fptr)do_Lower;
5961}
5962
5963/*
5964 * regtilde(): Replace tildes in the pattern by the old pattern.
5965 *
5966 * Short explanation of the tilde: It stands for the previous replacement
5967 * pattern. If that previous pattern also contains a ~ we should go back a
5968 * step further... But we insert the previous pattern into the current one
5969 * and remember that.
5970 * This still does not handle the case where "magic" changes. TODO?
5971 *
5972 * The tildes are parsed once before the first call to vim_regsub().
5973 */
5974 char_u *
5975regtilde(source, magic)
5976 char_u *source;
5977 int magic;
5978{
5979 char_u *newsub = source;
5980 char_u *tmpsub;
5981 char_u *p;
5982 int len;
5983 int prevlen;
5984
5985 for (p = newsub; *p; ++p)
5986 {
5987 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
5988 {
5989 if (reg_prev_sub != NULL)
5990 {
5991 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
5992 prevlen = (int)STRLEN(reg_prev_sub);
5993 tmpsub = alloc((unsigned)(STRLEN(newsub) + prevlen));
5994 if (tmpsub != NULL)
5995 {
5996 /* copy prefix */
5997 len = (int)(p - newsub); /* not including ~ */
5998 mch_memmove(tmpsub, newsub, (size_t)len);
5999 /* interpretate tilde */
6000 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
6001 /* copy postfix */
6002 if (!magic)
6003 ++p; /* back off \ */
6004 STRCPY(tmpsub + len + prevlen, p + 1);
6005
6006 if (newsub != source) /* already allocated newsub */
6007 vim_free(newsub);
6008 newsub = tmpsub;
6009 p = newsub + len + prevlen;
6010 }
6011 }
6012 else if (magic)
6013 STRCPY(p, p + 1); /* remove '~' */
6014 else
6015 STRCPY(p, p + 2); /* remove '\~' */
6016 --p;
6017 }
6018 else
6019 {
6020 if (*p == '\\' && p[1]) /* skip escaped characters */
6021 ++p;
6022#ifdef FEAT_MBYTE
6023 if (has_mbyte)
6024 p += (*mb_ptr2len_check)(p) - 1;
6025#endif
6026 }
6027 }
6028
6029 vim_free(reg_prev_sub);
6030 if (newsub != source) /* newsub was allocated, just keep it */
6031 reg_prev_sub = newsub;
6032 else /* no ~ found, need to save newsub */
6033 reg_prev_sub = vim_strsave(newsub);
6034 return newsub;
6035}
6036
6037#ifdef FEAT_EVAL
/* While this flag is set vim_regsub_both() does not evaluate a substitute
 * string that starts with "\=" as an expression (this can't be done
 * recursively). */
6038static int can_f_submatch = FALSE; /* TRUE when submatch() can be used */
6039
6040/* These pointers are used instead of reg_match and reg_mmatch for
6041 * reg_submatch(). Needed for when the substitution string is an expression
6042 * that contains a call to substitute() and submatch(). */
6043static regmatch_T *submatch_match;
6044static regmmatch_T *submatch_mmatch;
6045#endif
6046
6047#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
6048/*
6049 * vim_regsub() - perform substitutions after a vim_regexec() or
6050 * vim_regexec_multi() match.
6051 *
6052 * If "copy" is TRUE really copy into "dest".
6053 * If "copy" is FALSE nothing is copied, this is just to find out the length
6054 * of the result.
6055 *
6056 * If "backslash" is TRUE, a backslash will be removed later, need to double
6057 * them to keep them, and insert a backslash before a CR to avoid it being
6058 * replaced with a line break later.
6059 *
6060 * Note: The matched text must not change between the call of
6061 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
6062 * references invalid!
6063 *
6064 * Returns the size of the replacement, including terminating NUL.
6065 */
6066 int
6067vim_regsub(rmp, source, dest, copy, magic, backslash)
6068 regmatch_T *rmp;
6069 char_u *source;
6070 char_u *dest;
6071 int copy;
6072 int magic;
6073 int backslash;
6074{
6075 reg_match = rmp;
6076 reg_mmatch = NULL;
6077 reg_maxline = 0;
6078 return vim_regsub_both(source, dest, copy, magic, backslash);
6079}
6080#endif
6081
6082 int
6083vim_regsub_multi(rmp, lnum, source, dest, copy, magic, backslash)
6084 regmmatch_T *rmp;
6085 linenr_T lnum;
6086 char_u *source;
6087 char_u *dest;
6088 int copy;
6089 int magic;
6090 int backslash;
6091{
6092 reg_match = NULL;
6093 reg_mmatch = rmp;
6094 reg_buf = curbuf; /* always works on the current buffer! */
6095 reg_firstlnum = lnum;
6096 reg_maxline = curbuf->b_ml.ml_line_count - lnum;
6097 return vim_regsub_both(source, dest, copy, magic, backslash);
6098}
6099
6100 static int
6101vim_regsub_both(source, dest, copy, magic, backslash)
6102 char_u *source;
6103 char_u *dest;
6104 int copy;
6105 int magic;
6106 int backslash;
6107{
6108 char_u *src;
6109 char_u *dst;
6110 char_u *s;
6111 int c;
6112 int no = -1;
6113 fptr func = (fptr)NULL;
6114 linenr_T clnum = 0; /* init for GCC */
6115 int len = 0; /* init for GCC */
6116#ifdef FEAT_EVAL
6117 static char_u *eval_result = NULL;
6118#endif
6119#ifdef FEAT_MBYTE
6120 int l;
6121#endif
6122
6123
6124 /* Be paranoid... */
6125 if (source == NULL || dest == NULL)
6126 {
6127 EMSG(_(e_null));
6128 return 0;
6129 }
6130 if (prog_magic_wrong())
6131 return 0;
6132 src = source;
6133 dst = dest;
6134
6135 /*
6136 * When the substitute part starts with "\=" evaluate it as an expression.
6137 */
6138 if (source[0] == '\\' && source[1] == '='
6139#ifdef FEAT_EVAL
6140 && !can_f_submatch /* can't do this recursively */
6141#endif
6142 )
6143 {
6144#ifdef FEAT_EVAL
6145 /* To make sure that the length doesn't change between checking the
6146 * length and copying the string, and to speed up things, the
6147 * resulting string is saved from the call with "copy" == FALSE to the
6148 * call with "copy" == TRUE. */
6149 if (copy)
6150 {
6151 if (eval_result != NULL)
6152 {
6153 STRCPY(dest, eval_result);
6154 dst += STRLEN(eval_result);
6155 vim_free(eval_result);
6156 eval_result = NULL;
6157 }
6158 }
6159 else
6160 {
6161 linenr_T save_reg_maxline;
6162 win_T *save_reg_win;
6163 int save_ireg_ic;
6164
6165 vim_free(eval_result);
6166
6167 /* The expression may contain substitute(), which calls us
6168 * recursively. Make sure submatch() gets the text from the first
6169 * level. Don't need to save "reg_buf", because
6170 * vim_regexec_multi() can't be called recursively. */
6171 submatch_match = reg_match;
6172 submatch_mmatch = reg_mmatch;
6173 save_reg_maxline = reg_maxline;
6174 save_reg_win = reg_win;
6175 save_ireg_ic = ireg_ic;
6176 can_f_submatch = TRUE;
6177
6178 eval_result = eval_to_string(source + 2, NULL);
6179 if (eval_result != NULL)
6180 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00006181 for (s = eval_result; *s != NUL; mb_ptr_adv(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00006182 {
6183 /* Change NL to CR, so that it becomes a line break.
6184 * Skip over a backslashed character. */
6185 if (*s == NL)
6186 *s = CAR;
6187 else if (*s == '\\' && s[1] != NUL)
6188 ++s;
Bram Moolenaar071d4272004-06-13 20:20:40 +00006189 }
6190
6191 dst += STRLEN(eval_result);
6192 }
6193
6194 reg_match = submatch_match;
6195 reg_mmatch = submatch_mmatch;
6196 reg_maxline = save_reg_maxline;
6197 reg_win = save_reg_win;
6198 ireg_ic = save_ireg_ic;
6199 can_f_submatch = FALSE;
6200 }
6201#endif
6202 }
6203 else
6204 while ((c = *src++) != NUL)
6205 {
6206 if (c == '&' && magic)
6207 no = 0;
6208 else if (c == '\\' && *src != NUL)
6209 {
6210 if (*src == '&' && !magic)
6211 {
6212 ++src;
6213 no = 0;
6214 }
6215 else if ('0' <= *src && *src <= '9')
6216 {
6217 no = *src++ - '0';
6218 }
6219 else if (vim_strchr((char_u *)"uUlLeE", *src))
6220 {
6221 switch (*src++)
6222 {
6223 case 'u': func = (fptr)do_upper;
6224 continue;
6225 case 'U': func = (fptr)do_Upper;
6226 continue;
6227 case 'l': func = (fptr)do_lower;
6228 continue;
6229 case 'L': func = (fptr)do_Lower;
6230 continue;
6231 case 'e':
6232 case 'E': func = (fptr)NULL;
6233 continue;
6234 }
6235 }
6236 }
6237 if (no < 0) /* Ordinary character. */
6238 {
6239 if (c == '\\' && *src != NUL)
6240 {
6241 /* Check for abbreviations -- webb */
6242 switch (*src)
6243 {
6244 case 'r': c = CAR; ++src; break;
6245 case 'n': c = NL; ++src; break;
6246 case 't': c = TAB; ++src; break;
6247 /* Oh no! \e already has meaning in subst pat :-( */
6248 /* case 'e': c = ESC; ++src; break; */
6249 case 'b': c = Ctrl_H; ++src; break;
6250
6251 /* If "backslash" is TRUE the backslash will be removed
6252 * later. Used to insert a literal CR. */
6253 default: if (backslash)
6254 {
6255 if (copy)
6256 *dst = '\\';
6257 ++dst;
6258 }
6259 c = *src++;
6260 }
6261 }
6262
6263 /* Write to buffer, if copy is set. */
6264#ifdef FEAT_MBYTE
6265 if (has_mbyte && (l = (*mb_ptr2len_check)(src - 1)) > 1)
6266 {
6267 /* TODO: should use "func" here. */
6268 if (copy)
6269 mch_memmove(dst, src - 1, l);
6270 dst += l - 1;
6271 src += l - 1;
6272 }
6273 else
6274 {
6275#endif
6276 if (copy)
6277 {
6278 if (func == (fptr)NULL) /* just copy */
6279 *dst = c;
6280 else /* change case */
6281 func = (fptr)(func(dst, c));
6282 /* Turbo C complains without the typecast */
6283 }
6284#ifdef FEAT_MBYTE
6285 }
6286#endif
6287 dst++;
6288 }
6289 else
6290 {
6291 if (REG_MULTI)
6292 {
6293 clnum = reg_mmatch->startpos[no].lnum;
6294 if (clnum < 0 || reg_mmatch->endpos[no].lnum < 0)
6295 s = NULL;
6296 else
6297 {
6298 s = reg_getline(clnum) + reg_mmatch->startpos[no].col;
6299 if (reg_mmatch->endpos[no].lnum == clnum)
6300 len = reg_mmatch->endpos[no].col
6301 - reg_mmatch->startpos[no].col;
6302 else
6303 len = (int)STRLEN(s);
6304 }
6305 }
6306 else
6307 {
6308 s = reg_match->startp[no];
6309 if (reg_match->endp[no] == NULL)
6310 s = NULL;
6311 else
6312 len = (int)(reg_match->endp[no] - s);
6313 }
6314 if (s != NULL)
6315 {
6316 for (;;)
6317 {
6318 if (len == 0)
6319 {
6320 if (REG_MULTI)
6321 {
6322 if (reg_mmatch->endpos[no].lnum == clnum)
6323 break;
6324 if (copy)
6325 *dst = CAR;
6326 ++dst;
6327 s = reg_getline(++clnum);
6328 if (reg_mmatch->endpos[no].lnum == clnum)
6329 len = reg_mmatch->endpos[no].col;
6330 else
6331 len = (int)STRLEN(s);
6332 }
6333 else
6334 break;
6335 }
6336 else if (*s == NUL) /* we hit NUL. */
6337 {
6338 if (copy)
6339 EMSG(_(e_re_damg));
6340 goto exit;
6341 }
6342 else
6343 {
6344 if (backslash && (*s == CAR || *s == '\\'))
6345 {
6346 /*
6347 * Insert a backslash in front of a CR, otherwise
6348 * it will be replaced by a line break.
6349 * Number of backslashes will be halved later,
6350 * double them here.
6351 */
6352 if (copy)
6353 {
6354 dst[0] = '\\';
6355 dst[1] = *s;
6356 }
6357 dst += 2;
6358 }
6359#ifdef FEAT_MBYTE
6360 else if (has_mbyte && (l = (*mb_ptr2len_check)(s)) > 1)
6361 {
6362 /* TODO: should use "func" here. */
6363 if (copy)
6364 mch_memmove(dst, s, l);
6365 dst += l;
6366 s += l - 1;
6367 len -= l - 1;
6368 }
6369#endif
6370 else
6371 {
6372 if (copy)
6373 {
6374 if (func == (fptr)NULL) /* just copy */
6375 *dst = *s;
6376 else /* change case */
6377 func = (fptr)(func(dst, *s));
6378 /* Turbo C complains without the typecast */
6379 }
6380 ++dst;
6381 }
6382 ++s;
6383 --len;
6384 }
6385 }
6386 }
6387 no = -1;
6388 }
6389 }
6390 if (copy)
6391 *dst = NUL;
6392
6393exit:
6394 return (int)((dst - dest) + 1);
6395}
6396
#ifdef FEAT_EVAL
/*
 * Used for the submatch() function: get the string from the n'th submatch in
 * allocated memory.
 * For a multi-line match the lines are joined with a '\n' in between.
 * Returns NULL when not in a ":s" command, for a non-existing submatch and
 * when allocating the memory failed.
 */
    char_u *
reg_submatch(no)
    int         no;
{
    char_u      *retval = NULL;
    char_u      *s;
    int         len;
    int         round;
    linenr_T    lnum;

    if (!can_f_submatch)
        return NULL;

    if (submatch_match == NULL)
    {
        /*
         * First round: compute the length and allocate memory.
         * Second round: copy the text.
         */
        for (round = 1; round <= 2; ++round)
        {
            lnum = submatch_mmatch->startpos[no].lnum;
            if (lnum < 0 || submatch_mmatch->endpos[no].lnum < 0)
                return NULL;

            /* Check the line pointer before adding the column offset,
             * afterwards a NULL can no longer be recognized. */
            s = reg_getline(lnum);
            if (s == NULL)  /* anti-crash check, cannot happen? */
            {
                vim_free(retval);
                return NULL;
            }
            s += submatch_mmatch->startpos[no].col;

            if (submatch_mmatch->endpos[no].lnum == lnum)
            {
                /* Within one line: take from start to end col. */
                len = submatch_mmatch->endpos[no].col
                                          - submatch_mmatch->startpos[no].col;
                if (round == 2)
                {
                    STRNCPY(retval, s, len);
                    retval[len] = NUL;
                }
                ++len;
            }
            else
            {
                /* Multiple lines: take start line from start col, middle
                 * lines completely and end line up to end col. */
                len = (int)STRLEN(s);
                if (round == 2)
                {
                    STRCPY(retval, s);
                    retval[len] = '\n';
                }
                ++len;
                ++lnum;
                while (lnum < submatch_mmatch->endpos[no].lnum)
                {
                    s = reg_getline(lnum++);
                    if (round == 2)
                        STRCPY(retval + len, s);
                    len += (int)STRLEN(s);
                    if (round == 2)
                        retval[len] = '\n';
                    ++len;
                }
                if (round == 2)
                    STRNCPY(retval + len, reg_getline(lnum),
                                             submatch_mmatch->endpos[no].col);
                len += submatch_mmatch->endpos[no].col;
                if (round == 2)
                    retval[len] = NUL;
                ++len;
            }

            if (round == 1)
            {
                /* "len" now includes the terminating NUL. */
                retval = lalloc((long_u)len, TRUE);
                if (retval == NULL)
                    return NULL;
            }
        }
    }
    else
    {
        if (submatch_match->endp[no] == NULL)
            retval = NULL;
        else
        {
            s = submatch_match->startp[no];
            retval = vim_strnsave(s, (int)(submatch_match->endp[no] - s));
        }
    }

    return retval;
}
#endif