blob: 2e828541abb71d5dff9be3efd8866958518a2bf8 [file] [log] [blame]
Bram Moolenaar071d4272004-06-13 20:20:40 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 *
5 * NOTICE:
6 *
7 * This is NOT the original regular expression code as written by Henry
8 * Spencer. This code has been modified specifically for use with the VIM
9 * editor, and should not be used separately from Vim. If you want a good
10 * regular expression library, get the original code. The copyright notice
11 * that follows is from the original.
12 *
13 * END NOTICE
14 *
15 * Copyright (c) 1986 by University of Toronto.
16 * Written by Henry Spencer. Not derived from licensed software.
17 *
18 * Permission is granted to anyone to use this software for any
19 * purpose on any computer system, and to redistribute it freely,
20 * subject to the following restrictions:
21 *
22 * 1. The author is not responsible for the consequences of use of
23 * this software, no matter how awful, even if they arise
24 * from defects in it.
25 *
26 * 2. The origin of this software must not be misrepresented, either
27 * by explicit claim or by omission.
28 *
29 * 3. Altered versions must be plainly marked as such, and must not
30 * be misrepresented as being the original software.
31 *
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
35 *
Bram Moolenaarc0197e22004-09-13 20:26:32 +000036 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * Webb, Ciaran McCreesh and Bram Moolenaar.
Bram Moolenaar071d4272004-06-13 20:20:40 +000038 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */
40
41#include "vim.h"
42
43#undef DEBUG
44
45/*
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
49 *
50 * regstart char that must begin a match; NUL if none obvious; Can be a
51 * multi-byte character.
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
55 * regflags RF_ values or'ed together
56 *
57 * Regstart and reganch permit very fast decisions on suitable starting points
58 * for a match, cutting down the work a lot. Regmust permits fast rejection
59 * of lines that cannot possibly match. The regmust tests are costly enough
60 * that vim_regcomp() supplies a regmust only if the r.e. contains something
61 * potentially expensive (at present, the only such thing detected is * or +
62 * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
64 * computing it anyway.
65 */
66
67/*
68 * Structure for regexp "program". This is essentially a linear encoding
69 * of a nondeterministic finite-state machine (aka syntax charts or
70 * "railroad normal form" in parsing technology). Each node is an opcode
71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
73 * pointer with a BRANCH on both ends of it is connecting two alternatives.
74 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
75 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
77 * node points to the node after the stuff to be repeated. The operand of some
78 * types of node is a literal string; for others, it is a node leading into a
79 * sub-FSM. In particular, the operand of a BRANCH node is the first node of
80 * the branch. (NB this is *not* a tree structure: the tail of the branch
81 * connects to the thing following the set of BRANCHes.)
82 *
83 * pattern is coded like:
84 *
85 * +-----------------+
86 * | V
87 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
88 * | ^ | ^
89 * +------+ +----------+
90 *
91 *
92 * +------------------+
93 * V |
94 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
95 * | | ^ ^
96 * | +---------------+ |
97 * +---------------------------------------------+
98 *
99 *
100 * +-------------------------+
101 * V |
102 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
103 * | | ^
104 * | +----------------+
105 * +-----------------------------------------------+
106 *
107 *
108 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
109 * | | ^ ^
110 * | +----------------+ |
111 * +--------------------------------+
112 *
113 * +---------+
114 * | V
115 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
116 * | | | | ^ ^
117 * | | | +-----+ |
118 * | | +----------------+ |
119 * | +---------------------------+ |
120 * +------------------------------------------------------+
121 *
 * They all start with a BRANCH for "\|" alternatives, even when there is only
123 * one alternative.
124 */
125
126/*
127 * The opcodes are:
128 */
129
/*
 * Basic opcodes.
 * Columns: name, number, operand kind (blank when there is none), meaning.
 */
#define END             0   /*       End of program or NOMATCH operand. */
#define BOL             1   /*       Match "" at beginning of line. */
#define EOL             2   /*       Match "" at end of line. */
#define BRANCH          3   /* node  Match this alternative, or the next... */
#define BACK            4   /*       Match "", "next" ptr points backward. */
#define EXACTLY         5   /* str   Match this string. */
#define NOTHING         6   /*       Match empty string. */
#define STAR            7   /* node  Match this (simple) thing 0 or more
                             *       times. */
#define PLUS            8   /* node  Match this (simple) thing 1 or more
                             *       times. */
#define MATCH           9   /* node  Match the operand zero-width. */
#define NOMATCH         10  /* node  Check for no match with operand. */
#define BEHIND          11  /* node  Look behind for a match with operand. */
#define NOBEHIND        12  /* node  Look behind for no match with operand. */
#define SUBPAT          13  /* node  Match the operand here. */
#define BRACE_SIMPLE    14  /* node  Match this (simple) thing between m and
                             *       n times (\{m,n\}). */
#define BOW             15  /*       Match "" after [^a-zA-Z0-9_] */
#define EOW             16  /*       Match "" at [^a-zA-Z0-9_] */
#define BRACE_LIMITS    17  /* nr nr Define the min & max for BRACE_SIMPLE
                             *       and BRACE_COMPLEX. */
#define NEWL            18  /*       Match line-break. */
#define BHPOS           19  /*       End position for BEHIND or NOBEHIND. */
156
157
/*
 * Character class opcodes: 20-48 are the normal classes, 50-78 (the same
 * opcode plus ADD_NL) additionally match a line-break.
 *
 * FIRST_NL and LAST_NL are fully parenthesized so that they expand to a
 * single value in any expression (e.g. "op - FIRST_NL" or "2 * LAST_NL"),
 * not only inside the relational tests of WITH_NL().
 */
#define ADD_NL      30
#define FIRST_NL    (ANY + ADD_NL)
#define ANY         20  /*      Match any one character. */
#define ANYOF       21  /* str  Match any character in this string. */
#define ANYBUT      22  /* str  Match any character not in this
                         *      string. */
#define IDENT       23  /*      Match identifier char */
#define SIDENT      24  /*      Match identifier char but no digit */
#define KWORD       25  /*      Match keyword char */
#define SKWORD      26  /*      Match word char but no digit */
#define FNAME       27  /*      Match file name char */
#define SFNAME      28  /*      Match file name char but no digit */
#define PRINT       29  /*      Match printable char */
#define SPRINT      30  /*      Match printable char but no digit */
#define WHITE       31  /*      Match whitespace char */
#define NWHITE      32  /*      Match non-whitespace char */
#define DIGIT       33  /*      Match digit char */
#define NDIGIT      34  /*      Match non-digit char */
#define HEX         35  /*      Match hex char */
#define NHEX        36  /*      Match non-hex char */
#define OCTAL       37  /*      Match octal char */
#define NOCTAL      38  /*      Match non-octal char */
#define WORD        39  /*      Match word char */
#define NWORD       40  /*      Match non-word char */
#define HEAD        41  /*      Match head char */
#define NHEAD       42  /*      Match non-head char */
#define ALPHA       43  /*      Match alpha char */
#define NALPHA      44  /*      Match non-alpha char */
#define LOWER       45  /*      Match lowercase char */
#define NLOWER      46  /*      Match non-lowercase char */
#define UPPER       47  /*      Match uppercase char */
#define NUPPER      48  /*      Match non-uppercase char */
#define LAST_NL     (NUPPER + ADD_NL)
/* TRUE when opcode "op" is one of the classes that also match a line-break. */
#define WITH_NL(op) ((op) >= FIRST_NL && (op) <= LAST_NL)
193
/*
 * Opcodes that come in groups of ten: the opcode shown is the base and the
 * sub-expression number (0-9) is added to it.
 */
#define MOPEN           80  /* -89       Mark this point in input as start of
                             *           \( subexpr.  MOPEN + 0 marks start of
                             *           match. */
#define MCLOSE          90  /* -99       Analogous to MOPEN.  MCLOSE + 0 marks
                             *           end of match. */
#define BACKREF         100 /* -109 node Match same string again \1-\9 */

#ifdef FEAT_SYN_HL
# define ZOPEN          110 /* -119      Mark this point in input as start of
                             *           \z( subexpr. */
# define ZCLOSE         120 /* -129      Analogous to ZOPEN. */
# define ZREF           130 /* -139 node Match external submatch \z1-\z9 */
#endif

#define BRACE_COMPLEX   140 /* -149 node Match nodes between m & n times */

#define NOPEN           150 /*           Mark this point in input as start of
                             *           \%( subexpr. */
#define NCLOSE          151 /*           Analogous to NOPEN. */

#define MULTIBYTECODE   200 /* mbc       Match one multi-byte character */
#define RE_BOF          201 /*           Match "" at beginning of file. */
#define RE_EOF          202 /*           Match "" at end of file. */
#define CURSOR          203 /*           Match location of cursor. */

#define RE_LNUM         204 /* nr cmp    Match line number */
#define RE_COL          205 /* nr cmp    Match column number */
#define RE_VCOL         206 /* nr cmp    Match virtual column number */
222
/*
 * Magic characters do not match literally; they carry a special meaning.
 * A Magic value is the character code minus 256 and therefore negative,
 * which keeps it apart from every literal character (possibly multi-byte).
 * Only ASCII characters can be Magic.
 */
#define Magic(x)        ((int)(x) - 256)    /* character -> Magic value */
#define un_Magic(x)     ((x) + 256)         /* Magic value -> character */
#define is_Magic(x)     ((x) < 0)           /* non-zero for a Magic value */
231
232static int no_Magic __ARGS((int x));
233static int toggle_Magic __ARGS((int x));
234
/*
 * Return "x" without magicness: a Magic value is converted back to the
 * plain character, any other value is returned unchanged.
 */
    static int
no_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : x;
}
243
/*
 * Flip the magicness of "x": a Magic value becomes the plain character and
 * a plain character becomes its Magic value.
 */
    static int
toggle_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
252
/*
 * Magic number stored in the first byte of the regexp internal "program";
 * the start node begins in the second byte.  It serves to detect the most
 * severe mutilation of the program by the caller.
 */

#define REGMAGIC        0234
260
261/*
262 * Opcode notes:
263 *
264 * BRANCH The set of branches constituting a single choice are hooked
265 * together with their "next" pointers, since precedence prevents
266 * anything being concatenated to any individual branch. The
267 * "next" pointer of the last BRANCH in a choice points to the
268 * thing following the whole choice. This is also where the
269 * final "next" pointer of each individual branch points; each
270 * branch starts with the operand node of a BRANCH node.
271 *
272 * BACK Normal "next" pointers all implicitly point forward; BACK
273 * exists to make loop structures possible.
274 *
275 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
276 * BRANCH structures using BACK. Simple cases (one character
277 * per match) are implemented with STAR and PLUS for speed
278 * and to minimize recursive plunges.
279 *
280 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
281 * node, and defines the min and max limits to be used for that
282 * node.
283 *
284 * MOPEN,MCLOSE ...are numbered at compile time.
285 * ZOPEN,ZCLOSE ...ditto
286 */
287
288/*
289 * A node is one char of opcode followed by two chars of "next" pointer.
290 * "Next" pointers are stored as two 8-bit bytes, high order first. The
291 * value is a positive offset from the opcode of the node containing it.
292 * An operand, if any, simply follows the node. (Note that much of the
293 * code generation knows about this implicit relationship.)
294 *
295 * Using two bytes for the "next" pointer is vast overkill for most things,
296 * but allows patterns to get big without disasters.
297 */
/* Opcode of the node at "p". */
#define OP(p)           ((int)*(p))
/* The two-byte "next" offset of the node at "p", high order byte first. */
#define NEXT(p)         (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
/* Start of the operand: it directly follows the three-byte node header. */
#define OPERAND(p)      ((p) + 3)
/* First numeric operand, stored as four bytes with the MSB first. */
#define OPERAND_MIN(p)  (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
                        + ((long)(p)[5] << 8) + (long)(p)[6])
/* Second numeric operand: the four bytes that follow the first one. */
#define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
/* Single-byte operand stored right after a four-byte operand. */
#define OPERAND_CMP(p)  (p)[7]
308
/*
 * Utility definitions.
 */

/* The byte at "p", read as an unsigned character and widened to int. */
#define UCHARAT(p)      ((int)*(char_u *)(p))

/*
 * Error exits for use in (functions called from) vim_regcomp(): give the
 * error message, set rc_did_emsg and return from the calling function with
 * NULL or FAIL.
 * EMSG_M_RET_NULL passes "" as the message argument when "c" is non-zero
 * and a backslash otherwise.
 */
#define EMSG_RET_NULL(m) { EMSG(m); rc_did_emsg = TRUE; return NULL; }
#define EMSG_M_RET_NULL(m, c) { EMSG2(m, c ? "" : "\\"); rc_did_emsg = TRUE; return NULL; }
#define EMSG_RET_FAIL(m) { EMSG(m); rc_did_emsg = TRUE; return FAIL; }
#define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)

/* 32767 shifted into the upper 16 bits: 0x7fff0000. */
#define MAX_LIMIT       (32767L << 16L)
322
323static int re_multi_type __ARGS((int));
324static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
325static char_u *cstrchr __ARGS((char_u *, int));
326
327#ifdef DEBUG
328static void regdump __ARGS((char_u *, regprog_T *));
329static char_u *regprop __ARGS((char_u *));
330#endif
331
332#define NOT_MULTI 0
333#define MULTI_ONE 1
334#define MULTI_MULT 2
335/*
336 * Return NOT_MULTI if c is not a "multi" operator.
337 * Return MULTI_ONE if c is a single "multi" operator.
338 * Return MULTI_MULT if c is a multi "multi" operator.
339 */
340 static int
341re_multi_type(c)
342 int c;
343{
344 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
345 return MULTI_ONE;
346 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
347 return MULTI_MULT;
348 return NOT_MULTI;
349}
350
/*
 * Flag bits handed up and down between the parsing functions.
 */
#define HASWIDTH        0x1     /* Known never to match null string. */
#define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
#define SPSTART         0x4     /* Starts with * or +. */
#define HASNL           0x8     /* Contains some \n. */
#define HASLOOKBH       0x10    /* Contains "\@<=" or "\@<!". */
#define WORST           0       /* Worst case: none of the above. */
360
361/*
362 * When regcode is set to this value, code is not emitted and size is computed
363 * instead.
364 */
365#define JUST_CALC_SIZE ((char_u *) -1)
366
367static char_u *reg_prev_sub;
368
369/*
370 * REGEXP_INRANGE contains all characters which are always special in a []
371 * range after '\'.
372 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
373 * These are:
374 * \n - New line (NL).
375 * \r - Carriage Return (CR).
376 * \t - Tab (TAB).
377 * \e - Escape (ESC).
378 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000379 * \d - Character code in decimal, eg \d123
380 * \o - Character code in octal, eg \o80
381 * \x - Character code in hex, eg \x4a
382 * \u - Multibyte character code, eg \u20ac
383 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384 */
385static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000386static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000387
388static int backslash_trans __ARGS((int c));
389static int skip_class_name __ARGS((char_u **pp));
390static char_u *skip_anyof __ARGS((char_u *p));
391static void init_class_tab __ARGS((void));
392
393/*
394 * Translate '\x' to its control character, except "\n", which is Magic.
395 */
396 static int
397backslash_trans(c)
398 int c;
399{
400 switch (c)
401 {
402 case 'r': return CAR;
403 case 't': return TAB;
404 case 'e': return ESC;
405 case 'b': return BS;
406 }
407 return c;
408}
409
410/*
411 * Check for a character class name. "pp" points to the '['.
412 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
413 * recognized. Otherwise "pp" is advanced to after the item.
414 */
415 static int
416skip_class_name(pp)
417 char_u **pp;
418{
419 static const char *(class_names[]) =
420 {
421 "alnum:]",
422#define CLASS_ALNUM 0
423 "alpha:]",
424#define CLASS_ALPHA 1
425 "blank:]",
426#define CLASS_BLANK 2
427 "cntrl:]",
428#define CLASS_CNTRL 3
429 "digit:]",
430#define CLASS_DIGIT 4
431 "graph:]",
432#define CLASS_GRAPH 5
433 "lower:]",
434#define CLASS_LOWER 6
435 "print:]",
436#define CLASS_PRINT 7
437 "punct:]",
438#define CLASS_PUNCT 8
439 "space:]",
440#define CLASS_SPACE 9
441 "upper:]",
442#define CLASS_UPPER 10
443 "xdigit:]",
444#define CLASS_XDIGIT 11
445 "tab:]",
446#define CLASS_TAB 12
447 "return:]",
448#define CLASS_RETURN 13
449 "backspace:]",
450#define CLASS_BACKSPACE 14
451 "escape:]",
452#define CLASS_ESCAPE 15
453 };
454#define CLASS_NONE 99
455 int i;
456
457 if ((*pp)[1] == ':')
458 {
459 for (i = 0; i < sizeof(class_names) / sizeof(*class_names); ++i)
460 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
461 {
462 *pp += STRLEN(class_names[i]) + 2;
463 return i;
464 }
465 }
466 return CLASS_NONE;
467}
468
469/*
470 * Skip over a "[]" range.
471 * "p" must point to the character after the '['.
472 * The returned pointer is on the matching ']', or the terminating NUL.
473 */
474 static char_u *
475skip_anyof(p)
476 char_u *p;
477{
478 int cpo_lit; /* 'cpoptions' contains 'l' flag */
479#ifdef FEAT_MBYTE
480 int l;
481#endif
482
483 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
484
485 if (*p == '^') /* Complement of range. */
486 ++p;
487 if (*p == ']' || *p == '-')
488 ++p;
489 while (*p != NUL && *p != ']')
490 {
491#ifdef FEAT_MBYTE
492 if (has_mbyte && (l = (*mb_ptr2len_check)(p)) > 1)
493 p += l;
494 else
495#endif
496 if (*p == '-')
497 {
498 ++p;
499 if (*p != ']' && *p != NUL)
Bram Moolenaar1cd871b2004-12-19 22:46:22 +0000500 mb_ptr_adv(p);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000501 }
502 else if (*p == '\\'
503 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
504 || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
505 p += 2;
506 else if (*p == '[')
507 {
508 if (skip_class_name(&p) == CLASS_NONE)
509 ++p; /* It was not a class name */
510 }
511 else
512 ++p;
513 }
514
515 return p;
516}
517
/*
 * Vim-specific versions of the character class functions.
 * A lookup table, indexed by character, keeps them fast; every entry is a
 * combination of the RI_ bits below.
 */
static short class_tab[256];

#define RI_DIGIT    0x01
#define RI_HEX      0x02
#define RI_OCTAL    0x04
#define RI_WORD     0x08
#define RI_HEAD     0x10
#define RI_ALPHA    0x20
#define RI_LOWER    0x40
#define RI_UPPER    0x80
#define RI_WHITE    0x100
533
534 static void
535init_class_tab()
536{
537 int i;
538 static int done = FALSE;
539
540 if (done)
541 return;
542
543 for (i = 0; i < 256; ++i)
544 {
545 if (i >= '0' && i <= '7')
546 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
547 else if (i >= '8' && i <= '9')
548 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
549 else if (i >= 'a' && i <= 'f')
550 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
551#ifdef EBCDIC
552 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
553 || (i >= 's' && i <= 'z'))
554#else
555 else if (i >= 'g' && i <= 'z')
556#endif
557 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
558 else if (i >= 'A' && i <= 'F')
559 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
560#ifdef EBCDIC
561 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
562 || (i >= 'S' && i <= 'Z'))
563#else
564 else if (i >= 'G' && i <= 'Z')
565#endif
566 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
567 else if (i == '_')
568 class_tab[i] = RI_WORD + RI_HEAD;
569 else
570 class_tab[i] = 0;
571 }
572 class_tab[' '] |= RI_WHITE;
573 class_tab['\t'] |= RI_WHITE;
574 done = TRUE;
575}
576
/*
 * Character class tests backed by class_tab[]; each yields non-zero when
 * "c" has the corresponding RI_ bit set.
 * With FEAT_MBYTE a value of 0x100 or above never matches, as class_tab[]
 * only covers 0-255.  In that variant "c" is evaluated twice, so it must
 * not have side effects.  The argument is parenthesized in the range test
 * so that any expression can be passed safely.
 */
#ifdef FEAT_MBYTE
# define ri_digit(c)    ((c) < 0x100 && (class_tab[c] & RI_DIGIT))
# define ri_hex(c)      ((c) < 0x100 && (class_tab[c] & RI_HEX))
# define ri_octal(c)    ((c) < 0x100 && (class_tab[c] & RI_OCTAL))
# define ri_word(c)     ((c) < 0x100 && (class_tab[c] & RI_WORD))
# define ri_head(c)     ((c) < 0x100 && (class_tab[c] & RI_HEAD))
# define ri_alpha(c)    ((c) < 0x100 && (class_tab[c] & RI_ALPHA))
# define ri_lower(c)    ((c) < 0x100 && (class_tab[c] & RI_LOWER))
# define ri_upper(c)    ((c) < 0x100 && (class_tab[c] & RI_UPPER))
# define ri_white(c)    ((c) < 0x100 && (class_tab[c] & RI_WHITE))
#else
# define ri_digit(c)    (class_tab[c] & RI_DIGIT)
# define ri_hex(c)      (class_tab[c] & RI_HEX)
# define ri_octal(c)    (class_tab[c] & RI_OCTAL)
# define ri_word(c)     (class_tab[c] & RI_WORD)
# define ri_head(c)     (class_tab[c] & RI_HEAD)
# define ri_alpha(c)    (class_tab[c] & RI_ALPHA)
# define ri_lower(c)    (class_tab[c] & RI_LOWER)
# define ri_upper(c)    (class_tab[c] & RI_UPPER)
# define ri_white(c)    (class_tab[c] & RI_WHITE)
#endif
598
/* Bits that can be set in regflags. */
#define RF_ICASE        1       /* ignore case */
#define RF_NOICASE      2       /* don't ignore case */
#define RF_HASNL        4       /* can match a NL */
#define RF_ICOMBINE     8       /* ignore combining characters */
#define RF_LOOKBH       16      /* uses "\@<=" or "\@<!" */
605
606/*
607 * Global work variables for vim_regcomp().
608 */
609
610static char_u *regparse; /* Input-scan pointer. */
611static int prevchr_len; /* byte length of previous char */
612static int num_complex_braces; /* Complex \{...} count */
613static int regnpar; /* () count. */
614#ifdef FEAT_SYN_HL
615static int regnzpar; /* \z() count. */
616static int re_has_z; /* \z item detected */
617#endif
618static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
619static long regsize; /* Code size. */
620static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */
621static unsigned regflags; /* RF_ flags for prog */
622static long brace_min[10]; /* Minimums for complex brace repeats */
623static long brace_max[10]; /* Maximums for complex brace repeats */
624static int brace_count[10]; /* Current counts for complex brace repeats */
625#if defined(FEAT_SYN_HL) || defined(PROTO)
626static int had_eol; /* TRUE when EOL found by vim_regcomp() */
627#endif
628static int one_exactly = FALSE; /* only do one char for EXACTLY */
629
630static int reg_magic; /* magicness of the pattern: */
631#define MAGIC_NONE 1 /* "\V" very unmagic */
632#define MAGIC_OFF 2 /* "\M" or 'magic' off */
633#define MAGIC_ON 3 /* "\m" or 'magic' */
634#define MAGIC_ALL 4 /* "\v" very magic */
635
636static int reg_string; /* matching with a string instead of a buffer
637 line */
638
639/*
640 * META contains all characters that may be magic, except '^' and '$'.
641 */
642
643#ifdef EBCDIC
644static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
645#else
646/* META[] is used often enough to justify turning it into a table. */
647static char_u META_flags[] = {
648 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
649 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
650/* % & ( ) * + . */
651 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
652/* 1 2 3 4 5 6 7 8 9 < = > ? */
653 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
654/* @ A C D F H I K L M O */
655 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
656/* P S U V W X Z [ _ */
657 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
658/* a c d f h i k l m n o */
659 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
660/* p s u v w x z { | ~ */
661 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
662};
663#endif
664
static int      curchr;

/* Values for the "paren" argument of reg(). */
#define REG_NOPAREN     0       /* toplevel reg() */
#define REG_PAREN       1       /* \(\) */
#define REG_ZPAREN      2       /* \z(\) */
#define REG_NPAREN      3       /* \%(\) */
673/*
674 * Forward declarations for vim_regcomp()'s friends.
675 */
676static void initchr __ARGS((char_u *));
677static int getchr __ARGS((void));
678static void skipchr_keepstart __ARGS((void));
679static int peekchr __ARGS((void));
680static void skipchr __ARGS((void));
681static void ungetchr __ARGS((void));
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000682static int gethexchrs __ARGS((int maxinputlen));
683static int getoctchrs __ARGS((void));
684static int getdecchrs __ARGS((void));
685static int coll_get_char __ARGS((void));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000686static void regcomp_start __ARGS((char_u *expr, int flags));
687static char_u *reg __ARGS((int, int *));
688static char_u *regbranch __ARGS((int *flagp));
689static char_u *regconcat __ARGS((int *flagp));
690static char_u *regpiece __ARGS((int *));
691static char_u *regatom __ARGS((int *));
692static char_u *regnode __ARGS((int));
693static int prog_magic_wrong __ARGS((void));
694static char_u *regnext __ARGS((char_u *));
695static void regc __ARGS((int b));
696#ifdef FEAT_MBYTE
697static void regmbc __ARGS((int c));
698#endif
699static void reginsert __ARGS((int, char_u *));
700static void reginsert_limits __ARGS((int, long, long, char_u *));
701static char_u *re_put_long __ARGS((char_u *pr, long_u val));
702static int read_limits __ARGS((long *, long *));
703static void regtail __ARGS((char_u *, char_u *));
704static void regoptail __ARGS((char_u *, char_u *));
705
706/*
707 * Return TRUE if compiled regular expression "prog" can match a line break.
708 */
709 int
710re_multiline(prog)
711 regprog_T *prog;
712{
713 return (prog->regflags & RF_HASNL);
714}
715
716/*
717 * Return TRUE if compiled regular expression "prog" looks before the start
718 * position (pattern contains "\@<=" or "\@<!").
719 */
720 int
721re_lookbehind(prog)
722 regprog_T *prog;
723{
724 return (prog->regflags & RF_LOOKBH);
725}
726
727/*
728 * Skip past regular expression.
729 * Stop at end of 'p' of where 'dirc' is found ('/', '?', etc).
730 * Take care of characters with a backslash in front of it.
731 * Skip strings inside [ and ].
732 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
733 * expression and change "\?" to "?". If "*newp" is not NULL the expression
734 * is changed in-place.
735 */
736 char_u *
737skip_regexp(startp, dirc, magic, newp)
738 char_u *startp;
739 int dirc;
740 int magic;
741 char_u **newp;
742{
743 int mymagic;
744 char_u *p = startp;
745
746 if (magic)
747 mymagic = MAGIC_ON;
748 else
749 mymagic = MAGIC_OFF;
750
Bram Moolenaar1cd871b2004-12-19 22:46:22 +0000751 for (; p[0] != NUL; mb_ptr_adv(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000752 {
753 if (p[0] == dirc) /* found end of regexp */
754 break;
755 if ((p[0] == '[' && mymagic >= MAGIC_ON)
756 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
757 {
758 p = skip_anyof(p + 1);
759 if (p[0] == NUL)
760 break;
761 }
762 else if (p[0] == '\\' && p[1] != NUL)
763 {
764 if (dirc == '?' && newp != NULL && p[1] == '?')
765 {
766 /* change "\?" to "?", make a copy first. */
767 if (*newp == NULL)
768 {
769 *newp = vim_strsave(startp);
770 if (*newp != NULL)
771 p = *newp + (p - startp);
772 }
773 if (*newp != NULL)
774 mch_memmove(p, p + 1, STRLEN(p));
775 else
776 ++p;
777 }
778 else
779 ++p; /* skip next character */
780 if (*p == 'v')
781 mymagic = MAGIC_ALL;
782 else if (*p == 'V')
783 mymagic = MAGIC_NONE;
784 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000785 }
786 return p;
787}
788
789/*
790 * vim_regcomp - compile a regular expression into internal code
791 *
792 * We can't allocate space until we know how big the compiled form will be,
793 * but we can't compile it (and thus know how big it is) until we've got a
794 * place to put the code. So we cheat: we compile it twice, once with code
795 * generation turned off and size counting turned on, and once "for real".
796 * This also means that we don't allocate space until we are sure that the
797 * thing really will compile successfully, and we never have to move the
798 * code and thus invalidate pointers into it. (Note that it has to be in
799 * one piece because vim_free() must be able to free it all.)
800 *
801 * Whether upper/lower case is to be ignored is decided when executing the
802 * program, it does not matter here.
803 *
804 * Beware that the optimization-preparation code in here knows about some
805 * of the structure of the compiled regexp.
806 * "re_flags": RE_MAGIC and/or RE_STRING.
807 */
808 regprog_T *
809vim_regcomp(expr, re_flags)
810 char_u *expr;
811 int re_flags;
812{
813 regprog_T *r;
814 char_u *scan;
815 char_u *longest;
816 int len;
817 int flags;
818
819 if (expr == NULL)
820 EMSG_RET_NULL(_(e_null));
821
822 init_class_tab();
823
824 /*
825 * First pass: determine size, legality.
826 */
827 regcomp_start(expr, re_flags);
828 regcode = JUST_CALC_SIZE;
829 regc(REGMAGIC);
830 if (reg(REG_NOPAREN, &flags) == NULL)
831 return NULL;
832
833 /* Small enough for pointer-storage convention? */
834#ifdef SMALL_MALLOC /* 16 bit storage allocation */
835 if (regsize >= 65536L - 256L)
836 EMSG_RET_NULL(_("E339: Pattern too long"));
837#endif
838
839 /* Allocate space. */
840 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE);
841 if (r == NULL)
842 return NULL;
843
844 /*
845 * Second pass: emit code.
846 */
847 regcomp_start(expr, re_flags);
848 regcode = r->program;
849 regc(REGMAGIC);
850 if (reg(REG_NOPAREN, &flags) == NULL)
851 {
852 vim_free(r);
853 return NULL;
854 }
855
856 /* Dig out information for optimizations. */
857 r->regstart = NUL; /* Worst-case defaults. */
858 r->reganch = 0;
859 r->regmust = NULL;
860 r->regmlen = 0;
861 r->regflags = regflags;
862 if (flags & HASNL)
863 r->regflags |= RF_HASNL;
864 if (flags & HASLOOKBH)
865 r->regflags |= RF_LOOKBH;
866#ifdef FEAT_SYN_HL
867 /* Remember whether this pattern has any \z specials in it. */
868 r->reghasz = re_has_z;
869#endif
870 scan = r->program + 1; /* First BRANCH. */
871 if (OP(regnext(scan)) == END) /* Only one top-level choice. */
872 {
873 scan = OPERAND(scan);
874
875 /* Starting-point info. */
876 if (OP(scan) == BOL || OP(scan) == RE_BOF)
877 {
878 r->reganch++;
879 scan = regnext(scan);
880 }
881
882 if (OP(scan) == EXACTLY)
883 {
884#ifdef FEAT_MBYTE
885 if (has_mbyte)
886 r->regstart = (*mb_ptr2char)(OPERAND(scan));
887 else
888#endif
889 r->regstart = *OPERAND(scan);
890 }
891 else if ((OP(scan) == BOW
892 || OP(scan) == EOW
893 || OP(scan) == NOTHING
894 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
895 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
896 && OP(regnext(scan)) == EXACTLY)
897 {
898#ifdef FEAT_MBYTE
899 if (has_mbyte)
900 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
901 else
902#endif
903 r->regstart = *OPERAND(regnext(scan));
904 }
905
906 /*
907 * If there's something expensive in the r.e., find the longest
908 * literal string that must appear and make it the regmust. Resolve
909 * ties in favor of later strings, since the regstart check works
910 * with the beginning of the r.e. and avoiding duplication
911 * strengthens checking. Not a strong reason, but sufficient in the
912 * absence of others.
913 */
914 /*
915 * When the r.e. starts with BOW, it is faster to look for a regmust
916 * first. Used a lot for "#" and "*" commands. (Added by mool).
917 */
918 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
919 && !(flags & HASNL))
920 {
921 longest = NULL;
922 len = 0;
923 for (; scan != NULL; scan = regnext(scan))
924 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
925 {
926 longest = OPERAND(scan);
927 len = (int)STRLEN(OPERAND(scan));
928 }
929 r->regmust = longest;
930 r->regmlen = len;
931 }
932 }
933#ifdef DEBUG
934 regdump(expr, r);
935#endif
936 return r;
937}
938
939/*
940 * Setup to parse the regexp. Used once to get the length and once to do it.
941 */
942 static void
943regcomp_start(expr, re_flags)
944 char_u *expr;
945 int re_flags; /* see vim_regcomp() */
946{
947 initchr(expr);
948 if (re_flags & RE_MAGIC)
949 reg_magic = MAGIC_ON;
950 else
951 reg_magic = MAGIC_OFF;
952 reg_string = (re_flags & RE_STRING);
953
954 num_complex_braces = 0;
955 regnpar = 1;
956 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
957#ifdef FEAT_SYN_HL
958 regnzpar = 1;
959 re_has_z = 0;
960#endif
961 regsize = 0L;
962 regflags = 0;
963#if defined(FEAT_SYN_HL) || defined(PROTO)
964 had_eol = FALSE;
965#endif
966}
967
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Return the "had_eol" flag.  It is cleared by regcomp_start() and set when
 * the parser meets the end-of-line item "$" (or "\_$"), so it tells whether
 * the most recently compiled pattern contained that item.
 * Passing this through a global is messy, but it works.
 */
    int
vim_regcomp_had_eol()
{
    int         seen_eol = had_eol;

    return seen_eol;
}
#endif
979
980/*
981 * reg - regular expression, i.e. main body or parenthesized thing
982 *
983 * Caller must absorb opening parenthesis.
984 *
985 * Combining parenthesis handling with the base level of regular expression
986 * is a trifle forced, but the need to tie the tails of the branches to what
987 * follows makes it hard to avoid.
988 */
989 static char_u *
990reg(paren, flagp)
991 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
992 int *flagp;
993{
994 char_u *ret;
995 char_u *br;
996 char_u *ender;
997 int parno = 0;
998 int flags;
999
1000 *flagp = HASWIDTH; /* Tentatively. */
1001
1002#ifdef FEAT_SYN_HL
1003 if (paren == REG_ZPAREN)
1004 {
1005 /* Make a ZOPEN node. */
1006 if (regnzpar >= NSUBEXP)
1007 EMSG_RET_NULL(_("E50: Too many \\z("));
1008 parno = regnzpar;
1009 regnzpar++;
1010 ret = regnode(ZOPEN + parno);
1011 }
1012 else
1013#endif
1014 if (paren == REG_PAREN)
1015 {
1016 /* Make a MOPEN node. */
1017 if (regnpar >= NSUBEXP)
1018 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1019 parno = regnpar;
1020 ++regnpar;
1021 ret = regnode(MOPEN + parno);
1022 }
1023 else if (paren == REG_NPAREN)
1024 {
1025 /* Make a NOPEN node. */
1026 ret = regnode(NOPEN);
1027 }
1028 else
1029 ret = NULL;
1030
1031 /* Pick up the branches, linking them together. */
1032 br = regbranch(&flags);
1033 if (br == NULL)
1034 return NULL;
1035 if (ret != NULL)
1036 regtail(ret, br); /* [MZ]OPEN -> first. */
1037 else
1038 ret = br;
1039 /* If one of the branches can be zero-width, the whole thing can.
1040 * If one of the branches has * at start or matches a line-break, the
1041 * whole thing can. */
1042 if (!(flags & HASWIDTH))
1043 *flagp &= ~HASWIDTH;
1044 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1045 while (peekchr() == Magic('|'))
1046 {
1047 skipchr();
1048 br = regbranch(&flags);
1049 if (br == NULL)
1050 return NULL;
1051 regtail(ret, br); /* BRANCH -> BRANCH. */
1052 if (!(flags & HASWIDTH))
1053 *flagp &= ~HASWIDTH;
1054 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1055 }
1056
1057 /* Make a closing node, and hook it on the end. */
1058 ender = regnode(
1059#ifdef FEAT_SYN_HL
1060 paren == REG_ZPAREN ? ZCLOSE + parno :
1061#endif
1062 paren == REG_PAREN ? MCLOSE + parno :
1063 paren == REG_NPAREN ? NCLOSE : END);
1064 regtail(ret, ender);
1065
1066 /* Hook the tails of the branches to the closing node. */
1067 for (br = ret; br != NULL; br = regnext(br))
1068 regoptail(br, ender);
1069
1070 /* Check for proper termination. */
1071 if (paren != REG_NOPAREN && getchr() != Magic(')'))
1072 {
1073#ifdef FEAT_SYN_HL
1074 if (paren == REG_ZPAREN)
1075 EMSG_RET_NULL(_("E52: Unmatched \\z("))
1076 else
1077#endif
1078 if (paren == REG_NPAREN)
1079 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL)
1080 else
1081 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL)
1082 }
1083 else if (paren == REG_NOPAREN && peekchr() != NUL)
1084 {
1085 if (curchr == Magic(')'))
1086 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL)
1087 else
1088 EMSG_RET_NULL(_(e_trailing)) /* "Can't happen". */
1089 /* NOTREACHED */
1090 }
1091 /*
1092 * Here we set the flag allowing back references to this set of
1093 * parentheses.
1094 */
1095 if (paren == REG_PAREN)
1096 had_endbrace[parno] = TRUE; /* have seen the close paren */
1097 return ret;
1098}
1099
1100/*
1101 * regbranch - one alternative of an | operator
1102 *
1103 * Implements the & operator.
1104 */
1105 static char_u *
1106regbranch(flagp)
1107 int *flagp;
1108{
1109 char_u *ret;
1110 char_u *chain = NULL;
1111 char_u *latest;
1112 int flags;
1113
1114 *flagp = WORST | HASNL; /* Tentatively. */
1115
1116 ret = regnode(BRANCH);
1117 for (;;)
1118 {
1119 latest = regconcat(&flags);
1120 if (latest == NULL)
1121 return NULL;
1122 /* If one of the branches has width, the whole thing has. If one of
1123 * the branches anchors at start-of-line, the whole thing does.
1124 * If one of the branches uses look-behind, the whole thing does. */
1125 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1126 /* If one of the branches doesn't match a line-break, the whole thing
1127 * doesn't. */
1128 *flagp &= ~HASNL | (flags & HASNL);
1129 if (chain != NULL)
1130 regtail(chain, latest);
1131 if (peekchr() != Magic('&'))
1132 break;
1133 skipchr();
1134 regtail(latest, regnode(END)); /* operand ends */
1135 reginsert(MATCH, latest);
1136 chain = latest;
1137 }
1138
1139 return ret;
1140}
1141
1142/*
1143 * regbranch - one alternative of an | or & operator
1144 *
1145 * Implements the concatenation operator.
1146 */
1147 static char_u *
1148regconcat(flagp)
1149 int *flagp;
1150{
1151 char_u *first = NULL;
1152 char_u *chain = NULL;
1153 char_u *latest;
1154 int flags;
1155 int cont = TRUE;
1156
1157 *flagp = WORST; /* Tentatively. */
1158
1159 while (cont)
1160 {
1161 switch (peekchr())
1162 {
1163 case NUL:
1164 case Magic('|'):
1165 case Magic('&'):
1166 case Magic(')'):
1167 cont = FALSE;
1168 break;
1169 case Magic('Z'):
1170#ifdef FEAT_MBYTE
1171 regflags |= RF_ICOMBINE;
1172#endif
1173 skipchr_keepstart();
1174 break;
1175 case Magic('c'):
1176 regflags |= RF_ICASE;
1177 skipchr_keepstart();
1178 break;
1179 case Magic('C'):
1180 regflags |= RF_NOICASE;
1181 skipchr_keepstart();
1182 break;
1183 case Magic('v'):
1184 reg_magic = MAGIC_ALL;
1185 skipchr_keepstart();
1186 curchr = -1;
1187 break;
1188 case Magic('m'):
1189 reg_magic = MAGIC_ON;
1190 skipchr_keepstart();
1191 curchr = -1;
1192 break;
1193 case Magic('M'):
1194 reg_magic = MAGIC_OFF;
1195 skipchr_keepstart();
1196 curchr = -1;
1197 break;
1198 case Magic('V'):
1199 reg_magic = MAGIC_NONE;
1200 skipchr_keepstart();
1201 curchr = -1;
1202 break;
1203 default:
1204 latest = regpiece(&flags);
1205 if (latest == NULL)
1206 return NULL;
1207 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1208 if (chain == NULL) /* First piece. */
1209 *flagp |= flags & SPSTART;
1210 else
1211 regtail(chain, latest);
1212 chain = latest;
1213 if (first == NULL)
1214 first = latest;
1215 break;
1216 }
1217 }
1218 if (first == NULL) /* Loop ran zero times. */
1219 first = regnode(NOTHING);
1220 return first;
1221}
1222
1223/*
1224 * regpiece - something followed by possible [*+=]
1225 *
1226 * Note that the branching code sequences used for = and the general cases
1227 * of * and + are somewhat optimized: they use the same NOTHING node as
1228 * both the endmarker for their branch list and the body of the last branch.
1229 * It might seem that this node could be dispensed with entirely, but the
1230 * endmarker role is not redundant.
1231 */
1232 static char_u *
1233regpiece(flagp)
1234 int *flagp;
1235{
1236 char_u *ret;
1237 int op;
1238 char_u *next;
1239 int flags;
1240 long minval;
1241 long maxval;
1242
1243 ret = regatom(&flags);
1244 if (ret == NULL)
1245 return NULL;
1246
1247 op = peekchr();
1248 if (re_multi_type(op) == NOT_MULTI)
1249 {
1250 *flagp = flags;
1251 return ret;
1252 }
1253 if (!(flags & HASWIDTH) && re_multi_type(op) == MULTI_MULT)
1254 {
1255 if (op == Magic('*'))
1256 EMSG_M_RET_NULL(_("E56: %s* operand could be empty"),
1257 reg_magic >= MAGIC_ON);
1258 if (op == Magic('+'))
1259 EMSG_M_RET_NULL(_("E57: %s+ operand could be empty"),
1260 reg_magic == MAGIC_ALL);
1261 /* "\{}" is checked below, it's allowed when there is an upper limit */
1262 }
1263 /* default flags */
1264 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1265
1266 skipchr();
1267 switch (op)
1268 {
1269 case Magic('*'):
1270 if (flags & SIMPLE)
1271 reginsert(STAR, ret);
1272 else
1273 {
1274 /* Emit x* as (x&|), where & means "self". */
1275 reginsert(BRANCH, ret); /* Either x */
1276 regoptail(ret, regnode(BACK)); /* and loop */
1277 regoptail(ret, ret); /* back */
1278 regtail(ret, regnode(BRANCH)); /* or */
1279 regtail(ret, regnode(NOTHING)); /* null. */
1280 }
1281 break;
1282
1283 case Magic('+'):
1284 if (flags & SIMPLE)
1285 reginsert(PLUS, ret);
1286 else
1287 {
1288 /* Emit x+ as x(&|), where & means "self". */
1289 next = regnode(BRANCH); /* Either */
1290 regtail(ret, next);
1291 regtail(regnode(BACK), ret); /* loop back */
1292 regtail(next, regnode(BRANCH)); /* or */
1293 regtail(ret, regnode(NOTHING)); /* null. */
1294 }
1295 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1296 break;
1297
1298 case Magic('@'):
1299 {
1300 int lop = END;
1301
1302 switch (no_Magic(getchr()))
1303 {
1304 case '=': lop = MATCH; break; /* \@= */
1305 case '!': lop = NOMATCH; break; /* \@! */
1306 case '>': lop = SUBPAT; break; /* \@> */
1307 case '<': switch (no_Magic(getchr()))
1308 {
1309 case '=': lop = BEHIND; break; /* \@<= */
1310 case '!': lop = NOBEHIND; break; /* \@<! */
1311 }
1312 }
1313 if (lop == END)
1314 EMSG_M_RET_NULL(_("E59: invalid character after %s@"),
1315 reg_magic == MAGIC_ALL);
1316 /* Look behind must match with behind_pos. */
1317 if (lop == BEHIND || lop == NOBEHIND)
1318 {
1319 regtail(ret, regnode(BHPOS));
1320 *flagp |= HASLOOKBH;
1321 }
1322 regtail(ret, regnode(END)); /* operand ends */
1323 reginsert(lop, ret);
1324 break;
1325 }
1326
1327 case Magic('?'):
1328 case Magic('='):
1329 /* Emit x= as (x|) */
1330 reginsert(BRANCH, ret); /* Either x */
1331 regtail(ret, regnode(BRANCH)); /* or */
1332 next = regnode(NOTHING); /* null. */
1333 regtail(ret, next);
1334 regoptail(ret, next);
1335 break;
1336
1337 case Magic('{'):
1338 if (!read_limits(&minval, &maxval))
1339 return NULL;
1340 if (!(flags & HASWIDTH) && (maxval > minval
1341 ? maxval >= MAX_LIMIT : minval >= MAX_LIMIT))
1342 EMSG_M_RET_NULL(_("E58: %s{ operand could be empty"),
1343 reg_magic == MAGIC_ALL);
1344 if (flags & SIMPLE)
1345 {
1346 reginsert(BRACE_SIMPLE, ret);
1347 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1348 }
1349 else
1350 {
1351 if (num_complex_braces >= 10)
1352 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"),
1353 reg_magic == MAGIC_ALL);
1354 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1355 regoptail(ret, regnode(BACK));
1356 regoptail(ret, ret);
1357 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1358 ++num_complex_braces;
1359 }
1360 if (minval > 0 && maxval > 0)
1361 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1362 break;
1363 }
1364 if (re_multi_type(peekchr()) != NOT_MULTI)
1365 {
1366 /* Can't have a multi follow a multi. */
1367 if (peekchr() == Magic('*'))
1368 sprintf((char *)IObuff, _("E61: Nested %s*"),
1369 reg_magic >= MAGIC_ON ? "" : "\\");
1370 else
1371 sprintf((char *)IObuff, _("E62: Nested %s%c"),
1372 reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1373 EMSG_RET_NULL(IObuff);
1374 }
1375
1376 return ret;
1377}
1378
1379/*
1380 * regatom - the lowest level
1381 *
1382 * Optimization: gobbles an entire sequence of ordinary characters so that
1383 * it can turn them into a single node, which is smaller to store and
1384 * faster to run. Don't do this when one_exactly is set.
1385 */
1386 static char_u *
1387regatom(flagp)
1388 int *flagp;
1389{
1390 char_u *ret;
1391 int flags;
1392 int cpo_lit; /* 'cpoptions' contains 'l' flag */
1393 int c;
1394 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1395 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
1396 FNAME, SFNAME, PRINT, SPRINT,
1397 WHITE, NWHITE, DIGIT, NDIGIT,
1398 HEX, NHEX, OCTAL, NOCTAL,
1399 WORD, NWORD, HEAD, NHEAD,
1400 ALPHA, NALPHA, LOWER, NLOWER,
1401 UPPER, NUPPER
1402 };
1403 char_u *p;
1404 int extra = 0;
1405
1406 *flagp = WORST; /* Tentatively. */
1407 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
1408
1409 c = getchr();
1410 switch (c)
1411 {
1412 case Magic('^'):
1413 ret = regnode(BOL);
1414 break;
1415
1416 case Magic('$'):
1417 ret = regnode(EOL);
1418#if defined(FEAT_SYN_HL) || defined(PROTO)
1419 had_eol = TRUE;
1420#endif
1421 break;
1422
1423 case Magic('<'):
1424 ret = regnode(BOW);
1425 break;
1426
1427 case Magic('>'):
1428 ret = regnode(EOW);
1429 break;
1430
1431 case Magic('_'):
1432 c = no_Magic(getchr());
1433 if (c == '^') /* "\_^" is start-of-line */
1434 {
1435 ret = regnode(BOL);
1436 break;
1437 }
1438 if (c == '$') /* "\_$" is end-of-line */
1439 {
1440 ret = regnode(EOL);
1441#if defined(FEAT_SYN_HL) || defined(PROTO)
1442 had_eol = TRUE;
1443#endif
1444 break;
1445 }
1446
1447 extra = ADD_NL;
1448 *flagp |= HASNL;
1449
1450 /* "\_[" is character range plus newline */
1451 if (c == '[')
1452 goto collection;
1453
1454 /* "\_x" is character class plus newline */
1455 /*FALLTHROUGH*/
1456
1457 /*
1458 * Character classes.
1459 */
1460 case Magic('.'):
1461 case Magic('i'):
1462 case Magic('I'):
1463 case Magic('k'):
1464 case Magic('K'):
1465 case Magic('f'):
1466 case Magic('F'):
1467 case Magic('p'):
1468 case Magic('P'):
1469 case Magic('s'):
1470 case Magic('S'):
1471 case Magic('d'):
1472 case Magic('D'):
1473 case Magic('x'):
1474 case Magic('X'):
1475 case Magic('o'):
1476 case Magic('O'):
1477 case Magic('w'):
1478 case Magic('W'):
1479 case Magic('h'):
1480 case Magic('H'):
1481 case Magic('a'):
1482 case Magic('A'):
1483 case Magic('l'):
1484 case Magic('L'):
1485 case Magic('u'):
1486 case Magic('U'):
1487 p = vim_strchr(classchars, no_Magic(c));
1488 if (p == NULL)
1489 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1490 ret = regnode(classcodes[p - classchars] + extra);
1491 *flagp |= HASWIDTH | SIMPLE;
1492 break;
1493
1494 case Magic('n'):
1495 if (reg_string)
1496 {
1497 /* In a string "\n" matches a newline character. */
1498 ret = regnode(EXACTLY);
1499 regc(NL);
1500 regc(NUL);
1501 *flagp |= HASWIDTH | SIMPLE;
1502 }
1503 else
1504 {
1505 /* In buffer text "\n" matches the end of a line. */
1506 ret = regnode(NEWL);
1507 *flagp |= HASWIDTH | HASNL;
1508 }
1509 break;
1510
1511 case Magic('('):
1512 if (one_exactly)
1513 EMSG_ONE_RET_NULL;
1514 ret = reg(REG_PAREN, &flags);
1515 if (ret == NULL)
1516 return NULL;
1517 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1518 break;
1519
1520 case NUL:
1521 case Magic('|'):
1522 case Magic('&'):
1523 case Magic(')'):
1524 EMSG_RET_NULL(_(e_internal)); /* Supposed to be caught earlier. */
1525 /* NOTREACHED */
1526
1527 case Magic('='):
1528 case Magic('?'):
1529 case Magic('+'):
1530 case Magic('@'):
1531 case Magic('{'):
1532 case Magic('*'):
1533 c = no_Magic(c);
1534 sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1535 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1536 ? "" : "\\", c);
1537 EMSG_RET_NULL(IObuff);
1538 /* NOTREACHED */
1539
1540 case Magic('~'): /* previous substitute pattern */
1541 if (reg_prev_sub)
1542 {
1543 char_u *lp;
1544
1545 ret = regnode(EXACTLY);
1546 lp = reg_prev_sub;
1547 while (*lp != NUL)
1548 regc(*lp++);
1549 regc(NUL);
1550 if (*reg_prev_sub != NUL)
1551 {
1552 *flagp |= HASWIDTH;
1553 if ((lp - reg_prev_sub) == 1)
1554 *flagp |= SIMPLE;
1555 }
1556 }
1557 else
1558 EMSG_RET_NULL(_(e_nopresub));
1559 break;
1560
1561 case Magic('1'):
1562 case Magic('2'):
1563 case Magic('3'):
1564 case Magic('4'):
1565 case Magic('5'):
1566 case Magic('6'):
1567 case Magic('7'):
1568 case Magic('8'):
1569 case Magic('9'):
1570 {
1571 int refnum;
1572
1573 refnum = c - Magic('0');
1574 /*
1575 * Check if the back reference is legal. We must have seen the
1576 * close brace.
1577 * TODO: Should also check that we don't refer to something
1578 * that is repeated (+*=): what instance of the repetition
1579 * should we match?
1580 */
1581 if (!had_endbrace[refnum])
1582 {
1583 /* Trick: check if "@<=" or "@<!" follows, in which case
1584 * the \1 can appear before the referenced match. */
1585 for (p = regparse; *p != NUL; ++p)
1586 if (p[0] == '@' && p[1] == '<'
1587 && (p[2] == '!' || p[2] == '='))
1588 break;
1589 if (*p == NUL)
1590 EMSG_RET_NULL(_("E65: Illegal back reference"));
1591 }
1592 ret = regnode(BACKREF + refnum);
1593 }
1594 break;
1595
1596#ifdef FEAT_SYN_HL
1597 case Magic('z'):
1598 {
1599 c = no_Magic(getchr());
1600 switch (c)
1601 {
1602 case '(': if (reg_do_extmatch != REX_SET)
1603 EMSG_RET_NULL(_("E66: \\z( not allowed here"));
1604 if (one_exactly)
1605 EMSG_ONE_RET_NULL;
1606 ret = reg(REG_ZPAREN, &flags);
1607 if (ret == NULL)
1608 return NULL;
1609 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1610 re_has_z = REX_SET;
1611 break;
1612
1613 case '1':
1614 case '2':
1615 case '3':
1616 case '4':
1617 case '5':
1618 case '6':
1619 case '7':
1620 case '8':
1621 case '9': if (reg_do_extmatch != REX_USE)
1622 EMSG_RET_NULL(_("E67: \\z1 et al. not allowed here"));
1623 ret = regnode(ZREF + c - '0');
1624 re_has_z = REX_USE;
1625 break;
1626
1627 case 's': ret = regnode(MOPEN + 0);
1628 break;
1629
1630 case 'e': ret = regnode(MCLOSE + 0);
1631 break;
1632
1633 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1634 }
1635 }
1636 break;
1637#endif
1638
1639 case Magic('%'):
1640 {
1641 c = no_Magic(getchr());
1642 switch (c)
1643 {
1644 /* () without a back reference */
1645 case '(':
1646 if (one_exactly)
1647 EMSG_ONE_RET_NULL;
1648 ret = reg(REG_NPAREN, &flags);
1649 if (ret == NULL)
1650 return NULL;
1651 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1652 break;
1653
1654 /* Catch \%^ and \%$ regardless of where they appear in the
1655 * pattern -- regardless of whether or not it makes sense. */
1656 case '^':
1657 ret = regnode(RE_BOF);
1658 break;
1659
1660 case '$':
1661 ret = regnode(RE_EOF);
1662 break;
1663
1664 case '#':
1665 ret = regnode(CURSOR);
1666 break;
1667
1668 /* \%[abc]: Emit as a list of branches, all ending at the last
1669 * branch which matches nothing. */
1670 case '[':
1671 if (one_exactly) /* doesn't nest */
1672 EMSG_ONE_RET_NULL;
1673 {
1674 char_u *lastbranch;
1675 char_u *lastnode = NULL;
1676 char_u *br;
1677
1678 ret = NULL;
1679 while ((c = getchr()) != ']')
1680 {
1681 if (c == NUL)
1682 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["),
1683 reg_magic == MAGIC_ALL);
1684 br = regnode(BRANCH);
1685 if (ret == NULL)
1686 ret = br;
1687 else
1688 regtail(lastnode, br);
1689
1690 ungetchr();
1691 one_exactly = TRUE;
1692 lastnode = regatom(flagp);
1693 one_exactly = FALSE;
1694 if (lastnode == NULL)
1695 return NULL;
1696 }
1697 if (ret == NULL)
1698 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"),
1699 reg_magic == MAGIC_ALL);
1700 lastbranch = regnode(BRANCH);
1701 br = regnode(NOTHING);
1702 if (ret != JUST_CALC_SIZE)
1703 {
1704 regtail(lastnode, br);
1705 regtail(lastbranch, br);
1706 /* connect all branches to the NOTHING
1707 * branch at the end */
1708 for (br = ret; br != lastnode; )
1709 {
1710 if (OP(br) == BRANCH)
1711 {
1712 regtail(br, lastbranch);
1713 br = OPERAND(br);
1714 }
1715 else
1716 br = regnext(br);
1717 }
1718 }
1719 *flagp &= ~HASWIDTH;
1720 break;
1721 }
1722
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001723 case 'd': /* %d123 decimal */
1724 case 'o': /* %o123 octal */
1725 case 'x': /* %xab hex 2 */
1726 case 'u': /* %uabcd hex 4 */
1727 case 'U': /* %U1234abcd hex 8 */
1728 {
1729 int i;
1730
1731 switch (c)
1732 {
1733 case 'd': i = getdecchrs(); break;
1734 case 'o': i = getoctchrs(); break;
1735 case 'x': i = gethexchrs(2); break;
1736 case 'u': i = gethexchrs(4); break;
1737 case 'U': i = gethexchrs(8); break;
1738 default: i = -1; break;
1739 }
1740
1741 if (i < 0)
1742 EMSG_M_RET_NULL(
1743 _("E678: Invalid character after %s%%[dxouU]"),
1744 reg_magic == MAGIC_ALL);
1745 ret = regnode(EXACTLY);
1746 if (i == 0)
1747 regc(0x0a);
1748 else
1749#ifdef FEAT_MBYTE
1750 regmbc(i);
1751#else
1752 regc(i);
1753#endif
1754 regc(NUL);
1755 *flagp |= HASWIDTH;
1756 break;
1757 }
1758
Bram Moolenaar071d4272004-06-13 20:20:40 +00001759 default:
1760 if (VIM_ISDIGIT(c) || c == '<' || c == '>')
1761 {
1762 long_u n = 0;
1763 int cmp;
1764
1765 cmp = c;
1766 if (cmp == '<' || cmp == '>')
1767 c = getchr();
1768 while (VIM_ISDIGIT(c))
1769 {
1770 n = n * 10 + (c - '0');
1771 c = getchr();
1772 }
1773 if (c == 'l' || c == 'c' || c == 'v')
1774 {
1775 if (c == 'l')
1776 ret = regnode(RE_LNUM);
1777 else if (c == 'c')
1778 ret = regnode(RE_COL);
1779 else
1780 ret = regnode(RE_VCOL);
1781 if (ret == JUST_CALC_SIZE)
1782 regsize += 5;
1783 else
1784 {
1785 /* put the number and the optional
1786 * comparator after the opcode */
1787 regcode = re_put_long(regcode, n);
1788 *regcode++ = cmp;
1789 }
1790 break;
1791 }
1792 }
1793
1794 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"),
1795 reg_magic == MAGIC_ALL);
1796 }
1797 }
1798 break;
1799
1800 case Magic('['):
1801collection:
1802 {
1803 char_u *lp;
1804
1805 /*
1806 * If there is no matching ']', we assume the '[' is a normal
1807 * character. This makes 'incsearch' and ":help [" work.
1808 */
1809 lp = skip_anyof(regparse);
1810 if (*lp == ']') /* there is a matching ']' */
1811 {
1812 int startc = -1; /* > 0 when next '-' is a range */
1813 int endc;
1814
1815 /*
1816 * In a character class, different parsing rules apply.
1817 * Not even \ is special anymore, nothing is.
1818 */
1819 if (*regparse == '^') /* Complement of range. */
1820 {
1821 ret = regnode(ANYBUT + extra);
1822 regparse++;
1823 }
1824 else
1825 ret = regnode(ANYOF + extra);
1826
1827 /* At the start ']' and '-' mean the literal character. */
1828 if (*regparse == ']' || *regparse == '-')
1829 regc(*regparse++);
1830
1831 while (*regparse != NUL && *regparse != ']')
1832 {
1833 if (*regparse == '-')
1834 {
1835 ++regparse;
1836 /* The '-' is not used for a range at the end and
1837 * after or before a '\n'. */
1838 if (*regparse == ']' || *regparse == NUL
1839 || startc == -1
1840 || (regparse[0] == '\\' && regparse[1] == 'n'))
1841 {
1842 regc('-');
1843 startc = '-'; /* [--x] is a range */
1844 }
1845 else
1846 {
1847#ifdef FEAT_MBYTE
1848 if (has_mbyte)
1849 endc = mb_ptr2char_adv(&regparse);
1850 else
1851#endif
1852 endc = *regparse++;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001853
1854 /* Handle \o40, \x20 and \u20AC style sequences */
1855 if (endc == '\\' && !cpo_lit)
1856 endc = coll_get_char();
1857
Bram Moolenaar071d4272004-06-13 20:20:40 +00001858 if (startc > endc)
1859 EMSG_RET_NULL(_(e_invrange));
1860#ifdef FEAT_MBYTE
1861 if (has_mbyte && ((*mb_char2len)(startc) > 1
1862 || (*mb_char2len)(endc) > 1))
1863 {
1864 /* Limit to a range of 256 chars */
1865 if (endc > startc + 256)
1866 EMSG_RET_NULL(_(e_invrange));
1867 while (++startc <= endc)
1868 regmbc(startc);
1869 }
1870 else
1871#endif
1872 {
1873#ifdef EBCDIC
1874 int alpha_only = FALSE;
1875
1876 /* for alphabetical range skip the gaps
1877 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
1878 if (isalpha(startc) && isalpha(endc))
1879 alpha_only = TRUE;
1880#endif
1881 while (++startc <= endc)
1882#ifdef EBCDIC
1883 if (!alpha_only || isalpha(startc))
1884#endif
1885 regc(startc);
1886 }
1887 startc = -1;
1888 }
1889 }
1890 /*
1891 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1892 * accepts "\t", "\e", etc., but only when the 'l' flag in
1893 * 'cpoptions' is not included.
1894 */
1895 else if (*regparse == '\\'
1896 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1897 || (!cpo_lit
1898 && vim_strchr(REGEXP_ABBR,
1899 regparse[1]) != NULL)))
1900 {
1901 regparse++;
1902 if (*regparse == 'n')
1903 {
1904 /* '\n' in range: also match NL */
1905 if (ret != JUST_CALC_SIZE)
1906 {
1907 if (*ret == ANYBUT)
1908 *ret = ANYBUT + ADD_NL;
1909 else if (*ret == ANYOF)
1910 *ret = ANYOF + ADD_NL;
1911 /* else: must have had a \n already */
1912 }
1913 *flagp |= HASNL;
1914 regparse++;
1915 startc = -1;
1916 }
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001917 else if (*regparse == 'd'
1918 || *regparse == 'o'
1919 || *regparse == 'x'
1920 || *regparse == 'u'
1921 || *regparse == 'U')
1922 {
1923 startc = coll_get_char();
1924 if (startc == 0)
1925 regc(0x0a);
1926 else
1927#ifdef FEAT_MBYTE
1928 regmbc(startc);
1929#else
1930 regc(startc);
1931#endif
1932 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001933 else
1934 {
1935 startc = backslash_trans(*regparse++);
1936 regc(startc);
1937 }
1938 }
1939 else if (*regparse == '[')
1940 {
1941 int c_class;
1942 int cu;
1943
1944 c_class = skip_class_name(&regparse);
1945 startc = -1;
1946 /* Characters assumed to be 8 bits! */
1947 switch (c_class)
1948 {
1949 case CLASS_NONE:
1950 /* literal '[', allow [[-x] as a range */
1951 startc = *regparse++;
1952 regc(startc);
1953 break;
1954 case CLASS_ALNUM:
1955 for (cu = 1; cu <= 255; cu++)
1956 if (isalnum(cu))
1957 regc(cu);
1958 break;
1959 case CLASS_ALPHA:
1960 for (cu = 1; cu <= 255; cu++)
1961 if (isalpha(cu))
1962 regc(cu);
1963 break;
1964 case CLASS_BLANK:
1965 regc(' ');
1966 regc('\t');
1967 break;
1968 case CLASS_CNTRL:
1969 for (cu = 1; cu <= 255; cu++)
1970 if (iscntrl(cu))
1971 regc(cu);
1972 break;
1973 case CLASS_DIGIT:
1974 for (cu = 1; cu <= 255; cu++)
1975 if (VIM_ISDIGIT(cu))
1976 regc(cu);
1977 break;
1978 case CLASS_GRAPH:
1979 for (cu = 1; cu <= 255; cu++)
1980 if (isgraph(cu))
1981 regc(cu);
1982 break;
1983 case CLASS_LOWER:
1984 for (cu = 1; cu <= 255; cu++)
1985 if (islower(cu))
1986 regc(cu);
1987 break;
1988 case CLASS_PRINT:
1989 for (cu = 1; cu <= 255; cu++)
1990 if (vim_isprintc(cu))
1991 regc(cu);
1992 break;
1993 case CLASS_PUNCT:
1994 for (cu = 1; cu <= 255; cu++)
1995 if (ispunct(cu))
1996 regc(cu);
1997 break;
1998 case CLASS_SPACE:
1999 for (cu = 9; cu <= 13; cu++)
2000 regc(cu);
2001 regc(' ');
2002 break;
2003 case CLASS_UPPER:
2004 for (cu = 1; cu <= 255; cu++)
2005 if (isupper(cu))
2006 regc(cu);
2007 break;
2008 case CLASS_XDIGIT:
2009 for (cu = 1; cu <= 255; cu++)
2010 if (vim_isxdigit(cu))
2011 regc(cu);
2012 break;
2013 case CLASS_TAB:
2014 regc('\t');
2015 break;
2016 case CLASS_RETURN:
2017 regc('\r');
2018 break;
2019 case CLASS_BACKSPACE:
2020 regc('\b');
2021 break;
2022 case CLASS_ESCAPE:
2023 regc('\033');
2024 break;
2025 }
2026 }
2027 else
2028 {
2029#ifdef FEAT_MBYTE
2030 if (has_mbyte)
2031 {
2032 int len;
2033
2034 /* produce a multibyte character, including any
2035 * following composing characters */
2036 startc = mb_ptr2char(regparse);
2037 len = (*mb_ptr2len_check)(regparse);
2038 if (enc_utf8 && utf_char2len(startc) != len)
2039 startc = -1; /* composing chars */
2040 while (--len >= 0)
2041 regc(*regparse++);
2042 }
2043 else
2044#endif
2045 {
2046 startc = *regparse++;
2047 regc(startc);
2048 }
2049 }
2050 }
2051 regc(NUL);
2052 prevchr_len = 1; /* last char was the ']' */
2053 if (*regparse != ']')
2054 EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */
2055 skipchr(); /* let's be friends with the lexer again */
2056 *flagp |= HASWIDTH | SIMPLE;
2057 break;
2058 }
2059 }
2060 /* FALLTHROUGH */
2061
2062 default:
2063 {
2064 int len;
2065
2066#ifdef FEAT_MBYTE
2067 /* A multi-byte character is handled as a separate atom if it's
2068 * before a multi. */
2069 if (has_mbyte && (*mb_char2len)(c) > 1
2070 && re_multi_type(peekchr()) != NOT_MULTI)
2071 {
2072 ret = regnode(MULTIBYTECODE);
2073 regmbc(c);
2074 *flagp |= HASWIDTH | SIMPLE;
2075 break;
2076 }
2077#endif
2078
2079 ret = regnode(EXACTLY);
2080
2081 /*
2082 * Append characters as long as:
2083 * - there is no following multi, we then need the character in
2084 * front of it as a single character operand
2085 * - not running into a Magic character
2086 * - "one_exactly" is not set
2087 * But always emit at least one character. Might be a Multi,
2088 * e.g., a "[" without matching "]".
2089 */
2090 for (len = 0; c != NUL && (len == 0
2091 || (re_multi_type(peekchr()) == NOT_MULTI
2092 && !one_exactly
2093 && !is_Magic(c))); ++len)
2094 {
2095 c = no_Magic(c);
2096#ifdef FEAT_MBYTE
2097 if (has_mbyte)
2098 {
2099 regmbc(c);
2100 if (enc_utf8)
2101 {
2102 int off;
2103 int l;
2104
2105 /* Need to get composing character too, directly
2106 * access regparse for that, because skipchr() skips
2107 * over composing chars. */
2108 ungetchr();
2109 if (*regparse == '\\' && regparse[1] != NUL)
2110 off = 1;
2111 else
2112 off = 0;
2113 for (;;)
2114 {
2115 l = utf_ptr2len_check(regparse + off);
2116 if (!UTF_COMPOSINGLIKE(regparse + off,
2117 regparse + off + l))
2118 break;
2119 off += l;
2120 regmbc(utf_ptr2char(regparse + off));
2121 }
2122 skipchr();
2123 }
2124 }
2125 else
2126#endif
2127 regc(c);
2128 c = getchr();
2129 }
2130 ungetchr();
2131
2132 regc(NUL);
2133 *flagp |= HASWIDTH;
2134 if (len == 1)
2135 *flagp |= SIMPLE;
2136 }
2137 break;
2138 }
2139
2140 return ret;
2141}
2142
2143/*
2144 * emit a node
2145 * Return pointer to generated code.
2146 */
2147 static char_u *
2148regnode(op)
2149 int op;
2150{
2151 char_u *ret;
2152
2153 ret = regcode;
2154 if (ret == JUST_CALC_SIZE)
2155 regsize += 3;
2156 else
2157 {
2158 *regcode++ = op;
2159 *regcode++ = NUL; /* Null "next" pointer. */
2160 *regcode++ = NUL;
2161 }
2162 return ret;
2163}
2164
2165/*
2166 * Emit (if appropriate) a byte of code
2167 */
2168 static void
2169regc(b)
2170 int b;
2171{
2172 if (regcode == JUST_CALC_SIZE)
2173 regsize++;
2174 else
2175 *regcode++ = b;
2176}
2177
#ifdef FEAT_MBYTE
/*
 * Emit the character "c" of code as a byte sequence, using the
 * mb_char2bytes() function pointer.  When only computing the size of the
 * program (regcode == JUST_CALC_SIZE) the length given by mb_char2len() is
 * added to regsize instead.
 */
    static void
regmbc(c)
    int         c;
{
    if (regcode != JUST_CALC_SIZE)
        regcode += (*mb_char2bytes)(c, regcode);
    else
        regsize += (*mb_char2len)(c);
}
#endif
2192
2193/*
2194 * reginsert - insert an operator in front of already-emitted operand
2195 *
2196 * Means relocating the operand.
2197 */
2198 static void
2199reginsert(op, opnd)
2200 int op;
2201 char_u *opnd;
2202{
2203 char_u *src;
2204 char_u *dst;
2205 char_u *place;
2206
2207 if (regcode == JUST_CALC_SIZE)
2208 {
2209 regsize += 3;
2210 return;
2211 }
2212 src = regcode;
2213 regcode += 3;
2214 dst = regcode;
2215 while (src > opnd)
2216 *--dst = *--src;
2217
2218 place = opnd; /* Op node, where operand used to be. */
2219 *place++ = op;
2220 *place++ = NUL;
2221 *place = NUL;
2222}
2223
2224/*
2225 * reginsert_limits - insert an operator in front of already-emitted operand.
2226 * The operator has the given limit values as operands. Also set next pointer.
2227 *
2228 * Means relocating the operand.
2229 */
2230 static void
2231reginsert_limits(op, minval, maxval, opnd)
2232 int op;
2233 long minval;
2234 long maxval;
2235 char_u *opnd;
2236{
2237 char_u *src;
2238 char_u *dst;
2239 char_u *place;
2240
2241 if (regcode == JUST_CALC_SIZE)
2242 {
2243 regsize += 11;
2244 return;
2245 }
2246 src = regcode;
2247 regcode += 11;
2248 dst = regcode;
2249 while (src > opnd)
2250 *--dst = *--src;
2251
2252 place = opnd; /* Op node, where operand used to be. */
2253 *place++ = op;
2254 *place++ = NUL;
2255 *place++ = NUL;
2256 place = re_put_long(place, (long_u)minval);
2257 place = re_put_long(place, (long_u)maxval);
2258 regtail(opnd, place);
2259}
2260
2261/*
2262 * Write a long as four bytes at "p" and return pointer to the next char.
2263 */
2264 static char_u *
2265re_put_long(p, val)
2266 char_u *p;
2267 long_u val;
2268{
2269 *p++ = (char_u) ((val >> 24) & 0377);
2270 *p++ = (char_u) ((val >> 16) & 0377);
2271 *p++ = (char_u) ((val >> 8) & 0377);
2272 *p++ = (char_u) (val & 0377);
2273 return p;
2274}
2275
2276/*
2277 * regtail - set the next-pointer at the end of a node chain
2278 */
2279 static void
2280regtail(p, val)
2281 char_u *p;
2282 char_u *val;
2283{
2284 char_u *scan;
2285 char_u *temp;
2286 int offset;
2287
2288 if (p == JUST_CALC_SIZE)
2289 return;
2290
2291 /* Find last node. */
2292 scan = p;
2293 for (;;)
2294 {
2295 temp = regnext(scan);
2296 if (temp == NULL)
2297 break;
2298 scan = temp;
2299 }
2300
2301 if (OP(scan) == BACK)
2302 offset = (int)(scan - val);
2303 else
2304 offset = (int)(val - scan);
2305 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
2306 *(scan + 2) = (char_u) (offset & 0377);
2307}
2308
2309/*
2310 * regoptail - regtail on item after a BRANCH; nop if none
2311 */
2312 static void
2313regoptail(p, val)
2314 char_u *p;
2315 char_u *val;
2316{
2317 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2318 if (p == NULL || p == JUST_CALC_SIZE
2319 || (OP(p) != BRANCH
2320 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2321 return;
2322 regtail(OPERAND(p), val);
2323}
2324
2325/*
2326 * getchr() - get the next character from the pattern. We know about
2327 * magic and such, so therefore we need a lexical analyzer.
2328 */
2329
2330/* static int curchr; */
/* The lexed character before "prevchr": skipchr() copies prevchr to here. */
2331static int prevprevchr;
/* The lexed character that was current before the last skipchr(). */
2332static int prevchr;
2333static int nextchr; /* used for ungetchr() */
2334/*
2335 * Note: prevchr is sometimes -1 when we are not at the start,
2336 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
2337 * taken to be magic -- webb
2338 */
2339static int at_start; /* True when on the first character */
2340static int prev_at_start; /* True when on the second character */
2341
2342 static void
2343initchr(str)
2344 char_u *str;
2345{
2346 regparse = str;
2347 prevchr_len = 0;
2348 curchr = prevprevchr = prevchr = nextchr = -1;
2349 at_start = TRUE;
2350 prev_at_start = FALSE;
2351}
2352
/*
 * Return the next lexed character of the pattern without consuming it.
 * The result is kept in "curchr"; the pattern at "regparse" is only looked
 * at when "curchr" is -1.  A character that is magic for the current value
 * of "reg_magic" is returned as Magic(c).
 */
2353 static int
2354peekchr()
2355{
2356 if (curchr == -1)
2357 {
2358 switch (curchr = regparse[0])
2359 {
2360 case '.':
2361 case '[':
2362 case '~':
2363 /* magic when 'magic' is on */
2364 if (reg_magic >= MAGIC_ON)
2365 curchr = Magic(curchr);
2366 break;
2367 case '(':
2368 case ')':
2369 case '{':
2370 case '%':
2371 case '+':
2372 case '=':
2373 case '?':
2374 case '@':
2375 case '!':
2376 case '&':
2377 case '|':
2378 case '<':
2379 case '>':
2380 case '#': /* future ext. */
2381 case '"': /* future ext. */
2382 case '\'': /* future ext. */
2383 case ',': /* future ext. */
2384 case '-': /* future ext. */
2385 case ':': /* future ext. */
2386 case ';': /* future ext. */
2387 case '`': /* future ext. */
2388 case '/': /* Can't be used in / command */
2389 /* magic only after "\v" */
2390 if (reg_magic == MAGIC_ALL)
2391 curchr = Magic(curchr);
2392 break;
2393 case '*':
2394 /* * is not magic as the very first character, eg "?*ptr" and when
2395 * after '^', eg "/^*ptr" */
2396 if (reg_magic >= MAGIC_ON && !at_start
2397 && !(prev_at_start && prevchr == Magic('^')))
2398 curchr = Magic('*');
2399 break;
2400 case '^':
2401 /* '^' is only magic as the very first character and if it's after
2402 * "\(", "\|", "\&" or "\n"; also after "\%(" and always when
 * reg_magic is MAGIC_ALL */
2403 if (reg_magic >= MAGIC_OFF
2404 && (at_start
2405 || reg_magic == MAGIC_ALL
2406 || prevchr == Magic('(')
2407 || prevchr == Magic('|')
2408 || prevchr == Magic('&')
2409 || prevchr == Magic('n')
2410 || (no_Magic(prevchr) == '('
2411 && prevprevchr == Magic('%'))))
2412 {
2413 curchr = Magic('^');
 /* What follows a magic '^' is at the start again, so that
 * e.g. a '*' right after it is not magic. */
2414 at_start = TRUE;
2415 prev_at_start = FALSE;
2416 }
2417 break;
2418 case '$':
2419 /* '$' is only magic as the very last char and if it's in front of
2420 * either "\|", "\)", "\&", or "\n"; always when reg_magic is
 * MAGIC_ALL */
2421 if (reg_magic >= MAGIC_OFF)
2422 {
2423 char_u *p = regparse + 1;
2424
2425 /* ignore \c \C \m \M and \Z after '$' */
2426 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
2427 || p[1] == 'm' || p[1] == 'M' || p[1] == 'Z'))
2428 p += 2;
2429 if (p[0] == NUL
2430 || (p[0] == '\\'
2431 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
2432 || p[1] == 'n'))
2433 || reg_magic == MAGIC_ALL)
2434 curchr = Magic('$');
2435 }
2436 break;
2437 case '\\':
2438 {
2439 int c = regparse[1];
2440
2441 if (c == NUL)
2442 curchr = '\\'; /* trailing '\' */
2443 else if (
2444#ifdef EBCDIC
2445 vim_strchr(META, c)
2446#else
2447 c <= '~' && META_flags[c]
2448#endif
2449 )
2450 {
2451 /*
2452 * META contains everything that may be magic sometimes,
2453 * except ^ and $ ("\^" and "\$" are only magic after
2454 * "\v"). We now fetch the next character and toggle its
2455 * magicness. Therefore, \ is so meta-magic that it is
2456 * not in META.
2457 */
 /* Lex the character after the backslash with a recursive call:
 * regparse is temporarily moved past the backslash and curchr
 * is reset so that the call really looks at the pattern. */
2458 curchr = -1;
2459 prev_at_start = at_start;
2460 at_start = FALSE; /* be able to say "/\*ptr" */
2461 ++regparse;
2462 peekchr();
2463 --regparse;
2464 curchr = toggle_Magic(curchr);
2465 }
2466 else if (vim_strchr(REGEXP_ABBR, c))
2467 {
2468 /*
2469 * Handle abbreviations, like "\t" for TAB -- webb
2470 */
2471 curchr = backslash_trans(c);
2472 }
2473 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
2474 curchr = toggle_Magic(c);
2475 else
2476 {
2477 /*
2478 * Next character can never be (made) magic?
2479 * Then backslashing it won't do anything.
2480 */
2481#ifdef FEAT_MBYTE
2482 if (has_mbyte)
2483 curchr = (*mb_ptr2char)(regparse + 1);
2484 else
2485#endif
2486 curchr = c;
2487 }
2488 break;
2489 }
2490
2491#ifdef FEAT_MBYTE
2492 default:
 /* Any other character: with multi-byte enabled get the whole
 * character instead of only its first byte. */
2493 if (has_mbyte)
2494 curchr = (*mb_ptr2char)(regparse);
2495#endif
2496 }
2497 }
2498
2499 return curchr;
2500}
2501
2502/*
2503 * Eat one lexed character. Do this in a way that we can undo it.
2504 */
2505 static void
2506skipchr()
2507{
2508 /* peekchr() eats a backslash, do the same here */
2509 if (*regparse == '\\')
2510 prevchr_len = 1;
2511 else
2512 prevchr_len = 0;
2513 if (regparse[prevchr_len] != NUL)
2514 {
2515#ifdef FEAT_MBYTE
2516 if (has_mbyte)
2517 prevchr_len += (*mb_ptr2len_check)(regparse + prevchr_len);
2518 else
2519#endif
2520 ++prevchr_len;
2521 }
2522 regparse += prevchr_len;
2523 prev_at_start = at_start;
2524 at_start = FALSE;
2525 prevprevchr = prevchr;
2526 prevchr = curchr;
2527 curchr = nextchr; /* use previously unget char, or -1 */
2528 nextchr = -1;
2529}
2530
2531/*
2532 * Skip a character while keeping the value of prev_at_start for at_start.
2533 * prevchr and prevprevchr are also kept.
2534 */
2535 static void
2536skipchr_keepstart()
2537{
2538 int as = prev_at_start;
2539 int pr = prevchr;
2540 int prpr = prevprevchr;
2541
2542 skipchr();
2543 at_start = as;
2544 prevchr = pr;
2545 prevprevchr = prpr;
2546}
2547
/*
 * Return the next lexed character and consume it.
 */
    static int
getchr()
{
    int         lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
2556
2557/*
2558 * put character back. Works only once!
2559 */
2560 static void
2561ungetchr()
2562{
2563 nextchr = curchr;
2564 curchr = prevchr;
2565 prevchr = prevprevchr;
2566 at_start = prev_at_start;
2567 prev_at_start = FALSE;
2568
2569 /* Backup regparse, so that it's at the same position as before the
2570 * getchr(). */
2571 regparse -= prevchr_len;
2572}
2573
2574/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +00002575 * Get and return the value of the hex string at the current position.
2576 * Return -1 if there is no valid hex number.
2577 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002578 * blahblah\%x20asdf
2579 * before-^ ^-after
2580 * The parameter controls the maximum number of input characters. This will be
2581 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
2582 */
2583 static int
2584gethexchrs(maxinputlen)
2585 int maxinputlen;
2586{
2587 int nr = 0;
2588 int c;
2589 int i;
2590
2591 for (i = 0; i < maxinputlen; ++i)
2592 {
2593 c = regparse[0];
2594 if (!vim_isxdigit(c))
2595 break;
2596 nr <<= 4;
2597 nr |= hex2nr(c);
2598 ++regparse;
2599 }
2600
2601 if (i == 0)
2602 return -1;
2603 return nr;
2604}
2605
2606/*
2607 * get and return the value of the decimal string immediately after the
2608 * current position. Return -1 for invalid. Consumes all digits.
2609 */
2610 static int
2611getdecchrs()
2612{
2613 int nr = 0;
2614 int c;
2615 int i;
2616
2617 for (i = 0; ; ++i)
2618 {
2619 c = regparse[0];
2620 if (c < '0' || c > '9')
2621 break;
2622 nr *= 10;
2623 nr += c - '0';
2624 ++regparse;
2625 }
2626
2627 if (i == 0)
2628 return -1;
2629 return nr;
2630}
2631
2632/*
2633 * get and return the value of the octal string immediately after the current
2634 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
2635 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
2636 * treat 8 or 9 as recognised characters. Position is updated:
2637 * blahblah\%o210asdf
2638 * before-^ ^-after
2639 */
2640 static int
2641getoctchrs()
2642{
2643 int nr = 0;
2644 int c;
2645 int i;
2646
2647 for (i = 0; i < 3 && nr < 040; ++i)
2648 {
2649 c = regparse[0];
2650 if (c < '0' || c > '7')
2651 break;
2652 nr <<= 3;
2653 nr |= hex2nr(c);
2654 ++regparse;
2655 }
2656
2657 if (i == 0)
2658 return -1;
2659 return nr;
2660}
2661
2662/*
2663 * Get a number after a backslash that is inside [].
2664 * When nothing is recognized return a backslash.
2665 */
2666 static int
2667coll_get_char()
2668{
2669 int nr = -1;
2670
2671 switch (*regparse++)
2672 {
2673 case 'd': nr = getdecchrs(); break;
2674 case 'o': nr = getoctchrs(); break;
2675 case 'x': nr = gethexchrs(2); break;
2676 case 'u': nr = gethexchrs(4); break;
2677 case 'U': nr = gethexchrs(8); break;
2678 }
2679 if (nr < 0)
2680 {
2681 /* If getting the number fails be backwards compatible: the character
2682 * is a backslash. */
2683 --regparse;
2684 nr = '\\';
2685 }
2686 return nr;
2687}
2688
2689/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00002690 * read_limits - Read two integers to be taken as a minimum and maximum.
2691 * If the first character is '-', then the range is reversed.
2692 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2693 * missing, a very big number is the default.
2694 */
2695 static int
2696read_limits(minval, maxval)
2697 long *minval;
2698 long *maxval;
2699{
2700 int reverse = FALSE;
2701 char_u *first_char;
2702 long tmp;
2703
2704 if (*regparse == '-')
2705 {
2706 /* Starts with '-', so reverse the range later */
2707 regparse++;
2708 reverse = TRUE;
2709 }
2710 first_char = regparse;
2711 *minval = getdigits(&regparse);
2712 if (*regparse == ',') /* There is a comma */
2713 {
2714 if (vim_isdigit(*++regparse))
2715 *maxval = getdigits(&regparse);
2716 else
2717 *maxval = MAX_LIMIT;
2718 }
2719 else if (VIM_ISDIGIT(*first_char))
2720 *maxval = *minval; /* It was \{n} or \{-n} */
2721 else
2722 *maxval = MAX_LIMIT; /* It was \{} or \{-} */
2723 if (*regparse == '\\')
2724 regparse++; /* Allow either \{...} or \{...\} */
2725 if (*regparse != '}' || (*maxval == 0 && *minval == 0))
2726 {
2727 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
2728 reg_magic == MAGIC_ALL ? "" : "\\");
2729 EMSG_RET_FAIL(IObuff);
2730 }
2731
2732 /*
2733 * Reverse the range if there was a '-', or make sure it is in the right
2734 * order otherwise.
2735 */
2736 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
2737 {
2738 tmp = *minval;
2739 *minval = *maxval;
2740 *maxval = tmp;
2741 }
2742 skipchr(); /* let's be friends with the lexer again */
2743 return OK;
2744}
2745
2746/*
2747 * vim_regexec and friends
2748 */
2749
2750/*
2751 * Global work variables for vim_regexec().
2752 */
2753
2754/* The current match-position is remembered with these variables: */
2755static linenr_T reglnum; /* line number, relative to first line */
2756static char_u *regline; /* start of current line */
2757static char_u *reginput; /* current input, points into "regline" */
2758
2759static int need_clear_subexpr; /* subexpressions still need to be
2760 * cleared */
2761#ifdef FEAT_SYN_HL
2762static int need_clear_zsubexpr = FALSE; /* extmatch subexpressions
2763 * still need to be cleared */
2764#endif
2765
2766static int out_of_stack; /* TRUE when ran out of stack space */
2767
2768/*
2769 * Structure used to save the current input state, when it needs to be
2770 * restored after trying a match. Used by reg_save() and reg_restore().
2771 */
2772typedef struct
2773{
2774 union
2775 {
2776 char_u *ptr; /* reginput pointer, for single-line regexp */
2777 lpos_T pos; /* reginput pos, for multi-line regexp */
2778 } rs_u;
2779} regsave_T;
2780
2781/* struct to save start/end pointer/position in for \(\) */
2782typedef struct
2783{
2784 union
2785 {
2786 char_u *ptr;
2787 lpos_T pos;
2788 } se_u;
2789} save_se_T;
2790
2791static char_u *reg_getline __ARGS((linenr_T lnum));
2792static long vim_regexec_both __ARGS((char_u *line, colnr_T col));
2793static long regtry __ARGS((regprog_T *prog, colnr_T col));
2794static void cleanup_subexpr __ARGS((void));
2795#ifdef FEAT_SYN_HL
2796static void cleanup_zsubexpr __ARGS((void));
2797#endif
2798static void reg_nextline __ARGS((void));
2799static void reg_save __ARGS((regsave_T *save));
2800static void reg_restore __ARGS((regsave_T *save));
2801static int reg_save_equal __ARGS((regsave_T *save));
2802static void save_se_multi __ARGS((save_se_T *savep, lpos_T *posp));
2803static void save_se_one __ARGS((save_se_T *savep, char_u **pp));
2804
/* Save the sub-expressions before attempting a match.
 * The expansion is put in parentheses, so that it remains a single
 * expression no matter what surrounds the place where it is used. */
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))
2808
2809/* After a failed match restore the sub-expressions. */
2810#define restore_se(savep, posp, pp) { \
2811 if (REG_MULTI) \
2812 *(posp) = (savep)->se_u.pos; \
2813 else \
2814 *(pp) = (savep)->se_u.ptr; }
2815
2816static int re_num_cmp __ARGS((long_u val, char_u *scan));
2817static int regmatch __ARGS((char_u *prog));
2818static int regrepeat __ARGS((char_u *p, long maxcount));
2819
2820#ifdef DEBUG
2821int regnarrate = 0;
2822#endif
2823
2824/*
2825 * Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
2826 * Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
2827 * contains '\c' or '\C' the value is overruled.
2828 */
2829static int ireg_ic;
2830
2831#ifdef FEAT_MBYTE
2832/*
2833 * Similar to ireg_ic, but only for 'combining' characters. Set with \Z flag
2834 * in the regexp. Defaults to false, always.
2835 */
2836static int ireg_icombine;
2837#endif
2838
2839/*
2840 * Sometimes need to save a copy of a line. Since alloc()/free() is very
2841 * slow, we keep one allocated piece of memory and only re-allocate it when
2842 * it's too small. It's freed in vim_regexec_both() when finished.
2843 */
2844static char_u *reg_tofree;
2845static unsigned reg_tofreelen;
2846
2847/*
2848 * These variables are set when executing a regexp to speed up the execution.
2849 * Which ones are set depends on whether a single-line or multi-line match is
2850 * done:
2851 * single-line multi-line
2852 * reg_match &regmatch_T NULL
2853 * reg_mmatch NULL &regmmatch_T
2854 * reg_startp reg_match->startp <invalid>
2855 * reg_endp reg_match->endp <invalid>
2856 * reg_startpos <invalid> reg_mmatch->startpos
2857 * reg_endpos <invalid> reg_mmatch->endpos
2858 * reg_win NULL window in which to search
2859 * reg_buf <invalid> buffer in which to search
2860 * reg_firstlnum <invalid> first line in which to search
2861 * reg_maxline 0 last line nr
2862 * reg_line_lbr FALSE or TRUE FALSE
2863 */
2864static regmatch_T *reg_match;
2865static regmmatch_T *reg_mmatch;
2866static char_u **reg_startp = NULL;
2867static char_u **reg_endp = NULL;
2868static lpos_T *reg_startpos = NULL;
2869static lpos_T *reg_endpos = NULL;
2870static win_T *reg_win;
2871static buf_T *reg_buf;
2872static linenr_T reg_firstlnum;
2873static linenr_T reg_maxline;
2874static int reg_line_lbr; /* "\n" in string is line break */
2875
2876/*
2877 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
2878 */
2879 static char_u *
2880reg_getline(lnum)
2881 linenr_T lnum;
2882{
2883 /* when looking behind for a match/no-match lnum is negative. But we
2884 * can't go before line 1 */
2885 if (reg_firstlnum + lnum < 1)
2886 return NULL;
2887 return ml_get_buf(reg_buf, reg_firstlnum + lnum, FALSE);
2888}
2889
2890static regsave_T behind_pos;
2891
2892#ifdef FEAT_SYN_HL
2893static char_u *reg_startzp[NSUBEXP]; /* Workspace to mark beginning */
2894static char_u *reg_endzp[NSUBEXP]; /* and end of \z(...\) matches */
2895static lpos_T reg_startzpos[NSUBEXP]; /* idem, beginning pos */
2896static lpos_T reg_endzpos[NSUBEXP]; /* idem, end pos */
2897#endif
2898
2899/* TRUE if using multi-line regexp. */
2900#define REG_MULTI (reg_match == NULL)
2901
2902/*
2903 * Match a regexp against a string.
2904 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2905 * Uses curbuf for line count and 'iskeyword'.
2906 *
2907 * Return TRUE if there is a match, FALSE if not.
2908 */
2909 int
2910vim_regexec(rmp, line, col)
2911 regmatch_T *rmp;
2912 char_u *line; /* string to match against */
2913 colnr_T col; /* column to start looking for match */
2914{
2915 reg_match = rmp;
2916 reg_mmatch = NULL;
2917 reg_maxline = 0;
2918 reg_line_lbr = FALSE;
2919 reg_win = NULL;
2920 ireg_ic = rmp->rm_ic;
2921#ifdef FEAT_MBYTE
2922 ireg_icombine = FALSE;
2923#endif
2924 return (vim_regexec_both(line, col) != 0);
2925}
2926
#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
        || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
/*
 * Does the same as vim_regexec(), except that a "\n" in "line" is taken to
 * be a line break.
 */
    int
vim_regexec_nl(rmp, line, col)
    regmatch_T  *rmp;
    char_u      *line;  /* string to match against */
    colnr_T     col;    /* column to start looking for match */
{
    /* Single-line matching: only "reg_match" is used. */
    reg_mmatch = NULL;
    reg_match = rmp;
    reg_win = NULL;
    reg_maxline = 0;

    /* This is the difference with vim_regexec(). */
    reg_line_lbr = TRUE;

    ireg_ic = rmp->rm_ic;
#ifdef FEAT_MBYTE
    ireg_icombine = FALSE;
#endif

    if (vim_regexec_both(line, col) != 0)
        return TRUE;
    return FALSE;
}
#endif
2950
2951/*
2952 * Match a regexp against multiple lines.
2953 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2954 * Uses curbuf for line count and 'iskeyword'.
2955 *
2956 * Return zero if there is no match. Return number of lines contained in the
2957 * match otherwise.
2958 */
2959 long
2960vim_regexec_multi(rmp, win, buf, lnum, col)
2961 regmmatch_T *rmp;
2962 win_T *win; /* window in which to search or NULL */
2963 buf_T *buf; /* buffer in which to search */
2964 linenr_T lnum; /* nr of line to start looking for match */
2965 colnr_T col; /* column to start looking for match */
2966{
2967 long r;
2968 buf_T *save_curbuf = curbuf;
2969
2970 reg_match = NULL;
2971 reg_mmatch = rmp;
2972 reg_buf = buf;
2973 reg_win = win;
2974 reg_firstlnum = lnum;
2975 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
2976 reg_line_lbr = FALSE;
2977 ireg_ic = rmp->rmm_ic;
2978#ifdef FEAT_MBYTE
2979 ireg_icombine = FALSE;
2980#endif
2981
2982 /* Need to switch to buffer "buf" to make vim_iswordc() work. */
2983 curbuf = buf;
2984 r = vim_regexec_both(NULL, col);
2985 curbuf = save_curbuf;
2986
2987 return r;
2988}
2989
2990/*
2991 * Match a regexp against a string ("line" points to the string) or multiple
2992 * lines ("line" is NULL, use reg_getline()).
 * The caller must have set "reg_match" or "reg_mmatch" and the other work
 * variables, like vim_regexec() and vim_regexec_multi() do.
 * Returns zero when there is no match or when something went wrong,
 * otherwise what regtry() returned for the position where the match was
 * found.
2993 */
2994#ifdef HAVE_SETJMP_H
2995 static long
2996vim_regexec_both(line_arg, col_arg)
2997 char_u *line_arg;
2998 colnr_T col_arg; /* column to start looking for match */
2999#else
3000 static long
3001vim_regexec_both(line, col)
3002 char_u *line;
3003 colnr_T col; /* column to start looking for match */
3004#endif
3005{
3006 regprog_T *prog;
3007 char_u *s;
3008 long retval;
3009#ifdef HAVE_SETJMP_H
3010 char_u *line;
3011 colnr_T col;
3012#endif
3013
3014 reg_tofree = NULL;
3015
3016#ifdef HAVE_TRY_EXCEPT
3017 __try
3018 {
3019#endif
3020
3021#ifdef HAVE_SETJMP_H
3022 /*
3023 * Matching with a regexp may cause a very deep recursive call of
3024 * regmatch(). Vim will crash when running out of stack space. Catch
3025 * this here if the system supports it.
3026 */
3027 mch_startjmp();
3028 if (SETJMP(lc_jump_env) != 0)
3029 {
 /* We get here after a jump back to lc_jump_env: give up and
 * report no match. */
3030 mch_didjmp();
3031# ifdef SIGHASARG
3032 if (lc_signal != SIGINT)
3033# endif
3034 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3035 retval = 0L;
3036 goto theend;
3037 }
3038
3039 /* Trick to avoid "might be clobbered by `longjmp'" warning from gcc. */
3040 line = line_arg;
3041 col = col_arg;
3042#endif
3043 retval = 0L;
3044
 /* Get the program and the places to store the match in; for a
 * multi-line match also get the first line. */
3045 if (REG_MULTI)
3046 {
3047 prog = reg_mmatch->regprog;
3048 line = reg_getline((linenr_T)0);
3049 reg_startpos = reg_mmatch->startpos;
3050 reg_endpos = reg_mmatch->endpos;
3051 }
3052 else
3053 {
3054 prog = reg_match->regprog;
3055 reg_startp = reg_match->startp;
3056 reg_endp = reg_match->endp;
3057 }
3058
3059 /* Be paranoid... */
3060 if (prog == NULL || line == NULL)
3061 {
3062 EMSG(_(e_null));
3063 goto theend;
3064 }
3065
3066 /* Check validity of program. */
3067 if (prog_magic_wrong())
3068 goto theend;
3069
3070 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
3071 if (prog->regflags & RF_ICASE)
3072 ireg_ic = TRUE;
3073 else if (prog->regflags & RF_NOICASE)
3074 ireg_ic = FALSE;
3075
3076#ifdef FEAT_MBYTE
3077 /* If pattern contains "\Z" overrule value of ireg_icombine */
3078 if (prog->regflags & RF_ICOMBINE)
3079 ireg_icombine = TRUE;
3080#endif
3081
3082 /* If there is a "must appear" string, look for it. */
3083 if (prog->regmust != NULL)
3084 {
3085 int c;
3086
3087#ifdef FEAT_MBYTE
3088 if (has_mbyte)
3089 c = (*mb_ptr2char)(prog->regmust);
3090 else
3091#endif
3092 c = *prog->regmust;
3093 s = line + col;
 /* Find each occurrence of the first character of "regmust" and
 * check if the rest follows. */
3094 while ((s = cstrchr(s, c)) != NULL)
3095 {
3096 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
3097 break; /* Found it. */
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00003098 mb_ptr_adv(s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00003099 }
3100 if (s == NULL) /* Not present. */
3101 goto theend;
3102 }
3103
3104 regline = line;
3105 reglnum = 0;
3106 out_of_stack = FALSE;
3107
3108 /* Simplest case: Anchored match need be tried only once. */
3109 if (prog->reganch)
3110 {
3111 int c;
3112
3113#ifdef FEAT_MBYTE
3114 if (has_mbyte)
3115 c = (*mb_ptr2char)(regline + col);
3116 else
3117#endif
3118 c = regline[col];
 /* Only try the match when there is no known first character or
 * when it equals the character at "col", ignoring case when
 * "ireg_ic" is set. */
3119 if (prog->regstart == NUL
3120 || prog->regstart == c
3121 || (ireg_ic && ((
3122#ifdef FEAT_MBYTE
3123 (enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
3124 || (c < 255 && prog->regstart < 255 &&
3125#endif
3126 TOLOWER_LOC(prog->regstart) == TOLOWER_LOC(c)))))
3127 retval = regtry(prog, col);
3128 else
3129 retval = 0;
3130 }
3131 else
3132 {
3133 /* Messy cases: unanchored match. */
3134 while (!got_int && !out_of_stack)
3135 {
3136 if (prog->regstart != NUL)
3137 {
3138 /* Skip until the char we know it must start with. */
3139 s = cstrchr(regline + col, prog->regstart);
3140 if (s == NULL)
3141 {
3142 retval = 0;
3143 break;
3144 }
3145 col = (int)(s - regline);
3146 }
3147
3148 retval = regtry(prog, col);
3149 if (retval > 0)
3150 break;
3151
3152 /* if not currently on the first line, get it again */
3153 if (reglnum != 0)
3154 {
3155 regline = reg_getline((linenr_T)0);
3156 reglnum = 0;
3157 }
 /* No match at the end of the line: give up.  Otherwise try again
 * one character further. */
3158 if (regline[col] == NUL)
3159 break;
3160#ifdef FEAT_MBYTE
3161 if (has_mbyte)
3162 col += (*mb_ptr2len_check)(regline + col);
3163 else
3164#endif
3165 ++col;
3166 }
3167 }
3168
3169 if (out_of_stack)
3170 EMSG(_("E363: pattern caused out-of-stack error"));
3171
3172#ifdef HAVE_TRY_EXCEPT
3173 }
3174 __except(EXCEPTION_EXECUTE_HANDLER)
3175 {
3176 if (GetExceptionCode() == EXCEPTION_STACK_OVERFLOW)
3177 {
3178 RESETSTKOFLW();
3179 EMSG(_("E363: pattern caused out-of-stack error"));
3180 }
3181 else
3182 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3183 retval = 0L;
3184 }
3185#endif
3186
3187theend:
3188 /* Common exit, with or without a match: free the saved line copy. */
3189 vim_free(reg_tofree);
3190#ifdef HAVE_SETJMP_H
3191 mch_endjmp();
3192#endif
3193 return retval;
3194}
3195
3196#ifdef FEAT_SYN_HL
3197static reg_extmatch_T *make_extmatch __ARGS((void));
3198
3199/*
3200 * Create a new extmatch and mark it as referenced once.
3201 */
3202 static reg_extmatch_T *
3203make_extmatch()
3204{
3205 reg_extmatch_T *em;
3206
3207 em = (reg_extmatch_T *)alloc_clear((unsigned)sizeof(reg_extmatch_T));
3208 if (em != NULL)
3209 em->refcnt = 1;
3210 return em;
3211}
3212
3213/*
3214 * Add a reference to an extmatch.
3215 */
3216 reg_extmatch_T *
3217ref_extmatch(em)
3218 reg_extmatch_T *em;
3219{
3220 if (em != NULL)
3221 em->refcnt++;
3222 return em;
3223}
3224
3225/*
3226 * Remove a reference to an extmatch. If there are no references left, free
3227 * the info.
3228 */
3229 void
3230unref_extmatch(em)
3231 reg_extmatch_T *em;
3232{
3233 int i;
3234
3235 if (em != NULL && --em->refcnt <= 0)
3236 {
3237 for (i = 0; i < NSUBEXP; ++i)
3238 vim_free(em->matches[i]);
3239 vim_free(em);
3240 }
3241}
3242#endif
3243
3244/*
3245 * regtry - try match of "prog" with at regline["col"].
3246 * Returns 0 for failure, number of lines contained in the match otherwise.
3247 */
3248 static long
3249regtry(prog, col)
3250 regprog_T *prog;
3251 colnr_T col;
3252{
3253 reginput = regline + col;
3254 need_clear_subexpr = TRUE;
3255#ifdef FEAT_SYN_HL
3256 /* Clear the external match subpointers if necessary. */
3257 if (prog->reghasz == REX_SET)
3258 need_clear_zsubexpr = TRUE;
3259#endif
3260
3261 if (regmatch(prog->program + 1))
3262 {
3263 cleanup_subexpr();
3264 if (REG_MULTI)
3265 {
3266 if (reg_startpos[0].lnum < 0)
3267 {
3268 reg_startpos[0].lnum = 0;
3269 reg_startpos[0].col = col;
3270 }
3271 if (reg_endpos[0].lnum < 0)
3272 {
3273 reg_endpos[0].lnum = reglnum;
3274 reg_endpos[0].col = (int)(reginput - regline);
3275 }
3276 else
3277 /* Use line number of "\ze". */
3278 reglnum = reg_endpos[0].lnum;
3279 }
3280 else
3281 {
3282 if (reg_startp[0] == NULL)
3283 reg_startp[0] = regline + col;
3284 if (reg_endp[0] == NULL)
3285 reg_endp[0] = reginput;
3286 }
3287#ifdef FEAT_SYN_HL
3288 /* Package any found \z(...\) matches for export. Default is none. */
3289 unref_extmatch(re_extmatch_out);
3290 re_extmatch_out = NULL;
3291
3292 if (prog->reghasz == REX_SET)
3293 {
3294 int i;
3295
3296 cleanup_zsubexpr();
3297 re_extmatch_out = make_extmatch();
3298 for (i = 0; i < NSUBEXP; i++)
3299 {
3300 if (REG_MULTI)
3301 {
3302 /* Only accept single line matches. */
3303 if (reg_startzpos[i].lnum >= 0
3304 && reg_endzpos[i].lnum == reg_startzpos[i].lnum)
3305 re_extmatch_out->matches[i] =
3306 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3307 + reg_startzpos[i].col,
3308 reg_endzpos[i].col - reg_startzpos[i].col);
3309 }
3310 else
3311 {
3312 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3313 re_extmatch_out->matches[i] =
3314 vim_strnsave(reg_startzp[i],
3315 (int)(reg_endzp[i] - reg_startzp[i]));
3316 }
3317 }
3318 }
3319#endif
3320 return 1 + reglnum;
3321 }
3322 return 0;
3323}
3324
3325#ifdef FEAT_MBYTE
Bram Moolenaar071d4272004-06-13 20:20:40 +00003326static int reg_prev_class __ARGS((void));
3327
Bram Moolenaar071d4272004-06-13 20:20:40 +00003328/*
3329 * Get class of previous character.
3330 */
3331 static int
3332reg_prev_class()
3333{
3334 if (reginput > regline)
3335 return mb_get_class(reginput - 1
3336 - (*mb_head_off)(regline, reginput - 1));
3337 return -1;
3338}
3339
Bram Moolenaar071d4272004-06-13 20:20:40 +00003340#endif
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00003341#define ADVANCE_REGINPUT() mb_ptr_adv(reginput)
Bram Moolenaar071d4272004-06-13 20:20:40 +00003342
3343/*
3344 * The arguments from BRACE_LIMITS are stored here. They are actually local
3345 * to regmatch(), but they are here to reduce the amount of stack space used
3346 * (it can be called recursively many times).
3347 */
3348static long bl_minval;
3349static long bl_maxval;
3350
3351/*
3352 * regmatch - main matching routine
3353 *
3354 * Conceptually the strategy is simple: Check to see whether the current
3355 * node matches, call self recursively to see whether the rest matches,
3356 * and then act accordingly. In practice we make some effort to avoid
3357 * recursion, in particular by going through "ordinary" nodes (that don't
3358 * need to know whether the rest of the match failed) by a loop instead of
3359 * by recursion.
3360 *
3361 * Returns TRUE when there is a match. Leaves reginput and reglnum just after
3362 * the last matched character.
3363 * Returns FALSE when there is no match. Leaves reginput and reglnum in an
3364 * undefined state!
3365 */
3366 static int
3367regmatch(scan)
3368 char_u *scan; /* Current node. */
3369{
3370 char_u *next; /* Next node. */
3371 int op;
3372 int c;
3373
3374#ifdef HAVE_GETRLIMIT
3375 /* Check if we are running out of stack space. Could be caused by
3376 * recursively calling ourselves. */
3377 if (out_of_stack || mch_stackcheck((char *)&op) == FAIL)
3378 {
3379 out_of_stack = TRUE;
3380 return FALSE;
3381 }
3382#endif
3383
3384 /* Some patterns may take a long time to match, even though they are not
3385 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */
3386 fast_breakcheck();
3387
3388#ifdef DEBUG
3389 if (scan != NULL && regnarrate)
3390 {
3391 mch_errmsg(regprop(scan));
3392 mch_errmsg("(\n");
3393 }
3394#endif
3395 while (scan != NULL)
3396 {
3397 if (got_int || out_of_stack)
3398 return FALSE;
3399#ifdef DEBUG
3400 if (regnarrate)
3401 {
3402 mch_errmsg(regprop(scan));
3403 mch_errmsg("...\n");
3404# ifdef FEAT_SYN_HL
3405 if (re_extmatch_in != NULL)
3406 {
3407 int i;
3408
3409 mch_errmsg(_("External submatches:\n"));
3410 for (i = 0; i < NSUBEXP; i++)
3411 {
3412 mch_errmsg(" \"");
3413 if (re_extmatch_in->matches[i] != NULL)
3414 mch_errmsg(re_extmatch_in->matches[i]);
3415 mch_errmsg("\"\n");
3416 }
3417 }
3418# endif
3419 }
3420#endif
3421 next = regnext(scan);
3422
3423 op = OP(scan);
3424 /* Check for character class with NL added. */
3425 if (WITH_NL(op) && *reginput == NUL && reglnum < reg_maxline)
3426 {
3427 reg_nextline();
3428 }
3429 else if (reg_line_lbr && WITH_NL(op) && *reginput == '\n')
3430 {
3431 ADVANCE_REGINPUT();
3432 }
3433 else
3434 {
3435 if (WITH_NL(op))
3436 op -= ADD_NL;
3437#ifdef FEAT_MBYTE
3438 if (has_mbyte)
3439 c = (*mb_ptr2char)(reginput);
3440 else
3441#endif
3442 c = *reginput;
3443 switch (op)
3444 {
3445 case BOL:
3446 if (reginput != regline)
3447 return FALSE;
3448 break;
3449
3450 case EOL:
3451 if (c != NUL)
3452 return FALSE;
3453 break;
3454
3455 case RE_BOF:
3456 /* Passing -1 to the getline() function provided for the search
3457 * should always return NULL if the current line is the first
3458 * line of the file. */
3459 if (reglnum != 0 || reginput != regline
3460 || (REG_MULTI && reg_getline((linenr_T)-1) != NULL))
3461 return FALSE;
3462 break;
3463
3464 case RE_EOF:
3465 if (reglnum != reg_maxline || c != NUL)
3466 return FALSE;
3467 break;
3468
3469 case CURSOR:
3470 /* Check if the buffer is in a window and compare the
3471 * reg_win->w_cursor position to the match position. */
3472 if (reg_win == NULL
3473 || (reglnum + reg_firstlnum != reg_win->w_cursor.lnum)
3474 || ((colnr_T)(reginput - regline) != reg_win->w_cursor.col))
3475 return FALSE;
3476 break;
3477
3478 case RE_LNUM:
3479 if (!REG_MULTI || !re_num_cmp((long_u)(reglnum + reg_firstlnum),
3480 scan))
3481 return FALSE;
3482 break;
3483
3484 case RE_COL:
3485 if (!re_num_cmp((long_u)(reginput - regline) + 1, scan))
3486 return FALSE;
3487 break;
3488
3489 case RE_VCOL:
3490 if (!re_num_cmp((long_u)win_linetabsize(
3491 reg_win == NULL ? curwin : reg_win,
3492 regline, (colnr_T)(reginput - regline)) + 1, scan))
3493 return FALSE;
3494 break;
3495
3496 case BOW: /* \<word; reginput points to w */
3497 if (c == NUL) /* Can't match at end of line */
3498 return FALSE;
3499#ifdef FEAT_MBYTE
3500 if (has_mbyte)
3501 {
3502 int this_class;
3503
3504 /* Get class of current and previous char (if it exists). */
3505 this_class = mb_get_class(reginput);
3506 if (this_class <= 1)
3507 return FALSE; /* not on a word at all */
3508 if (reg_prev_class() == this_class)
3509 return FALSE; /* previous char is in same word */
3510 }
3511#endif
3512 else
3513 {
3514 if (!vim_iswordc(c)
3515 || (reginput > regline && vim_iswordc(reginput[-1])))
3516 return FALSE;
3517 }
3518 break;
3519
3520 case EOW: /* word\>; reginput points after d */
3521 if (reginput == regline) /* Can't match at start of line */
3522 return FALSE;
3523#ifdef FEAT_MBYTE
3524 if (has_mbyte)
3525 {
3526 int this_class, prev_class;
3527
3528 /* Get class of current and previous char (if it exists). */
3529 this_class = mb_get_class(reginput);
3530 prev_class = reg_prev_class();
3531 if (this_class == prev_class)
3532 return FALSE;
3533 if (prev_class == 0 || prev_class == 1)
3534 return FALSE;
3535 }
3536 else
3537#endif
3538 {
3539 if (!vim_iswordc(reginput[-1]))
3540 return FALSE;
3541 if (reginput[0] != NUL && vim_iswordc(c))
3542 return FALSE;
3543 }
3544 break; /* Matched with EOW */
3545
3546 case ANY:
3547 if (c == NUL)
3548 return FALSE;
3549 ADVANCE_REGINPUT();
3550 break;
3551
3552 case IDENT:
3553 if (!vim_isIDc(c))
3554 return FALSE;
3555 ADVANCE_REGINPUT();
3556 break;
3557
3558 case SIDENT:
3559 if (VIM_ISDIGIT(*reginput) || !vim_isIDc(c))
3560 return FALSE;
3561 ADVANCE_REGINPUT();
3562 break;
3563
3564 case KWORD:
3565 if (!vim_iswordp(reginput))
3566 return FALSE;
3567 ADVANCE_REGINPUT();
3568 break;
3569
3570 case SKWORD:
3571 if (VIM_ISDIGIT(*reginput) || !vim_iswordp(reginput))
3572 return FALSE;
3573 ADVANCE_REGINPUT();
3574 break;
3575
3576 case FNAME:
3577 if (!vim_isfilec(c))
3578 return FALSE;
3579 ADVANCE_REGINPUT();
3580 break;
3581
3582 case SFNAME:
3583 if (VIM_ISDIGIT(*reginput) || !vim_isfilec(c))
3584 return FALSE;
3585 ADVANCE_REGINPUT();
3586 break;
3587
3588 case PRINT:
3589 if (ptr2cells(reginput) != 1)
3590 return FALSE;
3591 ADVANCE_REGINPUT();
3592 break;
3593
3594 case SPRINT:
3595 if (VIM_ISDIGIT(*reginput) || ptr2cells(reginput) != 1)
3596 return FALSE;
3597 ADVANCE_REGINPUT();
3598 break;
3599
3600 case WHITE:
3601 if (!vim_iswhite(c))
3602 return FALSE;
3603 ADVANCE_REGINPUT();
3604 break;
3605
3606 case NWHITE:
3607 if (c == NUL || vim_iswhite(c))
3608 return FALSE;
3609 ADVANCE_REGINPUT();
3610 break;
3611
3612 case DIGIT:
3613 if (!ri_digit(c))
3614 return FALSE;
3615 ADVANCE_REGINPUT();
3616 break;
3617
3618 case NDIGIT:
3619 if (c == NUL || ri_digit(c))
3620 return FALSE;
3621 ADVANCE_REGINPUT();
3622 break;
3623
3624 case HEX:
3625 if (!ri_hex(c))
3626 return FALSE;
3627 ADVANCE_REGINPUT();
3628 break;
3629
3630 case NHEX:
3631 if (c == NUL || ri_hex(c))
3632 return FALSE;
3633 ADVANCE_REGINPUT();
3634 break;
3635
3636 case OCTAL:
3637 if (!ri_octal(c))
3638 return FALSE;
3639 ADVANCE_REGINPUT();
3640 break;
3641
3642 case NOCTAL:
3643 if (c == NUL || ri_octal(c))
3644 return FALSE;
3645 ADVANCE_REGINPUT();
3646 break;
3647
3648 case WORD:
3649 if (!ri_word(c))
3650 return FALSE;
3651 ADVANCE_REGINPUT();
3652 break;
3653
3654 case NWORD:
3655 if (c == NUL || ri_word(c))
3656 return FALSE;
3657 ADVANCE_REGINPUT();
3658 break;
3659
3660 case HEAD:
3661 if (!ri_head(c))
3662 return FALSE;
3663 ADVANCE_REGINPUT();
3664 break;
3665
3666 case NHEAD:
3667 if (c == NUL || ri_head(c))
3668 return FALSE;
3669 ADVANCE_REGINPUT();
3670 break;
3671
3672 case ALPHA:
3673 if (!ri_alpha(c))
3674 return FALSE;
3675 ADVANCE_REGINPUT();
3676 break;
3677
3678 case NALPHA:
3679 if (c == NUL || ri_alpha(c))
3680 return FALSE;
3681 ADVANCE_REGINPUT();
3682 break;
3683
3684 case LOWER:
3685 if (!ri_lower(c))
3686 return FALSE;
3687 ADVANCE_REGINPUT();
3688 break;
3689
3690 case NLOWER:
3691 if (c == NUL || ri_lower(c))
3692 return FALSE;
3693 ADVANCE_REGINPUT();
3694 break;
3695
3696 case UPPER:
3697 if (!ri_upper(c))
3698 return FALSE;
3699 ADVANCE_REGINPUT();
3700 break;
3701
3702 case NUPPER:
3703 if (c == NUL || ri_upper(c))
3704 return FALSE;
3705 ADVANCE_REGINPUT();
3706 break;
3707
3708 case EXACTLY:
3709 {
3710 int len;
3711 char_u *opnd;
3712
3713 opnd = OPERAND(scan);
3714 /* Inline the first byte, for speed. */
3715 if (*opnd != *reginput
3716 && (!ireg_ic || (
3717#ifdef FEAT_MBYTE
3718 !enc_utf8 &&
3719#endif
3720 TOLOWER_LOC(*opnd) != TOLOWER_LOC(*reginput))))
3721 return FALSE;
3722 if (*opnd == NUL)
3723 {
3724 /* match empty string always works; happens when "~" is
3725 * empty. */
3726 }
3727 else if (opnd[1] == NUL
3728#ifdef FEAT_MBYTE
3729 && !(enc_utf8 && ireg_ic)
3730#endif
3731 )
3732 ++reginput; /* matched a single char */
3733 else
3734 {
3735 len = (int)STRLEN(opnd);
3736 /* Need to match first byte again for multi-byte. */
3737 if (cstrncmp(opnd, reginput, &len) != 0)
3738 return FALSE;
3739#ifdef FEAT_MBYTE
3740 /* Check for following composing character. */
3741 if (enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len))
3742 {
3743 /* raaron: This code makes a composing character get
3744 * ignored, which is the correct behavior (sometimes)
3745 * for voweled Hebrew texts. */
3746 if (!ireg_icombine)
3747 return FALSE;
3748 }
3749 else
3750#endif
3751 reginput += len;
3752 }
3753 }
3754 break;
3755
3756 case ANYOF:
3757 case ANYBUT:
3758 if (c == NUL)
3759 return FALSE;
3760 if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3761 return FALSE;
3762 ADVANCE_REGINPUT();
3763 break;
3764
3765#ifdef FEAT_MBYTE
3766 case MULTIBYTECODE:
3767 if (has_mbyte)
3768 {
3769 int i, len;
3770 char_u *opnd;
3771
3772 opnd = OPERAND(scan);
3773 /* Safety check (just in case 'encoding' was changed since
3774 * compiling the program). */
3775 if ((len = (*mb_ptr2len_check)(opnd)) < 2)
3776 return FALSE;
3777 for (i = 0; i < len; ++i)
3778 if (opnd[i] != reginput[i])
3779 return FALSE;
3780 reginput += len;
3781 }
3782 else
3783 return FALSE;
3784 break;
3785#endif
3786
3787 case NOTHING:
3788 break;
3789
3790 case BACK:
3791 break;
3792
3793 case MOPEN + 0: /* Match start: \zs */
3794 case MOPEN + 1: /* \( */
3795 case MOPEN + 2:
3796 case MOPEN + 3:
3797 case MOPEN + 4:
3798 case MOPEN + 5:
3799 case MOPEN + 6:
3800 case MOPEN + 7:
3801 case MOPEN + 8:
3802 case MOPEN + 9:
3803 {
3804 int no;
3805 save_se_T save;
3806
3807 no = op - MOPEN;
3808 cleanup_subexpr();
3809 save_se(&save, &reg_startpos[no], &reg_startp[no]);
3810
3811 if (regmatch(next))
3812 return TRUE;
3813
3814 restore_se(&save, &reg_startpos[no], &reg_startp[no]);
3815 return FALSE;
3816 }
3817 /* break; Not Reached */
3818
3819 case NOPEN: /* \%( */
3820 case NCLOSE: /* \) after \%( */
3821 if (regmatch(next))
3822 return TRUE;
3823 return FALSE;
3824 /* break; Not Reached */
3825
3826#ifdef FEAT_SYN_HL
3827 case ZOPEN + 1:
3828 case ZOPEN + 2:
3829 case ZOPEN + 3:
3830 case ZOPEN + 4:
3831 case ZOPEN + 5:
3832 case ZOPEN + 6:
3833 case ZOPEN + 7:
3834 case ZOPEN + 8:
3835 case ZOPEN + 9:
3836 {
3837 int no;
3838 save_se_T save;
3839
3840 no = op - ZOPEN;
3841 cleanup_zsubexpr();
3842 save_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3843
3844 if (regmatch(next))
3845 return TRUE;
3846
3847 restore_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3848 return FALSE;
3849 }
3850 /* break; Not Reached */
3851#endif
3852
3853 case MCLOSE + 0: /* Match end: \ze */
3854 case MCLOSE + 1: /* \) */
3855 case MCLOSE + 2:
3856 case MCLOSE + 3:
3857 case MCLOSE + 4:
3858 case MCLOSE + 5:
3859 case MCLOSE + 6:
3860 case MCLOSE + 7:
3861 case MCLOSE + 8:
3862 case MCLOSE + 9:
3863 {
3864 int no;
3865 save_se_T save;
3866
3867 no = op - MCLOSE;
3868 cleanup_subexpr();
3869 save_se(&save, &reg_endpos[no], &reg_endp[no]);
3870
3871 if (regmatch(next))
3872 return TRUE;
3873
3874 restore_se(&save, &reg_endpos[no], &reg_endp[no]);
3875 return FALSE;
3876 }
3877 /* break; Not Reached */
3878
3879#ifdef FEAT_SYN_HL
3880 case ZCLOSE + 1: /* \) after \z( */
3881 case ZCLOSE + 2:
3882 case ZCLOSE + 3:
3883 case ZCLOSE + 4:
3884 case ZCLOSE + 5:
3885 case ZCLOSE + 6:
3886 case ZCLOSE + 7:
3887 case ZCLOSE + 8:
3888 case ZCLOSE + 9:
3889 {
3890 int no;
3891 save_se_T save;
3892
3893 no = op - ZCLOSE;
3894 cleanup_zsubexpr();
3895 save_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3896
3897 if (regmatch(next))
3898 return TRUE;
3899
3900 restore_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3901 return FALSE;
3902 }
3903 /* break; Not Reached */
3904#endif
3905
3906 case BACKREF + 1:
3907 case BACKREF + 2:
3908 case BACKREF + 3:
3909 case BACKREF + 4:
3910 case BACKREF + 5:
3911 case BACKREF + 6:
3912 case BACKREF + 7:
3913 case BACKREF + 8:
3914 case BACKREF + 9:
3915 {
3916 int no;
3917 int len;
3918 linenr_T clnum;
3919 colnr_T ccol;
3920 char_u *p;
3921
3922 no = op - BACKREF;
3923 cleanup_subexpr();
3924 if (!REG_MULTI) /* Single-line regexp */
3925 {
3926 if (reg_endp[no] == NULL)
3927 {
3928 /* Backref was not set: Match an empty string. */
3929 len = 0;
3930 }
3931 else
3932 {
3933 /* Compare current input with back-ref in the same
3934 * line. */
3935 len = (int)(reg_endp[no] - reg_startp[no]);
3936 if (cstrncmp(reg_startp[no], reginput, &len) != 0)
3937 return FALSE;
3938 }
3939 }
3940 else /* Multi-line regexp */
3941 {
3942 if (reg_endpos[no].lnum < 0)
3943 {
3944 /* Backref was not set: Match an empty string. */
3945 len = 0;
3946 }
3947 else
3948 {
3949 if (reg_startpos[no].lnum == reglnum
3950 && reg_endpos[no].lnum == reglnum)
3951 {
3952 /* Compare back-ref within the current line. */
3953 len = reg_endpos[no].col - reg_startpos[no].col;
3954 if (cstrncmp(regline + reg_startpos[no].col,
3955 reginput, &len) != 0)
3956 return FALSE;
3957 }
3958 else
3959 {
3960 /* Messy situation: Need to compare between two
3961 * lines. */
3962 ccol = reg_startpos[no].col;
3963 clnum = reg_startpos[no].lnum;
3964 for (;;)
3965 {
3966 /* Since getting one line may invalidate
3967 * the other, need to make copy. Slow! */
3968 if (regline != reg_tofree)
3969 {
3970 len = (int)STRLEN(regline);
3971 if (reg_tofree == NULL
3972 || len >= (int)reg_tofreelen)
3973 {
3974 len += 50; /* get some extra */
3975 vim_free(reg_tofree);
3976 reg_tofree = alloc(len);
3977 if (reg_tofree == NULL)
3978 return FALSE; /* out of memory! */
3979 reg_tofreelen = len;
3980 }
3981 STRCPY(reg_tofree, regline);
3982 reginput = reg_tofree
3983 + (reginput - regline);
3984 regline = reg_tofree;
3985 }
3986
3987 /* Get the line to compare with. */
3988 p = reg_getline(clnum);
3989 if (clnum == reg_endpos[no].lnum)
3990 len = reg_endpos[no].col - ccol;
3991 else
3992 len = (int)STRLEN(p + ccol);
3993
3994 if (cstrncmp(p + ccol, reginput, &len) != 0)
3995 return FALSE; /* doesn't match */
3996 if (clnum == reg_endpos[no].lnum)
3997 break; /* match and at end! */
3998 if (reglnum == reg_maxline)
3999 return FALSE; /* text too short */
4000
4001 /* Advance to next line. */
4002 reg_nextline();
4003 ++clnum;
4004 ccol = 0;
4005 if (got_int || out_of_stack)
4006 return FALSE;
4007 }
4008
4009 /* found a match! Note that regline may now point
4010 * to a copy of the line, that should not matter. */
4011 }
4012 }
4013 }
4014
4015 /* Matched the backref, skip over it. */
4016 reginput += len;
4017 }
4018 break;
4019
4020#ifdef FEAT_SYN_HL
4021 case ZREF + 1:
4022 case ZREF + 2:
4023 case ZREF + 3:
4024 case ZREF + 4:
4025 case ZREF + 5:
4026 case ZREF + 6:
4027 case ZREF + 7:
4028 case ZREF + 8:
4029 case ZREF + 9:
4030 {
4031 int no;
4032 int len;
4033
4034 cleanup_zsubexpr();
4035 no = op - ZREF;
4036 if (re_extmatch_in != NULL
4037 && re_extmatch_in->matches[no] != NULL)
4038 {
4039 len = (int)STRLEN(re_extmatch_in->matches[no]);
4040 if (cstrncmp(re_extmatch_in->matches[no],
4041 reginput, &len) != 0)
4042 return FALSE;
4043 reginput += len;
4044 }
4045 else
4046 {
4047 /* Backref was not set: Match an empty string. */
4048 }
4049 }
4050 break;
4051#endif
4052
4053 case BRANCH:
4054 {
4055 if (OP(next) != BRANCH) /* No choice. */
4056 next = OPERAND(scan); /* Avoid recursion. */
4057 else
4058 {
4059 regsave_T save;
4060
4061 do
4062 {
4063 reg_save(&save);
4064 if (regmatch(OPERAND(scan)))
4065 return TRUE;
4066 reg_restore(&save);
4067 scan = regnext(scan);
4068 } while (scan != NULL && OP(scan) == BRANCH);
4069 return FALSE;
4070 /* NOTREACHED */
4071 }
4072 }
4073 break;
4074
4075 case BRACE_LIMITS:
4076 {
4077 int no;
4078
4079 if (OP(next) == BRACE_SIMPLE)
4080 {
4081 bl_minval = OPERAND_MIN(scan);
4082 bl_maxval = OPERAND_MAX(scan);
4083 }
4084 else if (OP(next) >= BRACE_COMPLEX
4085 && OP(next) < BRACE_COMPLEX + 10)
4086 {
4087 no = OP(next) - BRACE_COMPLEX;
4088 brace_min[no] = OPERAND_MIN(scan);
4089 brace_max[no] = OPERAND_MAX(scan);
4090 brace_count[no] = 0;
4091 }
4092 else
4093 {
4094 EMSG(_(e_internal)); /* Shouldn't happen */
4095 return FALSE;
4096 }
4097 }
4098 break;
4099
4100 case BRACE_COMPLEX + 0:
4101 case BRACE_COMPLEX + 1:
4102 case BRACE_COMPLEX + 2:
4103 case BRACE_COMPLEX + 3:
4104 case BRACE_COMPLEX + 4:
4105 case BRACE_COMPLEX + 5:
4106 case BRACE_COMPLEX + 6:
4107 case BRACE_COMPLEX + 7:
4108 case BRACE_COMPLEX + 8:
4109 case BRACE_COMPLEX + 9:
4110 {
4111 int no;
4112 regsave_T save;
4113
4114 no = op - BRACE_COMPLEX;
4115 ++brace_count[no];
4116
4117 /* If not matched enough times yet, try one more */
4118 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4119 ? brace_min[no] : brace_max[no]))
4120 {
4121 reg_save(&save);
4122 if (regmatch(OPERAND(scan)))
4123 return TRUE;
4124 reg_restore(&save);
4125 --brace_count[no]; /* failed, decrement match count */
4126 return FALSE;
4127 }
4128
4129 /* If matched enough times, may try matching some more */
4130 if (brace_min[no] <= brace_max[no])
4131 {
4132 /* Range is the normal way around, use longest match */
4133 if (brace_count[no] <= brace_max[no])
4134 {
4135 reg_save(&save);
4136 if (regmatch(OPERAND(scan)))
4137 return TRUE; /* matched some more times */
4138 reg_restore(&save);
4139 --brace_count[no]; /* matched just enough times */
4140 /* continue with the items after \{} */
4141 }
4142 }
4143 else
4144 {
4145 /* Range is backwards, use shortest match first */
4146 if (brace_count[no] <= brace_min[no])
4147 {
4148 reg_save(&save);
4149 if (regmatch(next))
4150 return TRUE;
4151 reg_restore(&save);
4152 next = OPERAND(scan);
4153 /* must try to match one more item */
4154 }
4155 }
4156 }
4157 break;
4158
4159 case BRACE_SIMPLE:
4160 case STAR:
4161 case PLUS:
4162 {
4163 int nextb; /* next byte */
4164 int nextb_ic; /* next byte reverse case */
4165 long count;
4166 regsave_T save;
4167 long minval;
4168 long maxval;
4169
4170 /*
4171 * Lookahead to avoid useless match attempts when we know
4172 * what character comes next.
4173 */
4174 if (OP(next) == EXACTLY)
4175 {
4176 nextb = *OPERAND(next);
4177 if (ireg_ic)
4178 {
4179 if (isupper(nextb))
4180 nextb_ic = TOLOWER_LOC(nextb);
4181 else
4182 nextb_ic = TOUPPER_LOC(nextb);
4183 }
4184 else
4185 nextb_ic = nextb;
4186 }
4187 else
4188 {
4189 nextb = NUL;
4190 nextb_ic = NUL;
4191 }
4192 if (op != BRACE_SIMPLE)
4193 {
4194 minval = (op == STAR) ? 0 : 1;
4195 maxval = MAX_LIMIT;
4196 }
4197 else
4198 {
4199 minval = bl_minval;
4200 maxval = bl_maxval;
4201 }
4202
4203 /*
4204 * When maxval > minval, try matching as much as possible, up
4205 * to maxval. When maxval < minval, try matching at least the
4206 * minimal number (since the range is backwards, that's also
4207 * maxval!).
4208 */
4209 count = regrepeat(OPERAND(scan), maxval);
4210 if (got_int)
4211 return FALSE;
4212 if (minval <= maxval)
4213 {
4214 /* Range is the normal way around, use longest match */
4215 while (count >= minval)
4216 {
4217 /* If it could match, try it. */
4218 if (nextb == NUL || *reginput == nextb
4219 || *reginput == nextb_ic)
4220 {
4221 reg_save(&save);
4222 if (regmatch(next))
4223 return TRUE;
4224 reg_restore(&save);
4225 }
4226 /* Couldn't or didn't match -- back up one char. */
4227 if (--count < minval)
4228 break;
4229 if (reginput == regline)
4230 {
4231 /* backup to last char of previous line */
4232 --reglnum;
4233 regline = reg_getline(reglnum);
4234 /* Just in case regrepeat() didn't count right. */
4235 if (regline == NULL)
4236 return FALSE;
4237 reginput = regline + STRLEN(regline);
4238 fast_breakcheck();
4239 if (got_int || out_of_stack)
4240 return FALSE;
4241 }
4242 else
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004243 mb_ptr_back(regline, reginput);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004244 }
4245 }
4246 else
4247 {
4248 /* Range is backwards, use shortest match first.
4249 * Careful: maxval and minval are exchanged! */
4250 if (count < maxval)
4251 return FALSE;
4252 for (;;)
4253 {
4254 /* If it could work, try it. */
4255 if (nextb == NUL || *reginput == nextb
4256 || *reginput == nextb_ic)
4257 {
4258 reg_save(&save);
4259 if (regmatch(next))
4260 return TRUE;
4261 reg_restore(&save);
4262 }
4263 /* Couldn't or didn't match: try advancing one char. */
4264 if (count == minval
4265 || regrepeat(OPERAND(scan), 1L) == 0)
4266 break;
4267 ++count;
4268 if (got_int || out_of_stack)
4269 return FALSE;
4270 }
4271 }
4272 return FALSE;
4273 }
4274 /* break; Not Reached */
4275
4276 case NOMATCH:
4277 {
4278 regsave_T save;
4279
4280 /* If the operand matches, we fail. Otherwise backup and
4281 * continue with the next item. */
4282 reg_save(&save);
4283 if (regmatch(OPERAND(scan)))
4284 return FALSE;
4285 reg_restore(&save);
4286 }
4287 break;
4288
4289 case MATCH:
4290 case SUBPAT:
4291 {
4292 regsave_T save;
4293
4294 /* If the operand doesn't match, we fail. Otherwise backup
4295 * and continue with the next item. */
4296 reg_save(&save);
4297 if (!regmatch(OPERAND(scan)))
4298 return FALSE;
4299 if (op == MATCH) /* zero-width */
4300 reg_restore(&save);
4301 }
4302 break;
4303
4304 case BEHIND:
4305 case NOBEHIND:
4306 {
4307 regsave_T save_after, save_start;
4308 regsave_T save_behind_pos;
4309 int needmatch = (op == BEHIND);
4310
4311 /*
4312 * Look back in the input to see whether the operand matches or not. This
4313 * must be done at every position in the input and checking if
4314 * the match ends at the current position.
4315 * First check if the next item matches, that's probably
4316 * faster.
4317 */
4318 reg_save(&save_start);
4319 if (regmatch(next))
4320 {
4321 /* save the position after the found match for next */
4322 reg_save(&save_after);
4323
4324 /* start looking for a match with operand at the current
4325 * position. Go back one character until we find the
4326 * result, hitting the start of the line or the previous
4327 * line (for multi-line matching).
4328 * Set behind_pos to where the match should end, BHPOS
4329 * will match it. */
4330 save_behind_pos = behind_pos;
4331 behind_pos = save_start;
4332 for (;;)
4333 {
4334 reg_restore(&save_start);
4335 if (regmatch(OPERAND(scan))
4336 && reg_save_equal(&behind_pos))
4337 {
4338 behind_pos = save_behind_pos;
4339 /* found a match that ends where "next" started */
4340 if (needmatch)
4341 {
4342 reg_restore(&save_after);
4343 return TRUE;
4344 }
4345 return FALSE;
4346 }
4347 /*
4348 * No match: Go back one character. May go to
4349 * previous line once.
4350 */
4351 if (REG_MULTI)
4352 {
4353 if (save_start.rs_u.pos.col == 0)
4354 {
4355 if (save_start.rs_u.pos.lnum
4356 < behind_pos.rs_u.pos.lnum
4357 || reg_getline(
4358 --save_start.rs_u.pos.lnum) == NULL)
4359 break;
4360 reg_restore(&save_start);
4361 save_start.rs_u.pos.col =
4362 (colnr_T)STRLEN(regline);
4363 }
4364 else
4365 --save_start.rs_u.pos.col;
4366 }
4367 else
4368 {
4369 if (save_start.rs_u.ptr == regline)
4370 break;
4371 --save_start.rs_u.ptr;
4372 }
4373 }
4374
4375 /* NOBEHIND succeeds when no match was found */
4376 behind_pos = save_behind_pos;
4377 if (!needmatch)
4378 {
4379 reg_restore(&save_after);
4380 return TRUE;
4381 }
4382 }
4383 return FALSE;
4384 }
4385
4386 case BHPOS:
4387 if (REG_MULTI)
4388 {
4389 if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline)
4390 || behind_pos.rs_u.pos.lnum != reglnum)
4391 return FALSE;
4392 }
4393 else if (behind_pos.rs_u.ptr != reginput)
4394 return FALSE;
4395 break;
4396
4397 case NEWL:
4398 if ((c != NUL || reglnum == reg_maxline)
4399 && (c != '\n' || !reg_line_lbr))
4400 return FALSE;
4401 if (reg_line_lbr)
4402 ADVANCE_REGINPUT();
4403 else
4404 reg_nextline();
4405 break;
4406
4407 case END:
4408 return TRUE; /* Success! */
4409
4410 default:
4411 EMSG(_(e_re_corr));
4412#ifdef DEBUG
4413 printf("Illegal op code %d\n", op);
4414#endif
4415 return FALSE;
4416 }
4417 }
4418
4419 scan = next;
4420 }
4421
4422 /*
4423 * We get here only if there's trouble -- normally "case END" is the
4424 * terminating point.
4425 */
4426 EMSG(_(e_re_corr));
4427#ifdef DEBUG
4428 printf("Premature EOL\n");
4429#endif
4430 return FALSE;
4431}
4432
Bram Moolenaar071d4272004-06-13 20:20:40 +00004433/*
4434 * regrepeat - repeatedly match something simple, return how many.
4435 * Advances reginput (and reglnum) to just after the matched chars.
 *
 * "p" is the program node to repeat: OP(p) selects the kind of item and
 * OPERAND(p) supplies its operand where one is needed.
 * "maxcount" is an upper bound on the number of items that are counted.
 * Matching starts at reginput, which is copied into a local for the loops
 * and written back just before returning.
 * In the character-class loops the "+ ADD_NL" opcodes (tested with
 * WITH_NL()) also count a line end as an item: the scan moves on with
 * reg_nextline() unless reglnum equals reg_maxline, and a '\n' byte is
 * accepted when reg_line_lbr is set.
 * The loops that move to another line stop early when got_int is set.
 * An opcode that is not handled here gives the e_re_corr error message.
4436 */
4437 static int
4438regrepeat(p, maxcount)
4439 char_u *p;
4440 long maxcount; /* maximum number of matches allowed */
4441{
4442 long count = 0;
4443 char_u *scan;
4444 char_u *opnd;
4445 int mask;
4446 int testval = 0;
4447
4448 scan = reginput; /* Make local copy of reginput for speed. */
4449 opnd = OPERAND(p);
4450 switch (OP(p))
4451 {
4452 case ANY:
4453 case ANY + ADD_NL:
4454 while (count < maxcount)
4455 {
4456 /* Matching anything means we continue until end-of-line (or
4457 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
4458 while (*scan != NUL && count < maxcount)
4459 {
4460 ++count;
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004461 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004462 }
4463 if (!WITH_NL(OP(p)) || reglnum == reg_maxline || count == maxcount)
4464 break;
4465 ++count; /* count the line-break */
4466 reg_nextline();
4467 scan = reginput;
4468 if (got_int)
4469 break;
4470 }
4471 break;
4472
4473 case IDENT:
4474 case IDENT + ADD_NL:
4475 testval = TRUE;
4476 /*FALLTHROUGH*/
4477 case SIDENT:
4478 case SIDENT + ADD_NL:
/* With testval left at zero (SIDENT) a character for which VIM_ISDIGIT()
 * holds is rejected; with testval set (IDENT) it is accepted. */
4479 while (count < maxcount)
4480 {
4481 if (vim_isIDc(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4482 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004483 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004484 }
4485 else if (*scan == NUL)
4486 {
4487 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4488 break;
4489 reg_nextline();
4490 scan = reginput;
4491 if (got_int)
4492 break;
4493 }
4494 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4495 ++scan;
4496 else
4497 break;
4498 ++count;
4499 }
4500 break;
4501
4502 case KWORD:
4503 case KWORD + ADD_NL:
4504 testval = TRUE;
4505 /*FALLTHROUGH*/
4506 case SKWORD:
4507 case SKWORD + ADD_NL:
/* Same scheme as above: testval zero (SKWORD) rejects a leading digit. */
4508 while (count < maxcount)
4509 {
4510 if (vim_iswordp(scan) && (testval || !VIM_ISDIGIT(*scan)))
4511 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004512 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004513 }
4514 else if (*scan == NUL)
4515 {
4516 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4517 break;
4518 reg_nextline();
4519 scan = reginput;
4520 if (got_int)
4521 break;
4522 }
4523 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4524 ++scan;
4525 else
4526 break;
4527 ++count;
4528 }
4529 break;
4530
4531 case FNAME:
4532 case FNAME + ADD_NL:
4533 testval = TRUE;
4534 /*FALLTHROUGH*/
4535 case SFNAME:
4536 case SFNAME + ADD_NL:
/* Same scheme as above: testval zero (SFNAME) rejects a digit. */
4537 while (count < maxcount)
4538 {
4539 if (vim_isfilec(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4540 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004541 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004542 }
4543 else if (*scan == NUL)
4544 {
4545 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4546 break;
4547 reg_nextline();
4548 scan = reginput;
4549 if (got_int)
4550 break;
4551 }
4552 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4553 ++scan;
4554 else
4555 break;
4556 ++count;
4557 }
4558 break;
4559
4560 case PRINT:
4561 case PRINT + ADD_NL:
4562 testval = TRUE;
4563 /*FALLTHROUGH*/
4564 case SPRINT:
4565 case SPRINT + ADD_NL:
/* Here the end-of-line test comes first; a character is accepted when
 * ptr2cells() reports 1 for it (and, for SPRINT, it is not a digit). */
4566 while (count < maxcount)
4567 {
4568 if (*scan == NUL)
4569 {
4570 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4571 break;
4572 reg_nextline();
4573 scan = reginput;
4574 if (got_int)
4575 break;
4576 }
4577 else if (ptr2cells(scan) == 1 && (testval || !VIM_ISDIGIT(*scan)))
4578 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004579 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004580 }
4581 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4582 ++scan;
4583 else
4584 break;
4585 ++count;
4586 }
4587 break;
4588
4589 case WHITE:
4590 case WHITE + ADD_NL:
4591 testval = mask = RI_WHITE;
/* Shared loop for the class_tab[] based items.  "mask" selects the class
 * bit to look at; "testval" equals "mask" for the plain class and is left
 * at 0 for the negated class, so one comparison serves both. */
4592do_class:
4593 while (count < maxcount)
4594 {
4595#ifdef FEAT_MBYTE
4596 int l;
4597#endif
4598 if (*scan == NUL)
4599 {
4600 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4601 break;
4602 reg_nextline();
4603 scan = reginput;
4604 if (got_int)
4605 break;
4606 }
4607#ifdef FEAT_MBYTE
4608 else if (has_mbyte && (l = (*mb_ptr2len_check)(scan)) > 1)
4609 {
/* A multi-byte character is only counted for the negated classes. */
4610 if (testval != 0)
4611 break;
4612 scan += l;
4613 }
4614#endif
4615 else if ((class_tab[*scan] & mask) == testval)
4616 ++scan;
4617 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4618 ++scan;
4619 else
4620 break;
4621 ++count;
4622 }
4623 break;
4624
/* Each case below only sets "mask" (and "testval" for the non-negated
 * class) and then jumps into the shared loop at do_class. */
4625 case NWHITE:
4626 case NWHITE + ADD_NL:
4627 mask = RI_WHITE;
4628 goto do_class;
4629 case DIGIT:
4630 case DIGIT + ADD_NL:
4631 testval = mask = RI_DIGIT;
4632 goto do_class;
4633 case NDIGIT:
4634 case NDIGIT + ADD_NL:
4635 mask = RI_DIGIT;
4636 goto do_class;
4637 case HEX:
4638 case HEX + ADD_NL:
4639 testval = mask = RI_HEX;
4640 goto do_class;
4641 case NHEX:
4642 case NHEX + ADD_NL:
4643 mask = RI_HEX;
4644 goto do_class;
4645 case OCTAL:
4646 case OCTAL + ADD_NL:
4647 testval = mask = RI_OCTAL;
4648 goto do_class;
4649 case NOCTAL:
4650 case NOCTAL + ADD_NL:
4651 mask = RI_OCTAL;
4652 goto do_class;
4653 case WORD:
4654 case WORD + ADD_NL:
4655 testval = mask = RI_WORD;
4656 goto do_class;
4657 case NWORD:
4658 case NWORD + ADD_NL:
4659 mask = RI_WORD;
4660 goto do_class;
4661 case HEAD:
4662 case HEAD + ADD_NL:
4663 testval = mask = RI_HEAD;
4664 goto do_class;
4665 case NHEAD:
4666 case NHEAD + ADD_NL:
4667 mask = RI_HEAD;
4668 goto do_class;
4669 case ALPHA:
4670 case ALPHA + ADD_NL:
4671 testval = mask = RI_ALPHA;
4672 goto do_class;
4673 case NALPHA:
4674 case NALPHA + ADD_NL:
4675 mask = RI_ALPHA;
4676 goto do_class;
4677 case LOWER:
4678 case LOWER + ADD_NL:
4679 testval = mask = RI_LOWER;
4680 goto do_class;
4681 case NLOWER:
4682 case NLOWER + ADD_NL:
4683 mask = RI_LOWER;
4684 goto do_class;
4685 case UPPER:
4686 case UPPER + ADD_NL:
4687 testval = mask = RI_UPPER;
4688 goto do_class;
4689 case NUPPER:
4690 case NUPPER + ADD_NL:
4691 mask = RI_UPPER;
4692 goto do_class;
4693
4694 case EXACTLY:
4695 {
4696 int cu, cl;
4697
4698 /* This doesn't do a multi-byte character, because a MULTIBYTECODE
4699 * would have been used for it. */
4700 if (ireg_ic)
4701 {
/* Ignoring case: accept either case variant of the operand's first byte. */
4702 cu = TOUPPER_LOC(*opnd);
4703 cl = TOLOWER_LOC(*opnd);
4704 while (count < maxcount && (*scan == cu || *scan == cl))
4705 {
4706 count++;
4707 scan++;
4708 }
4709 }
4710 else
4711 {
4712 cu = *opnd;
4713 while (count < maxcount && *scan == cu)
4714 {
4715 count++;
4716 scan++;
4717 }
4718 }
4719 break;
4720 }
4721
4722#ifdef FEAT_MBYTE
4723 case MULTIBYTECODE:
4724 {
4725 int i, len, cf = 0;
4726
4727 /* Safety check (just in case 'encoding' was changed since
4728 * compiling the program). */
4729 if ((len = (*mb_ptr2len_check)(opnd)) > 1)
4730 {
4731 if (ireg_ic && enc_utf8)
4732 cf = utf_fold(utf_ptr2char(opnd));
4733 while (count < maxcount)
4734 {
4735 for (i = 0; i < len; ++i)
4736 if (opnd[i] != scan[i])
4737 break;
/* When the bytes differ this is still a match if case is ignored in
 * UTF-8 and both characters fold to the same value. */
4738 if (i < len && (!ireg_ic || !enc_utf8
4739 || utf_fold(utf_ptr2char(scan)) != cf))
4740 break;
4741 scan += len;
4742 ++count;
4743 }
4744 }
4745 }
4746 break;
4747#endif
4748
4749 case ANYOF:
4750 case ANYOF + ADD_NL:
4751 testval = TRUE;
4752 /*FALLTHROUGH*/
4753
4754 case ANYBUT:
4755 case ANYBUT + ADD_NL:
/* testval set (ANYOF): stop at a character that is not in the operand.
 * testval zero (ANYBUT): stop at a character that is in the operand. */
4756 while (count < maxcount)
4757 {
4758#ifdef FEAT_MBYTE
4759 int len;
4760#endif
4761 if (*scan == NUL)
4762 {
4763 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4764 break;
4765 reg_nextline();
4766 scan = reginput;
4767 if (got_int)
4768 break;
4769 }
4770 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4771 ++scan;
4772#ifdef FEAT_MBYTE
4773 else if (has_mbyte && (len = (*mb_ptr2len_check)(scan)) > 1)
4774 {
4775 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
4776 break;
4777 scan += len;
4778 }
4779#endif
4780 else
4781 {
4782 if ((cstrchr(opnd, *scan) == NULL) == testval)
4783 break;
4784 ++scan;
4785 }
4786 ++count;
4787 }
4788 break;
4789
4790 case NEWL:
/* Counts line breaks: either the end of a line that is not the last one,
 * or a '\n' byte when reg_line_lbr is set. */
4791 while (count < maxcount
4792 && ((*scan == NUL && reglnum < reg_maxline)
4793 || (*scan == '\n' && reg_line_lbr)))
4794 {
4795 count++;
4796 if (reg_line_lbr)
4797 ADVANCE_REGINPUT();
4798 else
4799 reg_nextline();
4800 scan = reginput;
4801 if (got_int)
4802 break;
4803 }
4804 break;
4805
4806 default: /* Oh dear. Called inappropriately. */
4807 EMSG(_(e_re_corr));
4808#ifdef DEBUG
4809 printf("Called regrepeat with op code %d\n", OP(p));
4810#endif
4811 break;
4812 }
4813
/* Write the local scan position back to the global. */
4814 reginput = scan;
4815
4816 return (int)count;
4817}
4818
4819/*
4820 * regnext - dig the "next" pointer out of a node
4821 */
4822 static char_u *
4823regnext(p)
4824 char_u *p;
4825{
4826 int offset;
4827
4828 if (p == JUST_CALC_SIZE)
4829 return NULL;
4830
4831 offset = NEXT(p);
4832 if (offset == 0)
4833 return NULL;
4834
4835 if (OP(p) == BACK)
4836 return p - offset;
4837 else
4838 return p + offset;
4839}
4840
4841/*
4842 * Check the regexp program for its magic number.
4843 * Return TRUE if it's wrong.
4844 */
4845 static int
4846prog_magic_wrong()
4847{
4848 if (UCHARAT(REG_MULTI
4849 ? reg_mmatch->regprog->program
4850 : reg_match->regprog->program) != REGMAGIC)
4851 {
4852 EMSG(_(e_re_corr));
4853 return TRUE;
4854 }
4855 return FALSE;
4856}
4857
4858/*
4859 * Cleanup the subexpressions, if this wasn't done yet.
4860 * This construction is used to clear the subexpressions only when they are
4861 * used (to increase speed).
4862 */
4863 static void
4864cleanup_subexpr()
4865{
4866 if (need_clear_subexpr)
4867 {
4868 if (REG_MULTI)
4869 {
4870 /* Use 0xff to set lnum to -1 */
4871 vim_memset(reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4872 vim_memset(reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4873 }
4874 else
4875 {
4876 vim_memset(reg_startp, 0, sizeof(char_u *) * NSUBEXP);
4877 vim_memset(reg_endp, 0, sizeof(char_u *) * NSUBEXP);
4878 }
4879 need_clear_subexpr = FALSE;
4880 }
4881}
4882
#ifdef FEAT_SYN_HL
/*
 * Reset the recorded "z" sub-expression starts and ends, but only when
 * need_clear_zsubexpr says this is still pending.
 */
    static void
cleanup_zsubexpr()
{
    if (!need_clear_zsubexpr)
        return;

    if (!REG_MULTI)
    {
        /* Pointer arrays: clear them. */
        vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
        vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
    }
    else
    {
        /* Position arrays: filling with 0xff sets lnum to -1. */
        vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
        vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
    }
    need_clear_zsubexpr = FALSE;
}
#endif
4904
4905/*
4906 * Advance reglnum, regline and reginput to the next line.
4907 */
4908 static void
4909reg_nextline()
4910{
4911 regline = reg_getline(++reglnum);
4912 reginput = regline;
4913 fast_breakcheck();
4914}
4915
4916/*
4917 * Save the input line and position in a regsave_T.
4918 */
4919 static void
4920reg_save(save)
4921 regsave_T *save;
4922{
4923 if (REG_MULTI)
4924 {
4925 save->rs_u.pos.col = (colnr_T)(reginput - regline);
4926 save->rs_u.pos.lnum = reglnum;
4927 }
4928 else
4929 save->rs_u.ptr = reginput;
4930}
4931
4932/*
4933 * Restore the input line and position from a regsave_T.
4934 */
4935 static void
4936reg_restore(save)
4937 regsave_T *save;
4938{
4939 if (REG_MULTI)
4940 {
4941 if (reglnum != save->rs_u.pos.lnum)
4942 {
4943 /* only call reg_getline() when the line number changed to save
4944 * a bit of time */
4945 reglnum = save->rs_u.pos.lnum;
4946 regline = reg_getline(reglnum);
4947 }
4948 reginput = regline + save->rs_u.pos.col;
4949 }
4950 else
4951 reginput = save->rs_u.ptr;
4952}
4953
4954/*
4955 * Return TRUE if current position is equal to saved position.
4956 */
4957 static int
4958reg_save_equal(save)
4959 regsave_T *save;
4960{
4961 if (REG_MULTI)
4962 return reglnum == save->rs_u.pos.lnum
4963 && reginput == regline + save->rs_u.pos.col;
4964 return reginput == save->rs_u.ptr;
4965}
4966
4967/*
4968 * Tentatively set the sub-expression start to the current position (after
4969 * calling regmatch() they will have changed). Need to save the existing
4970 * values for when there is no match.
4971 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
4972 * depending on REG_MULTI.
4973 */
4974 static void
4975save_se_multi(savep, posp)
4976 save_se_T *savep;
4977 lpos_T *posp;
4978{
4979 savep->se_u.pos = *posp;
4980 posp->lnum = reglnum;
4981 posp->col = (colnr_T)(reginput - regline);
4982}
4983
4984 static void
4985save_se_one(savep, pp)
4986 save_se_T *savep;
4987 char_u **pp;
4988{
4989 savep->se_u.ptr = *pp;
4990 *pp = reginput;
4991}
4992
4993/*
4994 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
4995 */
4996 static int
4997re_num_cmp(val, scan)
4998 long_u val;
4999 char_u *scan;
5000{
5001 long_u n = OPERAND_MIN(scan);
5002
5003 if (OPERAND_CMP(scan) == '>')
5004 return val > n;
5005 if (OPERAND_CMP(scan) == '<')
5006 return val < n;
5007 return val == n;
5008}
5009
5010
5011#ifdef DEBUG
5012
5013/*
5014 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5015 */
5016 static void
5017regdump(pattern, r)
5018 char_u *pattern;
5019 regprog_T *r;
5020{
5021 char_u *s;
5022 int op = EXACTLY; /* Arbitrary non-END op. */
5023 char_u *next;
5024 char_u *end = NULL;
5025
5026 printf("\r\nregcomp(%s):\r\n", pattern);
5027
5028 s = r->program + 1;
5029 /*
5030 * Loop until we find the END that isn't before a referred next (an END
5031 * can also appear in a NOMATCH operand).
5032 */
5033 while (op != END || s <= end)
5034 {
5035 op = OP(s);
5036 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
5037 next = regnext(s);
5038 if (next == NULL) /* Next ptr. */
5039 printf("(0)");
5040 else
5041 printf("(%d)", (int)((s - r->program) + (next - s)));
5042 if (end < next)
5043 end = next;
5044 if (op == BRACE_LIMITS)
5045 {
5046 /* Two short ints */
5047 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5048 s += 8;
5049 }
5050 s += 3;
5051 if (op == ANYOF || op == ANYOF + ADD_NL
5052 || op == ANYBUT || op == ANYBUT + ADD_NL
5053 || op == EXACTLY)
5054 {
5055 /* Literal string, where present. */
5056 while (*s != NUL)
5057 printf("%c", *s++);
5058 s++;
5059 }
5060 printf("\r\n");
5061 }
5062
5063 /* Header fields of interest. */
5064 if (r->regstart != NUL)
5065 printf("start `%s' 0x%x; ", r->regstart < 256
5066 ? (char *)transchar(r->regstart)
5067 : "multibyte", r->regstart);
5068 if (r->reganch)
5069 printf("anchored; ");
5070 if (r->regmust != NULL)
5071 printf("must have \"%s\"", r->regmust);
5072 printf("\r\n");
5073}
5074
5075/*
5076 * regprop - printable representation of opcode
5077 */
5078 static char_u *
5079regprop(op)
5080 char_u *op;
5081{
5082 char_u *p;
5083 static char_u buf[50];
5084
5085 (void) strcpy(buf, ":");
5086
5087 switch (OP(op))
5088 {
5089 case BOL:
5090 p = "BOL";
5091 break;
5092 case EOL:
5093 p = "EOL";
5094 break;
5095 case RE_BOF:
5096 p = "BOF";
5097 break;
5098 case RE_EOF:
5099 p = "EOF";
5100 break;
5101 case CURSOR:
5102 p = "CURSOR";
5103 break;
5104 case RE_LNUM:
5105 p = "RE_LNUM";
5106 break;
5107 case RE_COL:
5108 p = "RE_COL";
5109 break;
5110 case RE_VCOL:
5111 p = "RE_VCOL";
5112 break;
5113 case BOW:
5114 p = "BOW";
5115 break;
5116 case EOW:
5117 p = "EOW";
5118 break;
5119 case ANY:
5120 p = "ANY";
5121 break;
5122 case ANY + ADD_NL:
5123 p = "ANY+NL";
5124 break;
5125 case ANYOF:
5126 p = "ANYOF";
5127 break;
5128 case ANYOF + ADD_NL:
5129 p = "ANYOF+NL";
5130 break;
5131 case ANYBUT:
5132 p = "ANYBUT";
5133 break;
5134 case ANYBUT + ADD_NL:
5135 p = "ANYBUT+NL";
5136 break;
5137 case IDENT:
5138 p = "IDENT";
5139 break;
5140 case IDENT + ADD_NL:
5141 p = "IDENT+NL";
5142 break;
5143 case SIDENT:
5144 p = "SIDENT";
5145 break;
5146 case SIDENT + ADD_NL:
5147 p = "SIDENT+NL";
5148 break;
5149 case KWORD:
5150 p = "KWORD";
5151 break;
5152 case KWORD + ADD_NL:
5153 p = "KWORD+NL";
5154 break;
5155 case SKWORD:
5156 p = "SKWORD";
5157 break;
5158 case SKWORD + ADD_NL:
5159 p = "SKWORD+NL";
5160 break;
5161 case FNAME:
5162 p = "FNAME";
5163 break;
5164 case FNAME + ADD_NL:
5165 p = "FNAME+NL";
5166 break;
5167 case SFNAME:
5168 p = "SFNAME";
5169 break;
5170 case SFNAME + ADD_NL:
5171 p = "SFNAME+NL";
5172 break;
5173 case PRINT:
5174 p = "PRINT";
5175 break;
5176 case PRINT + ADD_NL:
5177 p = "PRINT+NL";
5178 break;
5179 case SPRINT:
5180 p = "SPRINT";
5181 break;
5182 case SPRINT + ADD_NL:
5183 p = "SPRINT+NL";
5184 break;
5185 case WHITE:
5186 p = "WHITE";
5187 break;
5188 case WHITE + ADD_NL:
5189 p = "WHITE+NL";
5190 break;
5191 case NWHITE:
5192 p = "NWHITE";
5193 break;
5194 case NWHITE + ADD_NL:
5195 p = "NWHITE+NL";
5196 break;
5197 case DIGIT:
5198 p = "DIGIT";
5199 break;
5200 case DIGIT + ADD_NL:
5201 p = "DIGIT+NL";
5202 break;
5203 case NDIGIT:
5204 p = "NDIGIT";
5205 break;
5206 case NDIGIT + ADD_NL:
5207 p = "NDIGIT+NL";
5208 break;
5209 case HEX:
5210 p = "HEX";
5211 break;
5212 case HEX + ADD_NL:
5213 p = "HEX+NL";
5214 break;
5215 case NHEX:
5216 p = "NHEX";
5217 break;
5218 case NHEX + ADD_NL:
5219 p = "NHEX+NL";
5220 break;
5221 case OCTAL:
5222 p = "OCTAL";
5223 break;
5224 case OCTAL + ADD_NL:
5225 p = "OCTAL+NL";
5226 break;
5227 case NOCTAL:
5228 p = "NOCTAL";
5229 break;
5230 case NOCTAL + ADD_NL:
5231 p = "NOCTAL+NL";
5232 break;
5233 case WORD:
5234 p = "WORD";
5235 break;
5236 case WORD + ADD_NL:
5237 p = "WORD+NL";
5238 break;
5239 case NWORD:
5240 p = "NWORD";
5241 break;
5242 case NWORD + ADD_NL:
5243 p = "NWORD+NL";
5244 break;
5245 case HEAD:
5246 p = "HEAD";
5247 break;
5248 case HEAD + ADD_NL:
5249 p = "HEAD+NL";
5250 break;
5251 case NHEAD:
5252 p = "NHEAD";
5253 break;
5254 case NHEAD + ADD_NL:
5255 p = "NHEAD+NL";
5256 break;
5257 case ALPHA:
5258 p = "ALPHA";
5259 break;
5260 case ALPHA + ADD_NL:
5261 p = "ALPHA+NL";
5262 break;
5263 case NALPHA:
5264 p = "NALPHA";
5265 break;
5266 case NALPHA + ADD_NL:
5267 p = "NALPHA+NL";
5268 break;
5269 case LOWER:
5270 p = "LOWER";
5271 break;
5272 case LOWER + ADD_NL:
5273 p = "LOWER+NL";
5274 break;
5275 case NLOWER:
5276 p = "NLOWER";
5277 break;
5278 case NLOWER + ADD_NL:
5279 p = "NLOWER+NL";
5280 break;
5281 case UPPER:
5282 p = "UPPER";
5283 break;
5284 case UPPER + ADD_NL:
5285 p = "UPPER+NL";
5286 break;
5287 case NUPPER:
5288 p = "NUPPER";
5289 break;
5290 case NUPPER + ADD_NL:
5291 p = "NUPPER+NL";
5292 break;
5293 case BRANCH:
5294 p = "BRANCH";
5295 break;
5296 case EXACTLY:
5297 p = "EXACTLY";
5298 break;
5299 case NOTHING:
5300 p = "NOTHING";
5301 break;
5302 case BACK:
5303 p = "BACK";
5304 break;
5305 case END:
5306 p = "END";
5307 break;
5308 case MOPEN + 0:
5309 p = "MATCH START";
5310 break;
5311 case MOPEN + 1:
5312 case MOPEN + 2:
5313 case MOPEN + 3:
5314 case MOPEN + 4:
5315 case MOPEN + 5:
5316 case MOPEN + 6:
5317 case MOPEN + 7:
5318 case MOPEN + 8:
5319 case MOPEN + 9:
5320 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5321 p = NULL;
5322 break;
5323 case MCLOSE + 0:
5324 p = "MATCH END";
5325 break;
5326 case MCLOSE + 1:
5327 case MCLOSE + 2:
5328 case MCLOSE + 3:
5329 case MCLOSE + 4:
5330 case MCLOSE + 5:
5331 case MCLOSE + 6:
5332 case MCLOSE + 7:
5333 case MCLOSE + 8:
5334 case MCLOSE + 9:
5335 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5336 p = NULL;
5337 break;
5338 case BACKREF + 1:
5339 case BACKREF + 2:
5340 case BACKREF + 3:
5341 case BACKREF + 4:
5342 case BACKREF + 5:
5343 case BACKREF + 6:
5344 case BACKREF + 7:
5345 case BACKREF + 8:
5346 case BACKREF + 9:
5347 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5348 p = NULL;
5349 break;
5350 case NOPEN:
5351 p = "NOPEN";
5352 break;
5353 case NCLOSE:
5354 p = "NCLOSE";
5355 break;
5356#ifdef FEAT_SYN_HL
5357 case ZOPEN + 1:
5358 case ZOPEN + 2:
5359 case ZOPEN + 3:
5360 case ZOPEN + 4:
5361 case ZOPEN + 5:
5362 case ZOPEN + 6:
5363 case ZOPEN + 7:
5364 case ZOPEN + 8:
5365 case ZOPEN + 9:
5366 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5367 p = NULL;
5368 break;
5369 case ZCLOSE + 1:
5370 case ZCLOSE + 2:
5371 case ZCLOSE + 3:
5372 case ZCLOSE + 4:
5373 case ZCLOSE + 5:
5374 case ZCLOSE + 6:
5375 case ZCLOSE + 7:
5376 case ZCLOSE + 8:
5377 case ZCLOSE + 9:
5378 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5379 p = NULL;
5380 break;
5381 case ZREF + 1:
5382 case ZREF + 2:
5383 case ZREF + 3:
5384 case ZREF + 4:
5385 case ZREF + 5:
5386 case ZREF + 6:
5387 case ZREF + 7:
5388 case ZREF + 8:
5389 case ZREF + 9:
5390 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5391 p = NULL;
5392 break;
5393#endif
5394 case STAR:
5395 p = "STAR";
5396 break;
5397 case PLUS:
5398 p = "PLUS";
5399 break;
5400 case NOMATCH:
5401 p = "NOMATCH";
5402 break;
5403 case MATCH:
5404 p = "MATCH";
5405 break;
5406 case BEHIND:
5407 p = "BEHIND";
5408 break;
5409 case NOBEHIND:
5410 p = "NOBEHIND";
5411 break;
5412 case SUBPAT:
5413 p = "SUBPAT";
5414 break;
5415 case BRACE_LIMITS:
5416 p = "BRACE_LIMITS";
5417 break;
5418 case BRACE_SIMPLE:
5419 p = "BRACE_SIMPLE";
5420 break;
5421 case BRACE_COMPLEX + 0:
5422 case BRACE_COMPLEX + 1:
5423 case BRACE_COMPLEX + 2:
5424 case BRACE_COMPLEX + 3:
5425 case BRACE_COMPLEX + 4:
5426 case BRACE_COMPLEX + 5:
5427 case BRACE_COMPLEX + 6:
5428 case BRACE_COMPLEX + 7:
5429 case BRACE_COMPLEX + 8:
5430 case BRACE_COMPLEX + 9:
5431 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5432 p = NULL;
5433 break;
5434#ifdef FEAT_MBYTE
5435 case MULTIBYTECODE:
5436 p = "MULTIBYTECODE";
5437 break;
5438#endif
5439 case NEWL:
5440 p = "NEWL";
5441 break;
5442 default:
5443 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5444 p = NULL;
5445 break;
5446 }
5447 if (p != NULL)
5448 (void) strcat(buf, p);
5449 return buf;
5450}
5451#endif
5452
5453#ifdef FEAT_MBYTE
5454static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
5455
/* One decomposition: up to three characters, unused slots are zero. */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decompositions for the characters 0xfb20 - 0xfb4f.
 * The entry for character "c" is at index "c - 0xfb20".  Entries marked
 * UNUSED hold the character itself.
 */
decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},		/* 0xfb20	alt ayin */
    {0x5d0,0,0},		/* 0xfb21	alt alef */
    {0x5d3,0,0},		/* 0xfb22	alt dalet */
    {0x5d4,0,0},		/* 0xfb23	alt he */
    {0x5db,0,0},		/* 0xfb24	alt kaf */
    {0x5dc,0,0},		/* 0xfb25	alt lamed */
    {0x5dd,0,0},		/* 0xfb26	alt mem-sofit */
    {0x5e8,0,0},		/* 0xfb27	alt resh */
    {0x5ea,0,0},		/* 0xfb28	alt tav */
    {'+', 0, 0},		/* 0xfb29	alt plus */
    {0x5e9, 0x5c1, 0},		/* 0xfb2a	shin+shin-dot */
    {0x5e9, 0x5c2, 0},		/* 0xfb2b	shin+sin-dot */
    {0x5e9, 0x5c1, 0x5bc},	/* 0xfb2c	shin+shin-dot+dagesh */
    {0x5e9, 0x5c2, 0x5bc},	/* 0xfb2d	shin+sin-dot+dagesh */
    {0x5d0, 0x5b7, 0},		/* 0xfb2e	alef+patah */
    {0x5d0, 0x5b8, 0},		/* 0xfb2f	alef+qamats */
    {0x5d0, 0x5b4, 0},		/* 0xfb30	alef+hiriq */
    {0x5d1, 0x5bc, 0},		/* 0xfb31	bet+dagesh */
    {0x5d2, 0x5bc, 0},		/* 0xfb32	gimel+dagesh */
    {0x5d3, 0x5bc, 0},		/* 0xfb33	dalet+dagesh */
    {0x5d4, 0x5bc, 0},		/* 0xfb34	he+dagesh */
    {0x5d5, 0x5bc, 0},		/* 0xfb35	vav+dagesh */
    {0x5d6, 0x5bc, 0},		/* 0xfb36	zayin+dagesh */
    {0xfb37, 0, 0},		/* 0xfb37 -- UNUSED */
    {0x5d8, 0x5bc, 0},		/* 0xfb38	tet+dagesh */
    {0x5d9, 0x5bc, 0},		/* 0xfb39	yud+dagesh */
    {0x5da, 0x5bc, 0},		/* 0xfb3a	kaf sofit+dagesh */
    {0x5db, 0x5bc, 0},		/* 0xfb3b	kaf+dagesh */
    {0x5dc, 0x5bc, 0},		/* 0xfb3c	lamed+dagesh */
    {0xfb3d, 0, 0},		/* 0xfb3d -- UNUSED */
    {0x5de, 0x5bc, 0},		/* 0xfb3e	mem+dagesh */
    {0xfb3f, 0, 0},		/* 0xfb3f -- UNUSED */
    {0x5e0, 0x5bc, 0},		/* 0xfb40	nun+dagesh */
    {0x5e1, 0x5bc, 0},		/* 0xfb41	samech+dagesh */
    {0xfb42, 0, 0},		/* 0xfb42 -- UNUSED */
    {0x5e3, 0x5bc, 0},		/* 0xfb43	pe sofit+dagesh */
    {0x5e4, 0x5bc,0},		/* 0xfb44	pe+dagesh */
    {0xfb45, 0, 0},		/* 0xfb45 -- UNUSED */
    {0x5e6, 0x5bc, 0},		/* 0xfb46	tsadi+dagesh */
    {0x5e7, 0x5bc, 0},		/* 0xfb47	qof+dagesh */
    {0x5e8, 0x5bc, 0},		/* 0xfb48	resh+dagesh */
    {0x5e9, 0x5bc, 0},		/* 0xfb49	shin+dagesh */
    {0x5ea, 0x5bc, 0},		/* 0xfb4a	tav+dagesh */
    {0x5d5, 0x5b9, 0},		/* 0xfb4b	vav+holam */
    {0x5d1, 0x5bf, 0},		/* 0xfb4c	bet+rafe */
    {0x5db, 0x5bf, 0},		/* 0xfb4d	kaf+rafe */
    {0x5e4, 0x5bf, 0},		/* 0xfb4e	pe+rafe */
    {0x5d0, 0x5dc, 0}		/* 0xfb4f	alef-lamed */
};

/*
 * Decompose character "c" into up to three characters "*c1", "*c2", "*c3".
 * Only the range covered by decomp_table (0xfb20 - 0xfb4f) is decomposed;
 * any other character is returned unchanged in "*c1" with "*c2" and "*c3"
 * set to zero.
 */
    static void
mb_decompose(c, c1, c2, c3)
    int c, *c1, *c2, *c3;
{
    decomp_T d;

    /* The lower bound must be the first character of the table: a smaller
     * bound would give a negative index and read before the table. */
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
	d = decomp_table[c - 0xfb20];
	*c1 = d.a;
	*c2 = d.b;
	*c3 = d.c;
    }
    else
    {
	*c1 = c;
	*c2 = *c3 = 0;
    }
}
5534#endif
5535
5536/*
5537 * Compare two strings, ignore case if ireg_ic set.
5538 * Return 0 if strings match, non-zero otherwise.
5539 * Correct the length "*n" when composing characters are ignored.
5540 */
5541 static int
5542cstrncmp(s1, s2, n)
5543 char_u *s1, *s2;
5544 int *n;
5545{
5546 int result;
5547
5548 if (!ireg_ic)
5549 result = STRNCMP(s1, s2, *n);
5550 else
5551 result = MB_STRNICMP(s1, s2, *n);
5552
5553#ifdef FEAT_MBYTE
5554 /* if it failed and it's utf8 and we want to combineignore: */
5555 if (result != 0 && enc_utf8 && ireg_icombine)
5556 {
5557 char_u *str1, *str2;
5558 int c1, c2, c11, c12;
5559 int ix;
5560 int junk;
5561
5562 /* we have to handle the strcmp ourselves, since it is necessary to
5563 * deal with the composing characters by ignoring them: */
5564 str1 = s1;
5565 str2 = s2;
5566 c1 = c2 = 0;
5567 for (ix = 0; ix < *n; )
5568 {
5569 c1 = mb_ptr2char_adv(&str1);
5570 c2 = mb_ptr2char_adv(&str2);
5571 ix += utf_char2len(c1);
5572
5573 /* decompose the character if necessary, into 'base' characters
5574 * because I don't care about Arabic, I will hard-code the Hebrew
5575 * which I *do* care about! So sue me... */
5576 if (c1 != c2 && (!ireg_ic || utf_fold(c1) != utf_fold(c2)))
5577 {
5578 /* decomposition necessary? */
5579 mb_decompose(c1, &c11, &junk, &junk);
5580 mb_decompose(c2, &c12, &junk, &junk);
5581 c1 = c11;
5582 c2 = c12;
5583 if (c11 != c12 && (!ireg_ic || utf_fold(c11) != utf_fold(c12)))
5584 break;
5585 }
5586 }
5587 result = c2 - c1;
5588 if (result == 0)
5589 *n = (int)(str2 - s2);
5590 }
5591#endif
5592
5593 return result;
5594}
5595
5596/*
5597 * cstrchr: This function is used a lot for simple searches, keep it fast!
5598 */
5599 static char_u *
5600cstrchr(s, c)
5601 char_u *s;
5602 int c;
5603{
5604 char_u *p;
5605 int cc;
5606
5607 if (!ireg_ic
5608#ifdef FEAT_MBYTE
5609 || (!enc_utf8 && mb_char2len(c) > 1)
5610#endif
5611 )
5612 return vim_strchr(s, c);
5613
5614 /* tolower() and toupper() can be slow, comparing twice should be a lot
5615 * faster (esp. when using MS Visual C++!).
5616 * For UTF-8 need to use folded case. */
5617#ifdef FEAT_MBYTE
5618 if (enc_utf8 && c > 0x80)
5619 cc = utf_fold(c);
5620 else
5621#endif
5622 if (isupper(c))
5623 cc = TOLOWER_LOC(c);
5624 else if (islower(c))
5625 cc = TOUPPER_LOC(c);
5626 else
5627 return vim_strchr(s, c);
5628
5629#ifdef FEAT_MBYTE
5630 if (has_mbyte)
5631 {
5632 for (p = s; *p != NUL; p += (*mb_ptr2len_check)(p))
5633 {
5634 if (enc_utf8 && c > 0x80)
5635 {
5636 if (utf_fold(utf_ptr2char(p)) == cc)
5637 return p;
5638 }
5639 else if (*p == c || *p == cc)
5640 return p;
5641 }
5642 }
5643 else
5644#endif
5645 /* Faster version for when there are no multi-byte characters. */
5646 for (p = s; *p != NUL; ++p)
5647 if (*p == c || *p == cc)
5648 return p;
5649
5650 return NULL;
5651}
5652
5653/***************************************************************
5654 * regsub stuff *
5655 ***************************************************************/
5656
5657/* This stuff below really confuses cc on an SGI -- webb */
5658#ifdef __sgi
5659# undef __ARGS
5660# define __ARGS(x) ()
5661#endif
5662
/*
 * We should define fptr as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * pointer to a function returning void. This should work for all compilers.
 */
5669typedef void (*(*fptr) __ARGS((char_u *, int)))();
5670
5671static fptr do_upper __ARGS((char_u *, int));
5672static fptr do_Upper __ARGS((char_u *, int));
5673static fptr do_lower __ARGS((char_u *, int));
5674static fptr do_Lower __ARGS((char_u *, int));
5675
5676static int vim_regsub_both __ARGS((char_u *source, char_u *dest, int copy, int magic, int backslash));
5677
5678 static fptr
5679do_upper(d, c)
5680 char_u *d;
5681 int c;
5682{
5683 *d = TOUPPER_LOC(c);
5684
5685 return (fptr)NULL;
5686}
5687
5688 static fptr
5689do_Upper(d, c)
5690 char_u *d;
5691 int c;
5692{
5693 *d = TOUPPER_LOC(c);
5694
5695 return (fptr)do_Upper;
5696}
5697
5698 static fptr
5699do_lower(d, c)
5700 char_u *d;
5701 int c;
5702{
5703 *d = TOLOWER_LOC(c);
5704
5705 return (fptr)NULL;
5706}
5707
5708 static fptr
5709do_Lower(d, c)
5710 char_u *d;
5711 int c;
5712{
5713 *d = TOLOWER_LOC(c);
5714
5715 return (fptr)do_Lower;
5716}
5717
5718/*
5719 * regtilde(): Replace tildes in the pattern by the old pattern.
5720 *
5721 * Short explanation of the tilde: It stands for the previous replacement
5722 * pattern. If that previous pattern also contains a ~ we should go back a
5723 * step further... But we insert the previous pattern into the current one
5724 * and remember that.
5725 * This still does not handle the case where "magic" changes. TODO?
5726 *
5727 * The tildes are parsed once before the first call to vim_regsub().
5728 */
5729 char_u *
5730regtilde(source, magic)
5731 char_u *source;
5732 int magic;
5733{
5734 char_u *newsub = source;
5735 char_u *tmpsub;
5736 char_u *p;
5737 int len;
5738 int prevlen;
5739
5740 for (p = newsub; *p; ++p)
5741 {
5742 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
5743 {
5744 if (reg_prev_sub != NULL)
5745 {
5746 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
5747 prevlen = (int)STRLEN(reg_prev_sub);
5748 tmpsub = alloc((unsigned)(STRLEN(newsub) + prevlen));
5749 if (tmpsub != NULL)
5750 {
5751 /* copy prefix */
5752 len = (int)(p - newsub); /* not including ~ */
5753 mch_memmove(tmpsub, newsub, (size_t)len);
5754 /* interpretate tilde */
5755 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
5756 /* copy postfix */
5757 if (!magic)
5758 ++p; /* back off \ */
5759 STRCPY(tmpsub + len + prevlen, p + 1);
5760
5761 if (newsub != source) /* already allocated newsub */
5762 vim_free(newsub);
5763 newsub = tmpsub;
5764 p = newsub + len + prevlen;
5765 }
5766 }
5767 else if (magic)
5768 STRCPY(p, p + 1); /* remove '~' */
5769 else
5770 STRCPY(p, p + 2); /* remove '\~' */
5771 --p;
5772 }
5773 else
5774 {
5775 if (*p == '\\' && p[1]) /* skip escaped characters */
5776 ++p;
5777#ifdef FEAT_MBYTE
5778 if (has_mbyte)
5779 p += (*mb_ptr2len_check)(p) - 1;
5780#endif
5781 }
5782 }
5783
5784 vim_free(reg_prev_sub);
5785 if (newsub != source) /* newsub was allocated, just keep it */
5786 reg_prev_sub = newsub;
5787 else /* no ~ found, need to save newsub */
5788 reg_prev_sub = vim_strsave(newsub);
5789 return newsub;
5790}
5791
5792#ifdef FEAT_EVAL
5793static int can_f_submatch = FALSE; /* TRUE when submatch() can be used */
5794
5795/* These pointers are used instead of reg_match and reg_mmatch for
5796 * reg_submatch(). Needed for when the substitution string is an expression
5797 * that contains a call to substitute() and submatch(). */
5798static regmatch_T *submatch_match;
5799static regmmatch_T *submatch_mmatch;
5800#endif
5801
5802#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
5803/*
5804 * vim_regsub() - perform substitutions after a vim_regexec() or
5805 * vim_regexec_multi() match.
5806 *
5807 * If "copy" is TRUE really copy into "dest".
5808 * If "copy" is FALSE nothing is copied, this is just to find out the length
5809 * of the result.
5810 *
5811 * If "backslash" is TRUE, a backslash will be removed later, need to double
5812 * them to keep them, and insert a backslash before a CR to avoid it being
5813 * replaced with a line break later.
5814 *
5815 * Note: The matched text must not change between the call of
5816 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
5817 * references invalid!
5818 *
5819 * Returns the size of the replacement, including terminating NUL.
5820 */
5821 int
5822vim_regsub(rmp, source, dest, copy, magic, backslash)
5823 regmatch_T *rmp;
5824 char_u *source;
5825 char_u *dest;
5826 int copy;
5827 int magic;
5828 int backslash;
5829{
5830 reg_match = rmp;
5831 reg_mmatch = NULL;
5832 reg_maxline = 0;
5833 return vim_regsub_both(source, dest, copy, magic, backslash);
5834}
5835#endif
5836
5837 int
5838vim_regsub_multi(rmp, lnum, source, dest, copy, magic, backslash)
5839 regmmatch_T *rmp;
5840 linenr_T lnum;
5841 char_u *source;
5842 char_u *dest;
5843 int copy;
5844 int magic;
5845 int backslash;
5846{
5847 reg_match = NULL;
5848 reg_mmatch = rmp;
5849 reg_buf = curbuf; /* always works on the current buffer! */
5850 reg_firstlnum = lnum;
5851 reg_maxline = curbuf->b_ml.ml_line_count - lnum;
5852 return vim_regsub_both(source, dest, copy, magic, backslash);
5853}
5854
5855 static int
5856vim_regsub_both(source, dest, copy, magic, backslash)
5857 char_u *source;
5858 char_u *dest;
5859 int copy;
5860 int magic;
5861 int backslash;
5862{
5863 char_u *src;
5864 char_u *dst;
5865 char_u *s;
5866 int c;
5867 int no = -1;
5868 fptr func = (fptr)NULL;
5869 linenr_T clnum = 0; /* init for GCC */
5870 int len = 0; /* init for GCC */
5871#ifdef FEAT_EVAL
5872 static char_u *eval_result = NULL;
5873#endif
5874#ifdef FEAT_MBYTE
5875 int l;
5876#endif
5877
5878
5879 /* Be paranoid... */
5880 if (source == NULL || dest == NULL)
5881 {
5882 EMSG(_(e_null));
5883 return 0;
5884 }
5885 if (prog_magic_wrong())
5886 return 0;
5887 src = source;
5888 dst = dest;
5889
5890 /*
5891 * When the substitute part starts with "\=" evaluate it as an expression.
5892 */
5893 if (source[0] == '\\' && source[1] == '='
5894#ifdef FEAT_EVAL
5895 && !can_f_submatch /* can't do this recursively */
5896#endif
5897 )
5898 {
5899#ifdef FEAT_EVAL
5900 /* To make sure that the length doesn't change between checking the
5901 * length and copying the string, and to speed up things, the
5902 * resulting string is saved from the call with "copy" == FALSE to the
5903 * call with "copy" == TRUE. */
5904 if (copy)
5905 {
5906 if (eval_result != NULL)
5907 {
5908 STRCPY(dest, eval_result);
5909 dst += STRLEN(eval_result);
5910 vim_free(eval_result);
5911 eval_result = NULL;
5912 }
5913 }
5914 else
5915 {
5916 linenr_T save_reg_maxline;
5917 win_T *save_reg_win;
5918 int save_ireg_ic;
5919
5920 vim_free(eval_result);
5921
5922 /* The expression may contain substitute(), which calls us
5923 * recursively. Make sure submatch() gets the text from the first
5924 * level. Don't need to save "reg_buf", because
5925 * vim_regexec_multi() can't be called recursively. */
5926 submatch_match = reg_match;
5927 submatch_mmatch = reg_mmatch;
5928 save_reg_maxline = reg_maxline;
5929 save_reg_win = reg_win;
5930 save_ireg_ic = ireg_ic;
5931 can_f_submatch = TRUE;
5932
5933 eval_result = eval_to_string(source + 2, NULL);
5934 if (eval_result != NULL)
5935 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00005936 for (s = eval_result; *s != NUL; mb_ptr_adv(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00005937 {
5938 /* Change NL to CR, so that it becomes a line break.
5939 * Skip over a backslashed character. */
5940 if (*s == NL)
5941 *s = CAR;
5942 else if (*s == '\\' && s[1] != NUL)
5943 ++s;
Bram Moolenaar071d4272004-06-13 20:20:40 +00005944 }
5945
5946 dst += STRLEN(eval_result);
5947 }
5948
5949 reg_match = submatch_match;
5950 reg_mmatch = submatch_mmatch;
5951 reg_maxline = save_reg_maxline;
5952 reg_win = save_reg_win;
5953 ireg_ic = save_ireg_ic;
5954 can_f_submatch = FALSE;
5955 }
5956#endif
5957 }
5958 else
5959 while ((c = *src++) != NUL)
5960 {
5961 if (c == '&' && magic)
5962 no = 0;
5963 else if (c == '\\' && *src != NUL)
5964 {
5965 if (*src == '&' && !magic)
5966 {
5967 ++src;
5968 no = 0;
5969 }
5970 else if ('0' <= *src && *src <= '9')
5971 {
5972 no = *src++ - '0';
5973 }
5974 else if (vim_strchr((char_u *)"uUlLeE", *src))
5975 {
5976 switch (*src++)
5977 {
5978 case 'u': func = (fptr)do_upper;
5979 continue;
5980 case 'U': func = (fptr)do_Upper;
5981 continue;
5982 case 'l': func = (fptr)do_lower;
5983 continue;
5984 case 'L': func = (fptr)do_Lower;
5985 continue;
5986 case 'e':
5987 case 'E': func = (fptr)NULL;
5988 continue;
5989 }
5990 }
5991 }
5992 if (no < 0) /* Ordinary character. */
5993 {
5994 if (c == '\\' && *src != NUL)
5995 {
5996 /* Check for abbreviations -- webb */
5997 switch (*src)
5998 {
5999 case 'r': c = CAR; ++src; break;
6000 case 'n': c = NL; ++src; break;
6001 case 't': c = TAB; ++src; break;
6002 /* Oh no! \e already has meaning in subst pat :-( */
6003 /* case 'e': c = ESC; ++src; break; */
6004 case 'b': c = Ctrl_H; ++src; break;
6005
6006 /* If "backslash" is TRUE the backslash will be removed
6007 * later. Used to insert a literal CR. */
6008 default: if (backslash)
6009 {
6010 if (copy)
6011 *dst = '\\';
6012 ++dst;
6013 }
6014 c = *src++;
6015 }
6016 }
6017
6018 /* Write to buffer, if copy is set. */
6019#ifdef FEAT_MBYTE
6020 if (has_mbyte && (l = (*mb_ptr2len_check)(src - 1)) > 1)
6021 {
6022 /* TODO: should use "func" here. */
6023 if (copy)
6024 mch_memmove(dst, src - 1, l);
6025 dst += l - 1;
6026 src += l - 1;
6027 }
6028 else
6029 {
6030#endif
6031 if (copy)
6032 {
6033 if (func == (fptr)NULL) /* just copy */
6034 *dst = c;
6035 else /* change case */
6036 func = (fptr)(func(dst, c));
6037 /* Turbo C complains without the typecast */
6038 }
6039#ifdef FEAT_MBYTE
6040 }
6041#endif
6042 dst++;
6043 }
6044 else
6045 {
6046 if (REG_MULTI)
6047 {
6048 clnum = reg_mmatch->startpos[no].lnum;
6049 if (clnum < 0 || reg_mmatch->endpos[no].lnum < 0)
6050 s = NULL;
6051 else
6052 {
6053 s = reg_getline(clnum) + reg_mmatch->startpos[no].col;
6054 if (reg_mmatch->endpos[no].lnum == clnum)
6055 len = reg_mmatch->endpos[no].col
6056 - reg_mmatch->startpos[no].col;
6057 else
6058 len = (int)STRLEN(s);
6059 }
6060 }
6061 else
6062 {
6063 s = reg_match->startp[no];
6064 if (reg_match->endp[no] == NULL)
6065 s = NULL;
6066 else
6067 len = (int)(reg_match->endp[no] - s);
6068 }
6069 if (s != NULL)
6070 {
6071 for (;;)
6072 {
6073 if (len == 0)
6074 {
6075 if (REG_MULTI)
6076 {
6077 if (reg_mmatch->endpos[no].lnum == clnum)
6078 break;
6079 if (copy)
6080 *dst = CAR;
6081 ++dst;
6082 s = reg_getline(++clnum);
6083 if (reg_mmatch->endpos[no].lnum == clnum)
6084 len = reg_mmatch->endpos[no].col;
6085 else
6086 len = (int)STRLEN(s);
6087 }
6088 else
6089 break;
6090 }
6091 else if (*s == NUL) /* we hit NUL. */
6092 {
6093 if (copy)
6094 EMSG(_(e_re_damg));
6095 goto exit;
6096 }
6097 else
6098 {
6099 if (backslash && (*s == CAR || *s == '\\'))
6100 {
6101 /*
6102 * Insert a backslash in front of a CR, otherwise
6103 * it will be replaced by a line break.
6104 * Number of backslashes will be halved later,
6105 * double them here.
6106 */
6107 if (copy)
6108 {
6109 dst[0] = '\\';
6110 dst[1] = *s;
6111 }
6112 dst += 2;
6113 }
6114#ifdef FEAT_MBYTE
6115 else if (has_mbyte && (l = (*mb_ptr2len_check)(s)) > 1)
6116 {
6117 /* TODO: should use "func" here. */
6118 if (copy)
6119 mch_memmove(dst, s, l);
6120 dst += l;
6121 s += l - 1;
6122 len -= l - 1;
6123 }
6124#endif
6125 else
6126 {
6127 if (copy)
6128 {
6129 if (func == (fptr)NULL) /* just copy */
6130 *dst = *s;
6131 else /* change case */
6132 func = (fptr)(func(dst, *s));
6133 /* Turbo C complains without the typecast */
6134 }
6135 ++dst;
6136 }
6137 ++s;
6138 --len;
6139 }
6140 }
6141 }
6142 no = -1;
6143 }
6144 }
6145 if (copy)
6146 *dst = NUL;
6147
6148exit:
6149 return (int)((dst - dest) + 1);
6150}
6151
6152#ifdef FEAT_EVAL
6153/*
6154 * Used for the submatch() function: get the string from tne n'th submatch in
6155 * allocated memory.
6156 * Returns NULL when not in a ":s" command and for a non-existing submatch.
6157 */
6158 char_u *
6159reg_submatch(no)
6160 int no;
6161{
6162 char_u *retval = NULL;
6163 char_u *s;
6164 int len;
6165 int round;
6166 linenr_T lnum;
6167
6168 if (!can_f_submatch)
6169 return NULL;
6170
6171 if (submatch_match == NULL)
6172 {
6173 /*
6174 * First round: compute the length and allocate memory.
6175 * Second round: copy the text.
6176 */
6177 for (round = 1; round <= 2; ++round)
6178 {
6179 lnum = submatch_mmatch->startpos[no].lnum;
6180 if (lnum < 0 || submatch_mmatch->endpos[no].lnum < 0)
6181 return NULL;
6182
6183 s = reg_getline(lnum) + submatch_mmatch->startpos[no].col;
6184 if (s == NULL) /* anti-crash check, cannot happen? */
6185 break;
6186 if (submatch_mmatch->endpos[no].lnum == lnum)
6187 {
6188 /* Within one line: take form start to end col. */
6189 len = submatch_mmatch->endpos[no].col
6190 - submatch_mmatch->startpos[no].col;
6191 if (round == 2)
6192 {
6193 STRNCPY(retval, s, len);
6194 retval[len] = NUL;
6195 }
6196 ++len;
6197 }
6198 else
6199 {
6200 /* Multiple lines: take start line from start col, middle
6201 * lines completely and end line up to end col. */
6202 len = (int)STRLEN(s);
6203 if (round == 2)
6204 {
6205 STRCPY(retval, s);
6206 retval[len] = '\n';
6207 }
6208 ++len;
6209 ++lnum;
6210 while (lnum < submatch_mmatch->endpos[no].lnum)
6211 {
6212 s = reg_getline(lnum++);
6213 if (round == 2)
6214 STRCPY(retval + len, s);
6215 len += (int)STRLEN(s);
6216 if (round == 2)
6217 retval[len] = '\n';
6218 ++len;
6219 }
6220 if (round == 2)
6221 STRNCPY(retval + len, reg_getline(lnum),
6222 submatch_mmatch->endpos[no].col);
6223 len += submatch_mmatch->endpos[no].col;
6224 if (round == 2)
6225 retval[len] = NUL;
6226 ++len;
6227 }
6228
6229 if (round == 1)
6230 {
6231 retval = lalloc((long_u)len, TRUE);
6232 if (s == NULL)
6233 return NULL;
6234 }
6235 }
6236 }
6237 else
6238 {
6239 if (submatch_match->endp[no] == NULL)
6240 retval = NULL;
6241 else
6242 {
6243 s = submatch_match->startp[no];
6244 retval = vim_strnsave(s, (int)(submatch_match->endp[no] - s));
6245 }
6246 }
6247
6248 return retval;
6249}
6250#endif