| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 1 | /*	$OpenBSD: regcomp.c,v 1.19 2008/02/23 08:13:07 otto Exp $ */ | 
|  | 2 | /*- | 
|  | 3 | * Copyright (c) 1992, 1993, 1994 Henry Spencer. | 
|  | 4 | * Copyright (c) 1992, 1993, 1994 | 
|  | 5 | *	The Regents of the University of California.  All rights reserved. | 
|  | 6 | * | 
|  | 7 | * This code is derived from software contributed to Berkeley by | 
|  | 8 | * Henry Spencer. | 
|  | 9 | * | 
|  | 10 | * Redistribution and use in source and binary forms, with or without | 
|  | 11 | * modification, are permitted provided that the following conditions | 
|  | 12 | * are met: | 
|  | 13 | * 1. Redistributions of source code must retain the above copyright | 
|  | 14 | *    notice, this list of conditions and the following disclaimer. | 
|  | 15 | * 2. Redistributions in binary form must reproduce the above copyright | 
|  | 16 | *    notice, this list of conditions and the following disclaimer in the | 
|  | 17 | *    documentation and/or other materials provided with the distribution. | 
|  | 18 | * 3. Neither the name of the University nor the names of its contributors | 
|  | 19 | *    may be used to endorse or promote products derived from this software | 
|  | 20 | *    without specific prior written permission. | 
|  | 21 | * | 
|  | 22 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | 
|  | 23 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | 24 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|  | 25 | * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | 
|  | 26 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | 
|  | 27 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | 
|  | 28 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | 
|  | 29 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | 
|  | 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | 
|  | 31 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
|  | 32 | * SUCH DAMAGE. | 
|  | 33 | * | 
|  | 34 | *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94 | 
|  | 35 | */ | 
|  | 36 |  | 
|  | 37 | #include <sys/types.h> | 
|  | 38 | #include <stdio.h> | 
|  | 39 | #include <string.h> | 
|  | 40 | #include <ctype.h> | 
|  | 41 | #include <limits.h> | 
|  | 42 | #include <stdlib.h> | 
|  | 43 | #include <regex.h> | 
|  | 44 |  | 
|  | 45 | #include "utils.h" | 
|  | 46 | #include "regex2.h" | 
|  | 47 |  | 
|  | 48 | #include "cclass.h" | 
|  | 49 | #include "cname.h" | 
|  | 50 |  | 
|  | 51 | /* | 
|  | 52 | * parse structure, passed up and down to avoid global variables and | 
|  | 53 | * other clumsinesses | 
|  | 54 | */ | 
|  | 55 | struct parse { | 
|  | 56 | char *next;		/* next character in RE */ | 
|  | 57 | char *end;		/* end of string (-> NUL normally) */ | 
|  | 58 | int error;		/* has an error been seen? */ | 
|  | 59 | sop *strip;		/* malloced strip */ | 
|  | 60 | sopno ssize;		/* malloced strip size (allocated) */ | 
|  | 61 | sopno slen;		/* malloced strip length (used) */ | 
|  | 62 | int ncsalloc;		/* number of csets allocated */ | 
|  | 63 | struct re_guts *g; | 
|  | 64 | #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */ | 
|  | 65 | sopno pbegin[NPAREN];	/* -> ( ([0] unused) */ | 
|  | 66 | sopno pend[NPAREN];	/* -> ) ([0] unused) */ | 
|  | 67 | }; | 
|  | 68 |  | 
|  | 69 | static void p_ere(struct parse *, int); | 
|  | 70 | static void p_ere_exp(struct parse *); | 
|  | 71 | static void p_str(struct parse *); | 
|  | 72 | static void p_bre(struct parse *, int, int); | 
|  | 73 | static int p_simp_re(struct parse *, int); | 
|  | 74 | static int p_count(struct parse *); | 
|  | 75 | static void p_bracket(struct parse *); | 
|  | 76 | static void p_b_term(struct parse *, cset *); | 
|  | 77 | static void p_b_cclass(struct parse *, cset *); | 
|  | 78 | static void p_b_eclass(struct parse *, cset *); | 
|  | 79 | static char p_b_symbol(struct parse *); | 
|  | 80 | static char p_b_coll_elem(struct parse *, int); | 
|  | 81 | static char othercase(int); | 
|  | 82 | static void bothcases(struct parse *, int); | 
|  | 83 | static void ordinary(struct parse *, int); | 
|  | 84 | static void nonnewline(struct parse *); | 
|  | 85 | static void repeat(struct parse *, sopno, int, int); | 
|  | 86 | static int seterr(struct parse *, int); | 
|  | 87 | static cset *allocset(struct parse *); | 
|  | 88 | static void freeset(struct parse *, cset *); | 
|  | 89 | static int freezeset(struct parse *, cset *); | 
|  | 90 | static int firstch(struct parse *, cset *); | 
|  | 91 | static int nch(struct parse *, cset *); | 
|  | 92 | static void mcadd(struct parse *, cset *, char *); | 
|  | 93 | static void mcinvert(struct parse *, cset *); | 
|  | 94 | static void mccase(struct parse *, cset *); | 
|  | 95 | static int isinsets(struct re_guts *, int); | 
|  | 96 | static int samesets(struct re_guts *, int, int); | 
|  | 97 | static void categorize(struct parse *, struct re_guts *); | 
|  | 98 | static sopno dupl(struct parse *, sopno, sopno); | 
|  | 99 | static void doemit(struct parse *, sop, size_t); | 
|  | 100 | static void doinsert(struct parse *, sop, size_t, sopno); | 
|  | 101 | static void dofwd(struct parse *, sopno, sop); | 
|  | 102 | static void enlarge(struct parse *, sopno); | 
|  | 103 | static void stripsnug(struct parse *, struct re_guts *); | 
|  | 104 | static void findmust(struct parse *, struct re_guts *); | 
|  | 105 | static sopno pluscount(struct parse *, struct re_guts *); | 
|  | 106 |  | 
|  | 107 | static char nuls[10];		/* place to point scanner in event of error */ | 
|  | 108 |  | 
|  | 109 | /* | 
|  | 110 | * macros for use with parse structure | 
|  | 111 | * BEWARE:  these know that the parse structure is named `p' !!! | 
|  | 112 | */ | 
|  | 113 | #define	PEEK()	(*p->next) | 
|  | 114 | #define	PEEK2()	(*(p->next+1)) | 
|  | 115 | #define	MORE()	(p->next < p->end) | 
|  | 116 | #define	MORE2()	(p->next+1 < p->end) | 
|  | 117 | #define	SEE(c)	(MORE() && PEEK() == (c)) | 
|  | 118 | #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) | 
|  | 119 | #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0) | 
|  | 120 | #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0) | 
|  | 121 | #define	NEXT()	(p->next++) | 
|  | 122 | #define	NEXT2()	(p->next += 2) | 
|  | 123 | #define	NEXTn(n)	(p->next += (n)) | 
|  | 124 | #define	GETNEXT()	(*p->next++) | 
|  | 125 | #define	SETERROR(e)	seterr(p, (e)) | 
|  | 126 | #define	REQUIRE(co, e)	((co) || SETERROR(e)) | 
|  | 127 | #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e)) | 
|  | 128 | #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e)) | 
|  | 129 | #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e)) | 
|  | 130 | #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd)) | 
|  | 131 | #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos) | 
|  | 132 | #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos)) | 
|  | 133 | #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos) | 
|  | 134 | #define	HERE()		(p->slen) | 
|  | 135 | #define	THERE()		(p->slen - 1) | 
|  | 136 | #define	THERETHERE()	(p->slen - 2) | 
|  | 137 | #define	DROP(n)	(p->slen -= (n)) | 
|  | 138 |  | 
|  | 139 | #ifndef NDEBUG | 
|  | 140 | static int never = 0;		/* for use in asserts; shuts lint up */ | 
|  | 141 | #else | 
|  | 142 | #define	never	0		/* some <assert.h>s have bugs too */ | 
|  | 143 | #endif | 
|  | 144 |  | 
|  | 145 | /* | 
|  | 146 | - regcomp - interface for parser and compilation | 
|  | 147 | */ | 
|  | 148 | int				/* 0 success, otherwise REG_something */ | 
|  | 149 | regcomp(regex_t *preg, const char *pattern, int cflags) | 
|  | 150 | { | 
|  | 151 | struct parse pa; | 
|  | 152 | struct re_guts *g; | 
|  | 153 | struct parse *p = &pa; | 
|  | 154 | int i; | 
|  | 155 | size_t len; | 
|  | 156 | #ifdef REDEBUG | 
|  | 157 | #	define	GOODFLAGS(f)	(f) | 
|  | 158 | #else | 
|  | 159 | #	define	GOODFLAGS(f)	((f)&~REG_DUMP) | 
|  | 160 | #endif | 
|  | 161 |  | 
|  | 162 | cflags = GOODFLAGS(cflags); | 
|  | 163 | if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) | 
|  | 164 | return(REG_INVARG); | 
|  | 165 |  | 
|  | 166 | if (cflags®_PEND) { | 
|  | 167 | if (preg->re_endp < pattern) | 
|  | 168 | return(REG_INVARG); | 
|  | 169 | len = preg->re_endp - pattern; | 
|  | 170 | } else | 
|  | 171 | len = strlen((char *)pattern); | 
|  | 172 |  | 
|  | 173 | /* do the mallocs early so failure handling is easy */ | 
|  | 174 | g = (struct re_guts *)malloc(sizeof(struct re_guts) + | 
|  | 175 | (NC-1)*sizeof(cat_t)); | 
|  | 176 | if (g == NULL) | 
|  | 177 | return(REG_ESPACE); | 
|  | 178 | p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */ | 
|  | 179 | p->strip = (sop *)calloc(p->ssize, sizeof(sop)); | 
|  | 180 | p->slen = 0; | 
|  | 181 | if (p->strip == NULL) { | 
|  | 182 | free((char *)g); | 
|  | 183 | return(REG_ESPACE); | 
|  | 184 | } | 
|  | 185 |  | 
|  | 186 | /* set things up */ | 
|  | 187 | p->g = g; | 
|  | 188 | p->next = (char *)pattern;	/* convenience; we do not modify it */ | 
|  | 189 | p->end = p->next + len; | 
|  | 190 | p->error = 0; | 
|  | 191 | p->ncsalloc = 0; | 
|  | 192 | for (i = 0; i < NPAREN; i++) { | 
|  | 193 | p->pbegin[i] = 0; | 
|  | 194 | p->pend[i] = 0; | 
|  | 195 | } | 
|  | 196 | g->csetsize = NC; | 
|  | 197 | g->sets = NULL; | 
|  | 198 | g->setbits = NULL; | 
|  | 199 | g->ncsets = 0; | 
|  | 200 | g->cflags = cflags; | 
|  | 201 | g->iflags = 0; | 
|  | 202 | g->nbol = 0; | 
|  | 203 | g->neol = 0; | 
|  | 204 | g->must = NULL; | 
|  | 205 | g->mlen = 0; | 
|  | 206 | g->nsub = 0; | 
|  | 207 | g->ncategories = 1;	/* category 0 is "everything else" */ | 
|  | 208 | g->categories = &g->catspace[-(CHAR_MIN)]; | 
|  | 209 | (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); | 
|  | 210 | g->backrefs = 0; | 
|  | 211 |  | 
|  | 212 | /* do it */ | 
|  | 213 | EMIT(OEND, 0); | 
|  | 214 | g->firststate = THERE(); | 
|  | 215 | if (cflags®_EXTENDED) | 
|  | 216 | p_ere(p, OUT); | 
|  | 217 | else if (cflags®_NOSPEC) | 
|  | 218 | p_str(p); | 
|  | 219 | else | 
|  | 220 | p_bre(p, OUT, OUT); | 
|  | 221 | EMIT(OEND, 0); | 
|  | 222 | g->laststate = THERE(); | 
|  | 223 |  | 
|  | 224 | /* tidy up loose ends and fill things in */ | 
|  | 225 | categorize(p, g); | 
|  | 226 | stripsnug(p, g); | 
|  | 227 | findmust(p, g); | 
|  | 228 | g->nplus = pluscount(p, g); | 
|  | 229 | g->magic = MAGIC2; | 
|  | 230 | preg->re_nsub = g->nsub; | 
|  | 231 | preg->re_g = g; | 
|  | 232 | preg->re_magic = MAGIC1; | 
|  | 233 | #ifndef REDEBUG | 
|  | 234 | /* not debugging, so can't rely on the assert() in regexec() */ | 
|  | 235 | if (g->iflags&BAD) | 
|  | 236 | SETERROR(REG_ASSERT); | 
|  | 237 | #endif | 
|  | 238 |  | 
|  | 239 | /* win or lose, we're done */ | 
|  | 240 | if (p->error != 0)	/* lose */ | 
|  | 241 | regfree(preg); | 
|  | 242 | return(p->error); | 
|  | 243 | } | 
|  | 244 |  | 
|  | 245 | /* | 
|  | 246 | - p_ere - ERE parser top level, concatenation and alternation | 
|  | 247 | */ | 
|  | 248 | static void | 
|  | 249 | p_ere(struct parse *p, int stop)	/* character this ERE should end at */ | 
|  | 250 | { | 
|  | 251 | char c; | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 252 | sopno prevback = 0; | 
|  | 253 | sopno prevfwd = 0; | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 254 | sopno conc; | 
|  | 255 | int first = 1;		/* is this the first alternative? */ | 
|  | 256 |  | 
|  | 257 | for (;;) { | 
|  | 258 | /* do a bunch of concatenated expressions */ | 
|  | 259 | conc = HERE(); | 
|  | 260 | while (MORE() && (c = PEEK()) != '|' && c != stop) | 
|  | 261 | p_ere_exp(p); | 
|  | 262 | REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */ | 
|  | 263 |  | 
|  | 264 | if (!EAT('|')) | 
|  | 265 | break;		/* NOTE BREAK OUT */ | 
|  | 266 |  | 
|  | 267 | if (first) { | 
|  | 268 | INSERT(OCH_, conc);	/* offset is wrong */ | 
|  | 269 | prevfwd = conc; | 
|  | 270 | prevback = conc; | 
|  | 271 | first = 0; | 
|  | 272 | } | 
|  | 273 | ASTERN(OOR1, prevback); | 
|  | 274 | prevback = THERE(); | 
|  | 275 | AHEAD(prevfwd);			/* fix previous offset */ | 
|  | 276 | prevfwd = HERE(); | 
|  | 277 | EMIT(OOR2, 0);			/* offset is very wrong */ | 
|  | 278 | } | 
|  | 279 |  | 
|  | 280 | if (!first) {		/* tail-end fixups */ | 
|  | 281 | AHEAD(prevfwd); | 
|  | 282 | ASTERN(O_CH, prevback); | 
|  | 283 | } | 
|  | 284 |  | 
|  | 285 | assert(!MORE() || SEE(stop)); | 
|  | 286 | } | 
|  | 287 |  | 
|  | 288 | /* | 
|  | 289 | - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op | 
|  | 290 | */ | 
|  | 291 | static void | 
|  | 292 | p_ere_exp(struct parse *p) | 
|  | 293 | { | 
|  | 294 | char c; | 
|  | 295 | sopno pos; | 
|  | 296 | int count; | 
|  | 297 | int count2; | 
|  | 298 | sopno subno; | 
|  | 299 | int wascaret = 0; | 
|  | 300 |  | 
|  | 301 | assert(MORE());		/* caller should have ensured this */ | 
|  | 302 | c = GETNEXT(); | 
|  | 303 |  | 
|  | 304 | pos = HERE(); | 
|  | 305 | switch (c) { | 
|  | 306 | case '(': | 
|  | 307 | REQUIRE(MORE(), REG_EPAREN); | 
|  | 308 | p->g->nsub++; | 
|  | 309 | subno = p->g->nsub; | 
|  | 310 | if (subno < NPAREN) | 
|  | 311 | p->pbegin[subno] = HERE(); | 
|  | 312 | EMIT(OLPAREN, subno); | 
|  | 313 | if (!SEE(')')) | 
|  | 314 | p_ere(p, ')'); | 
|  | 315 | if (subno < NPAREN) { | 
|  | 316 | p->pend[subno] = HERE(); | 
|  | 317 | assert(p->pend[subno] != 0); | 
|  | 318 | } | 
|  | 319 | EMIT(ORPAREN, subno); | 
|  | 320 | MUSTEAT(')', REG_EPAREN); | 
|  | 321 | break; | 
|  | 322 | #ifndef POSIX_MISTAKE | 
|  | 323 | case ')':		/* happens only if no current unmatched ( */ | 
|  | 324 | /* | 
|  | 325 | * You may ask, why the ifndef?  Because I didn't notice | 
|  | 326 | * this until slightly too late for 1003.2, and none of the | 
|  | 327 | * other 1003.2 regular-expression reviewers noticed it at | 
|  | 328 | * all.  So an unmatched ) is legal POSIX, at least until | 
|  | 329 | * we can get it fixed. | 
|  | 330 | */ | 
|  | 331 | SETERROR(REG_EPAREN); | 
|  | 332 | break; | 
|  | 333 | #endif | 
|  | 334 | case '^': | 
|  | 335 | EMIT(OBOL, 0); | 
|  | 336 | p->g->iflags |= USEBOL; | 
|  | 337 | p->g->nbol++; | 
|  | 338 | wascaret = 1; | 
|  | 339 | break; | 
|  | 340 | case '$': | 
|  | 341 | EMIT(OEOL, 0); | 
|  | 342 | p->g->iflags |= USEEOL; | 
|  | 343 | p->g->neol++; | 
|  | 344 | break; | 
|  | 345 | case '|': | 
|  | 346 | SETERROR(REG_EMPTY); | 
|  | 347 | break; | 
|  | 348 | case '*': | 
|  | 349 | case '+': | 
|  | 350 | case '?': | 
|  | 351 | SETERROR(REG_BADRPT); | 
|  | 352 | break; | 
|  | 353 | case '.': | 
|  | 354 | if (p->g->cflags®_NEWLINE) | 
|  | 355 | nonnewline(p); | 
|  | 356 | else | 
|  | 357 | EMIT(OANY, 0); | 
|  | 358 | break; | 
|  | 359 | case '[': | 
|  | 360 | p_bracket(p); | 
|  | 361 | break; | 
|  | 362 | case '\\': | 
|  | 363 | REQUIRE(MORE(), REG_EESCAPE); | 
|  | 364 | c = GETNEXT(); | 
|  | 365 | ordinary(p, c); | 
|  | 366 | break; | 
|  | 367 | case '{':		/* okay as ordinary except if digit follows */ | 
|  | 368 | REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); | 
|  | 369 | /* FALLTHROUGH */ | 
|  | 370 | default: | 
|  | 371 | ordinary(p, c); | 
|  | 372 | break; | 
|  | 373 | } | 
|  | 374 |  | 
|  | 375 | if (!MORE()) | 
|  | 376 | return; | 
|  | 377 | c = PEEK(); | 
|  | 378 | /* we call { a repetition if followed by a digit */ | 
|  | 379 | if (!( c == '*' || c == '+' || c == '?' || | 
|  | 380 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) | 
|  | 381 | return;		/* no repetition, we're done */ | 
|  | 382 | NEXT(); | 
|  | 383 |  | 
|  | 384 | REQUIRE(!wascaret, REG_BADRPT); | 
|  | 385 | switch (c) { | 
|  | 386 | case '*':	/* implemented as +? */ | 
|  | 387 | /* this case does not require the (y|) trick, noKLUDGE */ | 
|  | 388 | INSERT(OPLUS_, pos); | 
|  | 389 | ASTERN(O_PLUS, pos); | 
|  | 390 | INSERT(OQUEST_, pos); | 
|  | 391 | ASTERN(O_QUEST, pos); | 
|  | 392 | break; | 
|  | 393 | case '+': | 
|  | 394 | INSERT(OPLUS_, pos); | 
|  | 395 | ASTERN(O_PLUS, pos); | 
|  | 396 | break; | 
|  | 397 | case '?': | 
|  | 398 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | 
|  | 399 | INSERT(OCH_, pos);		/* offset slightly wrong */ | 
|  | 400 | ASTERN(OOR1, pos);		/* this one's right */ | 
|  | 401 | AHEAD(pos);			/* fix the OCH_ */ | 
|  | 402 | EMIT(OOR2, 0);			/* offset very wrong... */ | 
|  | 403 | AHEAD(THERE());			/* ...so fix it */ | 
|  | 404 | ASTERN(O_CH, THERETHERE()); | 
|  | 405 | break; | 
|  | 406 | case '{': | 
|  | 407 | count = p_count(p); | 
|  | 408 | if (EAT(',')) { | 
|  | 409 | if (isdigit((uch)PEEK())) { | 
|  | 410 | count2 = p_count(p); | 
|  | 411 | REQUIRE(count <= count2, REG_BADBR); | 
|  | 412 | } else		/* single number with comma */ | 
|  | 413 | count2 = INFINITY; | 
|  | 414 | } else		/* just a single number */ | 
|  | 415 | count2 = count; | 
|  | 416 | repeat(p, pos, count, count2); | 
|  | 417 | if (!EAT('}')) {	/* error heuristics */ | 
|  | 418 | while (MORE() && PEEK() != '}') | 
|  | 419 | NEXT(); | 
|  | 420 | REQUIRE(MORE(), REG_EBRACE); | 
|  | 421 | SETERROR(REG_BADBR); | 
|  | 422 | } | 
|  | 423 | break; | 
|  | 424 | } | 
|  | 425 |  | 
|  | 426 | if (!MORE()) | 
|  | 427 | return; | 
|  | 428 | c = PEEK(); | 
|  | 429 | if (!( c == '*' || c == '+' || c == '?' || | 
|  | 430 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) | 
|  | 431 | return; | 
|  | 432 | SETERROR(REG_BADRPT); | 
|  | 433 | } | 
|  | 434 |  | 
|  | 435 | /* | 
|  | 436 | - p_str - string (no metacharacters) "parser" | 
|  | 437 | */ | 
|  | 438 | static void | 
|  | 439 | p_str(struct parse *p) | 
|  | 440 | { | 
|  | 441 | REQUIRE(MORE(), REG_EMPTY); | 
|  | 442 | while (MORE()) | 
|  | 443 | ordinary(p, GETNEXT()); | 
|  | 444 | } | 
|  | 445 |  | 
|  | 446 | /* | 
|  | 447 | - p_bre - BRE parser top level, anchoring and concatenation | 
|  | 448 | * Giving end1 as OUT essentially eliminates the end1/end2 check. | 
|  | 449 | * | 
|  | 450 | * This implementation is a bit of a kludge, in that a trailing $ is first | 
|  | 451 | * taken as an ordinary character and then revised to be an anchor.  The | 
|  | 452 | * only undesirable side effect is that '$' gets included as a character | 
|  | 453 | * category in such cases.  This is fairly harmless; not worth fixing. | 
|  | 454 | * The amount of lookahead needed to avoid this kludge is excessive. | 
|  | 455 | */ | 
|  | 456 | static void | 
|  | 457 | p_bre(struct parse *p, | 
|  | 458 | int end1,		/* first terminating character */ | 
|  | 459 | int end2)		/* second terminating character */ | 
|  | 460 | { | 
|  | 461 | sopno start = HERE(); | 
|  | 462 | int first = 1;			/* first subexpression? */ | 
|  | 463 | int wasdollar = 0; | 
|  | 464 |  | 
|  | 465 | if (EAT('^')) { | 
|  | 466 | EMIT(OBOL, 0); | 
|  | 467 | p->g->iflags |= USEBOL; | 
|  | 468 | p->g->nbol++; | 
|  | 469 | } | 
|  | 470 | while (MORE() && !SEETWO(end1, end2)) { | 
|  | 471 | wasdollar = p_simp_re(p, first); | 
|  | 472 | first = 0; | 
|  | 473 | } | 
|  | 474 | if (wasdollar) {	/* oops, that was a trailing anchor */ | 
|  | 475 | DROP(1); | 
|  | 476 | EMIT(OEOL, 0); | 
|  | 477 | p->g->iflags |= USEEOL; | 
|  | 478 | p->g->neol++; | 
|  | 479 | } | 
|  | 480 |  | 
|  | 481 | REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */ | 
|  | 482 | } | 
|  | 483 |  | 
|  | 484 | /* | 
|  | 485 | - p_simp_re - parse a simple RE, an atom possibly followed by a repetition | 
|  | 486 | */ | 
|  | 487 | static int			/* was the simple RE an unbackslashed $? */ | 
|  | 488 | p_simp_re(struct parse *p, | 
|  | 489 | int starordinary)		/* is a leading * an ordinary character? */ | 
|  | 490 | { | 
|  | 491 | int c; | 
|  | 492 | int count; | 
|  | 493 | int count2; | 
|  | 494 | sopno pos; | 
|  | 495 | int i; | 
|  | 496 | sopno subno; | 
|  | 497 | #	define	BACKSL	(1<<CHAR_BIT) | 
|  | 498 |  | 
|  | 499 | pos = HERE();		/* repetion op, if any, covers from here */ | 
|  | 500 |  | 
|  | 501 | assert(MORE());		/* caller should have ensured this */ | 
|  | 502 | c = GETNEXT(); | 
|  | 503 | if (c == '\\') { | 
|  | 504 | REQUIRE(MORE(), REG_EESCAPE); | 
|  | 505 | c = BACKSL | GETNEXT(); | 
|  | 506 | } | 
|  | 507 | switch (c) { | 
|  | 508 | case '.': | 
|  | 509 | if (p->g->cflags®_NEWLINE) | 
|  | 510 | nonnewline(p); | 
|  | 511 | else | 
|  | 512 | EMIT(OANY, 0); | 
|  | 513 | break; | 
|  | 514 | case '[': | 
|  | 515 | p_bracket(p); | 
|  | 516 | break; | 
|  | 517 | case BACKSL|'{': | 
|  | 518 | SETERROR(REG_BADRPT); | 
|  | 519 | break; | 
|  | 520 | case BACKSL|'(': | 
|  | 521 | p->g->nsub++; | 
|  | 522 | subno = p->g->nsub; | 
|  | 523 | if (subno < NPAREN) | 
|  | 524 | p->pbegin[subno] = HERE(); | 
|  | 525 | EMIT(OLPAREN, subno); | 
|  | 526 | /* the MORE here is an error heuristic */ | 
|  | 527 | if (MORE() && !SEETWO('\\', ')')) | 
|  | 528 | p_bre(p, '\\', ')'); | 
|  | 529 | if (subno < NPAREN) { | 
|  | 530 | p->pend[subno] = HERE(); | 
|  | 531 | assert(p->pend[subno] != 0); | 
|  | 532 | } | 
|  | 533 | EMIT(ORPAREN, subno); | 
|  | 534 | REQUIRE(EATTWO('\\', ')'), REG_EPAREN); | 
|  | 535 | break; | 
|  | 536 | case BACKSL|')':	/* should not get here -- must be user */ | 
|  | 537 | case BACKSL|'}': | 
|  | 538 | SETERROR(REG_EPAREN); | 
|  | 539 | break; | 
|  | 540 | case BACKSL|'1': | 
|  | 541 | case BACKSL|'2': | 
|  | 542 | case BACKSL|'3': | 
|  | 543 | case BACKSL|'4': | 
|  | 544 | case BACKSL|'5': | 
|  | 545 | case BACKSL|'6': | 
|  | 546 | case BACKSL|'7': | 
|  | 547 | case BACKSL|'8': | 
|  | 548 | case BACKSL|'9': | 
|  | 549 | i = (c&~BACKSL) - '0'; | 
|  | 550 | assert(i < NPAREN); | 
|  | 551 | if (p->pend[i] != 0) { | 
|  | 552 | assert(i <= p->g->nsub); | 
|  | 553 | EMIT(OBACK_, i); | 
|  | 554 | assert(p->pbegin[i] != 0); | 
|  | 555 | assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); | 
|  | 556 | assert(OP(p->strip[p->pend[i]]) == ORPAREN); | 
|  | 557 | (void) dupl(p, p->pbegin[i]+1, p->pend[i]); | 
|  | 558 | EMIT(O_BACK, i); | 
|  | 559 | } else | 
|  | 560 | SETERROR(REG_ESUBREG); | 
|  | 561 | p->g->backrefs = 1; | 
|  | 562 | break; | 
|  | 563 | case '*': | 
|  | 564 | REQUIRE(starordinary, REG_BADRPT); | 
|  | 565 | /* FALLTHROUGH */ | 
|  | 566 | default: | 
|  | 567 | ordinary(p, (char)c); | 
|  | 568 | break; | 
|  | 569 | } | 
|  | 570 |  | 
|  | 571 | if (EAT('*')) {		/* implemented as +? */ | 
|  | 572 | /* this case does not require the (y|) trick, noKLUDGE */ | 
|  | 573 | INSERT(OPLUS_, pos); | 
|  | 574 | ASTERN(O_PLUS, pos); | 
|  | 575 | INSERT(OQUEST_, pos); | 
|  | 576 | ASTERN(O_QUEST, pos); | 
|  | 577 | } else if (EATTWO('\\', '{')) { | 
|  | 578 | count = p_count(p); | 
|  | 579 | if (EAT(',')) { | 
|  | 580 | if (MORE() && isdigit((uch)PEEK())) { | 
|  | 581 | count2 = p_count(p); | 
|  | 582 | REQUIRE(count <= count2, REG_BADBR); | 
|  | 583 | } else		/* single number with comma */ | 
|  | 584 | count2 = INFINITY; | 
|  | 585 | } else		/* just a single number */ | 
|  | 586 | count2 = count; | 
|  | 587 | repeat(p, pos, count, count2); | 
|  | 588 | if (!EATTWO('\\', '}')) {	/* error heuristics */ | 
|  | 589 | while (MORE() && !SEETWO('\\', '}')) | 
|  | 590 | NEXT(); | 
|  | 591 | REQUIRE(MORE(), REG_EBRACE); | 
|  | 592 | SETERROR(REG_BADBR); | 
|  | 593 | } | 
|  | 594 | } else if (c == '$')	/* $ (but not \$) ends it */ | 
|  | 595 | return(1); | 
|  | 596 |  | 
|  | 597 | return(0); | 
|  | 598 | } | 
|  | 599 |  | 
|  | 600 | /* | 
|  | 601 | - p_count - parse a repetition count | 
|  | 602 | */ | 
|  | 603 | static int			/* the value */ | 
|  | 604 | p_count(struct parse *p) | 
|  | 605 | { | 
|  | 606 | int count = 0; | 
|  | 607 | int ndigits = 0; | 
|  | 608 |  | 
|  | 609 | while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { | 
|  | 610 | count = count*10 + (GETNEXT() - '0'); | 
|  | 611 | ndigits++; | 
|  | 612 | } | 
|  | 613 |  | 
|  | 614 | REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); | 
|  | 615 | return(count); | 
|  | 616 | } | 
|  | 617 |  | 
|  | 618 | /* | 
|  | 619 | - p_bracket - parse a bracketed character list | 
|  | 620 | * | 
|  | 621 | * Note a significant property of this code:  if the allocset() did SETERROR, | 
|  | 622 | * no set operations are done. | 
|  | 623 | */ | 
|  | 624 | static void | 
|  | 625 | p_bracket(struct parse *p) | 
|  | 626 | { | 
|  | 627 | cset *cs; | 
|  | 628 | int invert = 0; | 
|  | 629 |  | 
|  | 630 | /* Dept of Truly Sickening Special-Case Kludges */ | 
|  | 631 | if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { | 
|  | 632 | EMIT(OBOW, 0); | 
|  | 633 | NEXTn(6); | 
|  | 634 | return; | 
|  | 635 | } | 
|  | 636 | if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { | 
|  | 637 | EMIT(OEOW, 0); | 
|  | 638 | NEXTn(6); | 
|  | 639 | return; | 
|  | 640 | } | 
|  | 641 |  | 
|  | 642 | if ((cs = allocset(p)) == NULL) { | 
|  | 643 | /* allocset did set error status in p */ | 
|  | 644 | return; | 
|  | 645 | } | 
|  | 646 |  | 
|  | 647 | if (EAT('^')) | 
|  | 648 | invert++;	/* make note to invert set at end */ | 
|  | 649 | if (EAT(']')) | 
|  | 650 | CHadd(cs, ']'); | 
|  | 651 | else if (EAT('-')) | 
|  | 652 | CHadd(cs, '-'); | 
|  | 653 | while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) | 
|  | 654 | p_b_term(p, cs); | 
|  | 655 | if (EAT('-')) | 
|  | 656 | CHadd(cs, '-'); | 
|  | 657 | MUSTEAT(']', REG_EBRACK); | 
|  | 658 |  | 
|  | 659 | if (p->error != 0) {	/* don't mess things up further */ | 
|  | 660 | freeset(p, cs); | 
|  | 661 | return; | 
|  | 662 | } | 
|  | 663 |  | 
|  | 664 | if (p->g->cflags®_ICASE) { | 
|  | 665 | int i; | 
|  | 666 | int ci; | 
|  | 667 |  | 
|  | 668 | for (i = p->g->csetsize - 1; i >= 0; i--) | 
|  | 669 | if (CHIN(cs, i) && isalpha(i)) { | 
|  | 670 | ci = othercase(i); | 
|  | 671 | if (ci != i) | 
|  | 672 | CHadd(cs, ci); | 
|  | 673 | } | 
|  | 674 | if (cs->multis != NULL) | 
|  | 675 | mccase(p, cs); | 
|  | 676 | } | 
|  | 677 | if (invert) { | 
|  | 678 | int i; | 
|  | 679 |  | 
|  | 680 | for (i = p->g->csetsize - 1; i >= 0; i--) | 
|  | 681 | if (CHIN(cs, i)) | 
|  | 682 | CHsub(cs, i); | 
|  | 683 | else | 
|  | 684 | CHadd(cs, i); | 
|  | 685 | if (p->g->cflags®_NEWLINE) | 
|  | 686 | CHsub(cs, '\n'); | 
|  | 687 | if (cs->multis != NULL) | 
|  | 688 | mcinvert(p, cs); | 
|  | 689 | } | 
|  | 690 |  | 
|  | 691 | assert(cs->multis == NULL);		/* xxx */ | 
|  | 692 |  | 
|  | 693 | if (nch(p, cs) == 1) {		/* optimize singleton sets */ | 
|  | 694 | ordinary(p, firstch(p, cs)); | 
|  | 695 | freeset(p, cs); | 
|  | 696 | } else | 
|  | 697 | EMIT(OANYOF, freezeset(p, cs)); | 
|  | 698 | } | 
|  | 699 |  | 
|  | 700 | /* | 
|  | 701 | - p_b_term - parse one term of a bracketed character list | 
|  | 702 | */ | 
|  | 703 | static void | 
|  | 704 | p_b_term(struct parse *p, cset *cs) | 
|  | 705 | { | 
|  | 706 | char c; | 
|  | 707 | char start, finish; | 
|  | 708 | int i; | 
|  | 709 |  | 
|  | 710 | /* classify what we've got */ | 
|  | 711 | switch ((MORE()) ? PEEK() : '\0') { | 
|  | 712 | case '[': | 
|  | 713 | c = (MORE2()) ? PEEK2() : '\0'; | 
|  | 714 | break; | 
|  | 715 | case '-': | 
|  | 716 | SETERROR(REG_ERANGE); | 
|  | 717 | return;			/* NOTE RETURN */ | 
|  | 718 | break; | 
|  | 719 | default: | 
|  | 720 | c = '\0'; | 
|  | 721 | break; | 
|  | 722 | } | 
|  | 723 |  | 
|  | 724 | switch (c) { | 
|  | 725 | case ':':		/* character class */ | 
|  | 726 | NEXT2(); | 
|  | 727 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 728 | c = PEEK(); | 
|  | 729 | REQUIRE(c != '-' && c != ']', REG_ECTYPE); | 
|  | 730 | p_b_cclass(p, cs); | 
|  | 731 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 732 | REQUIRE(EATTWO(':', ']'), REG_ECTYPE); | 
|  | 733 | break; | 
|  | 734 | case '=':		/* equivalence class */ | 
|  | 735 | NEXT2(); | 
|  | 736 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 737 | c = PEEK(); | 
|  | 738 | REQUIRE(c != '-' && c != ']', REG_ECOLLATE); | 
|  | 739 | p_b_eclass(p, cs); | 
|  | 740 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 741 | REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); | 
|  | 742 | break; | 
|  | 743 | default:		/* symbol, ordinary character, or range */ | 
|  | 744 | /* xxx revision needed for multichar stuff */ | 
|  | 745 | start = p_b_symbol(p); | 
|  | 746 | if (SEE('-') && MORE2() && PEEK2() != ']') { | 
|  | 747 | /* range */ | 
|  | 748 | NEXT(); | 
|  | 749 | if (EAT('-')) | 
|  | 750 | finish = '-'; | 
|  | 751 | else | 
|  | 752 | finish = p_b_symbol(p); | 
|  | 753 | } else | 
|  | 754 | finish = start; | 
|  | 755 | /* xxx what about signed chars here... */ | 
|  | 756 | REQUIRE(start <= finish, REG_ERANGE); | 
|  | 757 | for (i = start; i <= finish; i++) | 
|  | 758 | CHadd(cs, i); | 
|  | 759 | break; | 
|  | 760 | } | 
|  | 761 | } | 
|  | 762 |  | 
|  | 763 | /* | 
|  | 764 | - p_b_cclass - parse a character-class name and deal with it | 
|  | 765 | */ | 
|  | 766 | static void | 
|  | 767 | p_b_cclass(struct parse *p, cset *cs) | 
|  | 768 | { | 
|  | 769 | char *sp = p->next; | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 770 | const struct cclass *cp; | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 771 | size_t len; | 
|  | 772 | char *u; | 
|  | 773 | char c; | 
|  | 774 |  | 
|  | 775 | while (MORE() && isalpha(PEEK())) | 
|  | 776 | NEXT(); | 
|  | 777 | len = p->next - sp; | 
|  | 778 | for (cp = cclasses; cp->name != NULL; cp++) | 
|  | 779 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') | 
|  | 780 | break; | 
|  | 781 | if (cp->name == NULL) { | 
|  | 782 | /* oops, didn't find it */ | 
|  | 783 | SETERROR(REG_ECTYPE); | 
|  | 784 | return; | 
|  | 785 | } | 
|  | 786 |  | 
|  | 787 | u = cp->chars; | 
|  | 788 | while ((c = *u++) != '\0') | 
|  | 789 | CHadd(cs, c); | 
|  | 790 | for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) | 
|  | 791 | MCadd(p, cs, u); | 
|  | 792 | } | 
|  | 793 |  | 
|  | 794 | /* | 
|  | 795 | - p_b_eclass - parse an equivalence-class name and deal with it | 
|  | 796 | * | 
|  | 797 | * This implementation is incomplete. xxx | 
|  | 798 | */ | 
|  | 799 | static void | 
|  | 800 | p_b_eclass(struct parse *p, cset *cs) | 
|  | 801 | { | 
|  | 802 | char c; | 
|  | 803 |  | 
|  | 804 | c = p_b_coll_elem(p, '='); | 
|  | 805 | CHadd(cs, c); | 
|  | 806 | } | 
|  | 807 |  | 
|  | 808 | /* | 
|  | 809 | - p_b_symbol - parse a character or [..]ed multicharacter collating symbol | 
|  | 810 | */ | 
|  | 811 | static char			/* value of symbol */ | 
|  | 812 | p_b_symbol(struct parse *p) | 
|  | 813 | { | 
|  | 814 | char value; | 
|  | 815 |  | 
|  | 816 | REQUIRE(MORE(), REG_EBRACK); | 
|  | 817 | if (!EATTWO('[', '.')) | 
|  | 818 | return(GETNEXT()); | 
|  | 819 |  | 
|  | 820 | /* collating symbol */ | 
|  | 821 | value = p_b_coll_elem(p, '.'); | 
|  | 822 | REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); | 
|  | 823 | return(value); | 
|  | 824 | } | 
|  | 825 |  | 
|  | 826 | /* | 
|  | 827 | - p_b_coll_elem - parse a collating-element name and look it up | 
|  | 828 | */ | 
|  | 829 | static char			/* value of collating element */ | 
|  | 830 | p_b_coll_elem(struct parse *p, | 
|  | 831 | int endc)			/* name ended by endc,']' */ | 
|  | 832 | { | 
|  | 833 | char *sp = p->next; | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 834 | const struct cname *cp; | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 835 | int len; | 
|  | 836 |  | 
|  | 837 | while (MORE() && !SEETWO(endc, ']')) | 
|  | 838 | NEXT(); | 
|  | 839 | if (!MORE()) { | 
|  | 840 | SETERROR(REG_EBRACK); | 
|  | 841 | return(0); | 
|  | 842 | } | 
|  | 843 | len = p->next - sp; | 
|  | 844 | for (cp = cnames; cp->name != NULL; cp++) | 
|  | 845 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') | 
|  | 846 | return(cp->code);	/* known name */ | 
|  | 847 | if (len == 1) | 
|  | 848 | return(*sp);	/* single character */ | 
|  | 849 | SETERROR(REG_ECOLLATE);			/* neither */ | 
|  | 850 | return(0); | 
|  | 851 | } | 
|  | 852 |  | 
|  | 853 | /* | 
|  | 854 | - othercase - return the case counterpart of an alphabetic | 
|  | 855 | */ | 
|  | 856 | static char			/* if no counterpart, return ch */ | 
|  | 857 | othercase(int ch) | 
|  | 858 | { | 
|  | 859 | ch = (uch)ch; | 
|  | 860 | assert(isalpha(ch)); | 
|  | 861 | if (isupper(ch)) | 
|  | 862 | return ((uch)tolower(ch)); | 
|  | 863 | else if (islower(ch)) | 
|  | 864 | return ((uch)toupper(ch)); | 
|  | 865 | else			/* peculiar, but could happen */ | 
|  | 866 | return(ch); | 
|  | 867 | } | 
|  | 868 |  | 
|  | 869 | /* | 
|  | 870 | - bothcases - emit a dualcase version of a two-case character | 
|  | 871 | * | 
|  | 872 | * Boy, is this implementation ever a kludge... | 
|  | 873 | */ | 
|  | 874 | static void | 
|  | 875 | bothcases(struct parse *p, int ch) | 
|  | 876 | { | 
|  | 877 | char *oldnext = p->next; | 
|  | 878 | char *oldend = p->end; | 
|  | 879 | char bracket[3]; | 
|  | 880 |  | 
|  | 881 | ch = (uch)ch; | 
|  | 882 | assert(othercase(ch) != ch);	/* p_bracket() would recurse */ | 
|  | 883 | p->next = bracket; | 
|  | 884 | p->end = bracket+2; | 
|  | 885 | bracket[0] = ch; | 
|  | 886 | bracket[1] = ']'; | 
|  | 887 | bracket[2] = '\0'; | 
|  | 888 | p_bracket(p); | 
|  | 889 | assert(p->next == bracket+2); | 
|  | 890 | p->next = oldnext; | 
|  | 891 | p->end = oldend; | 
|  | 892 | } | 
|  | 893 |  | 
|  | 894 | /* | 
|  | 895 | - ordinary - emit an ordinary character | 
|  | 896 | */ | 
|  | 897 | static void | 
|  | 898 | ordinary(struct parse *p, int ch) | 
|  | 899 | { | 
|  | 900 | cat_t *cap = p->g->categories; | 
|  | 901 |  | 
|  | 902 | if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) | 
|  | 903 | bothcases(p, ch); | 
|  | 904 | else { | 
|  | 905 | EMIT(OCHAR, (uch)ch); | 
|  | 906 | if (cap[ch] == 0) | 
|  | 907 | cap[ch] = p->g->ncategories++; | 
|  | 908 | } | 
|  | 909 | } | 
|  | 910 |  | 
|  | 911 | /* | 
|  | 912 | - nonnewline - emit REG_NEWLINE version of OANY | 
|  | 913 | * | 
|  | 914 | * Boy, is this implementation ever a kludge... | 
|  | 915 | */ | 
|  | 916 | static void | 
|  | 917 | nonnewline(struct parse *p) | 
|  | 918 | { | 
|  | 919 | char *oldnext = p->next; | 
|  | 920 | char *oldend = p->end; | 
|  | 921 | char bracket[4]; | 
|  | 922 |  | 
|  | 923 | p->next = bracket; | 
|  | 924 | p->end = bracket+3; | 
|  | 925 | bracket[0] = '^'; | 
|  | 926 | bracket[1] = '\n'; | 
|  | 927 | bracket[2] = ']'; | 
|  | 928 | bracket[3] = '\0'; | 
|  | 929 | p_bracket(p); | 
|  | 930 | assert(p->next == bracket+3); | 
|  | 931 | p->next = oldnext; | 
|  | 932 | p->end = oldend; | 
|  | 933 | } | 
|  | 934 |  | 
|  | 935 | /* | 
|  | 936 | - repeat - generate code for a bounded repetition, recursively if needed | 
|  | 937 | */ | 
|  | 938 | static void | 
|  | 939 | repeat(struct parse *p, | 
|  | 940 | sopno start,		/* operand from here to end of strip */ | 
|  | 941 | int from,			/* repeated from this number */ | 
|  | 942 | int to)			/* to this number of times (maybe INFINITY) */ | 
|  | 943 | { | 
|  | 944 | sopno finish = HERE(); | 
|  | 945 | #	define	N	2 | 
|  | 946 | #	define	INF	3 | 
|  | 947 | #	define	REP(f, t)	((f)*8 + (t)) | 
|  | 948 | #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) | 
|  | 949 | sopno copy; | 
|  | 950 |  | 
|  | 951 | if (p->error != 0)	/* head off possible runaway recursion */ | 
|  | 952 | return; | 
|  | 953 |  | 
|  | 954 | assert(from <= to); | 
|  | 955 |  | 
|  | 956 | switch (REP(MAP(from), MAP(to))) { | 
|  | 957 | case REP(0, 0):			/* must be user doing this */ | 
|  | 958 | DROP(finish-start);	/* drop the operand */ | 
|  | 959 | break; | 
|  | 960 | case REP(0, 1):			/* as x{1,1}? */ | 
|  | 961 | case REP(0, N):			/* as x{1,n}? */ | 
|  | 962 | case REP(0, INF):		/* as x{1,}? */ | 
|  | 963 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | 
|  | 964 | INSERT(OCH_, start);		/* offset is wrong... */ | 
|  | 965 | repeat(p, start+1, 1, to); | 
|  | 966 | ASTERN(OOR1, start); | 
|  | 967 | AHEAD(start);			/* ... fix it */ | 
|  | 968 | EMIT(OOR2, 0); | 
|  | 969 | AHEAD(THERE()); | 
|  | 970 | ASTERN(O_CH, THERETHERE()); | 
|  | 971 | break; | 
|  | 972 | case REP(1, 1):			/* trivial case */ | 
|  | 973 | /* done */ | 
|  | 974 | break; | 
|  | 975 | case REP(1, N):			/* as x?x{1,n-1} */ | 
|  | 976 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | 
|  | 977 | INSERT(OCH_, start); | 
|  | 978 | ASTERN(OOR1, start); | 
|  | 979 | AHEAD(start); | 
|  | 980 | EMIT(OOR2, 0);			/* offset very wrong... */ | 
|  | 981 | AHEAD(THERE());			/* ...so fix it */ | 
|  | 982 | ASTERN(O_CH, THERETHERE()); | 
|  | 983 | copy = dupl(p, start+1, finish+1); | 
|  | 984 | assert(copy == finish+4); | 
|  | 985 | repeat(p, copy, 1, to-1); | 
|  | 986 | break; | 
|  | 987 | case REP(1, INF):		/* as x+ */ | 
|  | 988 | INSERT(OPLUS_, start); | 
|  | 989 | ASTERN(O_PLUS, start); | 
|  | 990 | break; | 
|  | 991 | case REP(N, N):			/* as xx{m-1,n-1} */ | 
|  | 992 | copy = dupl(p, start, finish); | 
|  | 993 | repeat(p, copy, from-1, to-1); | 
|  | 994 | break; | 
|  | 995 | case REP(N, INF):		/* as xx{n-1,INF} */ | 
|  | 996 | copy = dupl(p, start, finish); | 
|  | 997 | repeat(p, copy, from-1, to); | 
|  | 998 | break; | 
|  | 999 | default:			/* "can't happen" */ | 
|  | 1000 | SETERROR(REG_ASSERT);	/* just in case */ | 
|  | 1001 | break; | 
|  | 1002 | } | 
|  | 1003 | } | 
|  | 1004 |  | 
|  | 1005 | /* | 
|  | 1006 | - seterr - set an error condition | 
|  | 1007 | */ | 
|  | 1008 | static int			/* useless but makes type checking happy */ | 
|  | 1009 | seterr(struct parse *p, int e) | 
|  | 1010 | { | 
|  | 1011 | if (p->error == 0)	/* keep earliest error condition */ | 
|  | 1012 | p->error = e; | 
|  | 1013 | p->next = nuls;		/* try to bring things to a halt */ | 
|  | 1014 | p->end = nuls; | 
|  | 1015 | return(0);		/* make the return value well-defined */ | 
|  | 1016 | } | 
|  | 1017 |  | 
|  | 1018 | /* | 
|  | 1019 | - allocset - allocate a set of characters for [] | 
|  | 1020 | */ | 
|  | 1021 | static cset * | 
|  | 1022 | allocset(struct parse *p) | 
|  | 1023 | { | 
|  | 1024 | int no = p->g->ncsets++; | 
|  | 1025 | size_t nc; | 
|  | 1026 | size_t nbytes; | 
|  | 1027 | cset *cs; | 
|  | 1028 | size_t css = (size_t)p->g->csetsize; | 
|  | 1029 | int i; | 
|  | 1030 |  | 
|  | 1031 | if (no >= p->ncsalloc) {	/* need another column of space */ | 
|  | 1032 | void *ptr; | 
|  | 1033 |  | 
|  | 1034 | p->ncsalloc += CHAR_BIT; | 
|  | 1035 | nc = p->ncsalloc; | 
|  | 1036 | assert(nc % CHAR_BIT == 0); | 
|  | 1037 | nbytes = nc / CHAR_BIT * css; | 
|  | 1038 |  | 
|  | 1039 | ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); | 
|  | 1040 | if (ptr == NULL) | 
|  | 1041 | goto nomem; | 
|  | 1042 | p->g->sets = ptr; | 
|  | 1043 |  | 
|  | 1044 | ptr = (uch *)realloc((char *)p->g->setbits, nbytes); | 
|  | 1045 | if (ptr == NULL) | 
|  | 1046 | goto nomem; | 
|  | 1047 | p->g->setbits = ptr; | 
|  | 1048 |  | 
|  | 1049 | for (i = 0; i < no; i++) | 
|  | 1050 | p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); | 
|  | 1051 |  | 
|  | 1052 | (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); | 
|  | 1053 | } | 
|  | 1054 | /* XXX should not happen */ | 
|  | 1055 | if (p->g->sets == NULL || p->g->setbits == NULL) | 
|  | 1056 | goto nomem; | 
|  | 1057 |  | 
|  | 1058 | cs = &p->g->sets[no]; | 
|  | 1059 | cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); | 
|  | 1060 | cs->mask = 1 << ((no) % CHAR_BIT); | 
|  | 1061 | cs->hash = 0; | 
|  | 1062 | cs->smultis = 0; | 
|  | 1063 | cs->multis = NULL; | 
|  | 1064 |  | 
|  | 1065 | return(cs); | 
|  | 1066 | nomem: | 
|  | 1067 | free(p->g->sets); | 
|  | 1068 | p->g->sets = NULL; | 
|  | 1069 | free(p->g->setbits); | 
|  | 1070 | p->g->setbits = NULL; | 
|  | 1071 |  | 
|  | 1072 | SETERROR(REG_ESPACE); | 
|  | 1073 | /* caller's responsibility not to do set ops */ | 
|  | 1074 | return(NULL); | 
|  | 1075 | } | 
|  | 1076 |  | 
|  | 1077 | /* | 
|  | 1078 | - freeset - free a now-unused set | 
|  | 1079 | */ | 
|  | 1080 | static void | 
|  | 1081 | freeset(struct parse *p, cset *cs) | 
|  | 1082 | { | 
|  | 1083 | int i; | 
|  | 1084 | cset *top = &p->g->sets[p->g->ncsets]; | 
|  | 1085 | size_t css = (size_t)p->g->csetsize; | 
|  | 1086 |  | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 1087 | for (i = 0; i < (ssize_t)css; i++) | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 1088 | CHsub(cs, i); | 
|  | 1089 | if (cs == top-1)	/* recover only the easy case */ | 
|  | 1090 | p->g->ncsets--; | 
|  | 1091 | } | 
|  | 1092 |  | 
|  | 1093 | /* | 
|  | 1094 | - freezeset - final processing on a set of characters | 
|  | 1095 | * | 
|  | 1096 | * The main task here is merging identical sets.  This is usually a waste | 
|  | 1097 | * of time (although the hash code minimizes the overhead), but can win | 
|  | 1098 | * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash | 
|  | 1099 | * is done using addition rather than xor -- all ASCII [aA] sets xor to | 
|  | 1100 | * the same value! | 
|  | 1101 | */ | 
|  | 1102 | static int			/* set number */ | 
|  | 1103 | freezeset(struct parse *p, cset *cs) | 
|  | 1104 | { | 
|  | 1105 | uch h = cs->hash; | 
|  | 1106 | int i; | 
|  | 1107 | cset *top = &p->g->sets[p->g->ncsets]; | 
|  | 1108 | cset *cs2; | 
|  | 1109 | size_t css = (size_t)p->g->csetsize; | 
|  | 1110 |  | 
|  | 1111 | /* look for an earlier one which is the same */ | 
|  | 1112 | for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) | 
|  | 1113 | if (cs2->hash == h && cs2 != cs) { | 
|  | 1114 | /* maybe */ | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 1115 | for (i = 0; i < (ssize_t)css; i++) | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 1116 | if (!!CHIN(cs2, i) != !!CHIN(cs, i)) | 
|  | 1117 | break;		/* no */ | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 1118 | if (i == (ssize_t)css) | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 1119 | break;			/* yes */ | 
|  | 1120 | } | 
|  | 1121 |  | 
|  | 1122 | if (cs2 < top) {	/* found one */ | 
|  | 1123 | freeset(p, cs); | 
|  | 1124 | cs = cs2; | 
|  | 1125 | } | 
|  | 1126 |  | 
|  | 1127 | return((int)(cs - p->g->sets)); | 
|  | 1128 | } | 
|  | 1129 |  | 
|  | 1130 | /* | 
|  | 1131 | - firstch - return first character in a set (which must have at least one) | 
|  | 1132 | */ | 
|  | 1133 | static int			/* character; there is no "none" value */ | 
|  | 1134 | firstch(struct parse *p, cset *cs) | 
|  | 1135 | { | 
|  | 1136 | int i; | 
|  | 1137 | size_t css = (size_t)p->g->csetsize; | 
|  | 1138 |  | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 1139 | for (i = 0; i < (ssize_t)css; i++) | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 1140 | if (CHIN(cs, i)) | 
|  | 1141 | return((char)i); | 
|  | 1142 | assert(never); | 
|  | 1143 | return(0);		/* arbitrary */ | 
|  | 1144 | } | 
|  | 1145 |  | 
|  | 1146 | /* | 
|  | 1147 | - nch - number of characters in a set | 
|  | 1148 | */ | 
|  | 1149 | static int | 
|  | 1150 | nch(struct parse *p, cset *cs) | 
|  | 1151 | { | 
|  | 1152 | int i; | 
|  | 1153 | size_t css = (size_t)p->g->csetsize; | 
|  | 1154 | int n = 0; | 
|  | 1155 |  | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 1156 | for (i = 0; i < (ssize_t)css; i++) | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 1157 | if (CHIN(cs, i)) | 
|  | 1158 | n++; | 
|  | 1159 | return(n); | 
|  | 1160 | } | 
|  | 1161 |  | 
|  | 1162 | /* | 
|  | 1163 | - mcadd - add a collating element to a cset | 
|  | 1164 | */ | 
|  | 1165 | static void | 
|  | 1166 | mcadd( struct parse *p, cset *cs, char *cp) | 
|  | 1167 | { | 
|  | 1168 | size_t oldend = cs->smultis; | 
|  | 1169 | void *np; | 
|  | 1170 |  | 
|  | 1171 | cs->smultis += strlen(cp) + 1; | 
|  | 1172 | np = realloc(cs->multis, cs->smultis); | 
|  | 1173 | if (np == NULL) { | 
|  | 1174 | if (cs->multis) | 
|  | 1175 | free(cs->multis); | 
|  | 1176 | cs->multis = NULL; | 
|  | 1177 | SETERROR(REG_ESPACE); | 
|  | 1178 | return; | 
|  | 1179 | } | 
|  | 1180 | cs->multis = np; | 
|  | 1181 |  | 
|  | 1182 | strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); | 
|  | 1183 | } | 
|  | 1184 |  | 
|  | 1185 | /* | 
|  | 1186 | - mcinvert - invert the list of collating elements in a cset | 
|  | 1187 | * | 
|  | 1188 | * This would have to know the set of possibilities.  Implementation | 
|  | 1189 | * is deferred. | 
|  | 1190 | */ | 
|  | 1191 | /* ARGSUSED */ | 
|  | 1192 | static void | 
|  | 1193 | mcinvert(struct parse *p, cset *cs) | 
|  | 1194 | { | 
|  | 1195 | assert(cs->multis == NULL);	/* xxx */ | 
|  | 1196 | } | 
|  | 1197 |  | 
|  | 1198 | /* | 
|  | 1199 | - mccase - add case counterparts of the list of collating elements in a cset | 
|  | 1200 | * | 
|  | 1201 | * This would have to know the set of possibilities.  Implementation | 
|  | 1202 | * is deferred. | 
|  | 1203 | */ | 
|  | 1204 | /* ARGSUSED */ | 
|  | 1205 | static void | 
|  | 1206 | mccase(struct parse *p, cset *cs) | 
|  | 1207 | { | 
|  | 1208 | assert(cs->multis == NULL);	/* xxx */ | 
|  | 1209 | } | 
|  | 1210 |  | 
|  | 1211 | /* | 
|  | 1212 | - isinsets - is this character in any sets? | 
|  | 1213 | */ | 
|  | 1214 | static int			/* predicate */ | 
|  | 1215 | isinsets(struct re_guts *g, int c) | 
|  | 1216 | { | 
|  | 1217 | uch *col; | 
|  | 1218 | int i; | 
|  | 1219 | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; | 
|  | 1220 | unsigned uc = (uch)c; | 
|  | 1221 |  | 
|  | 1222 | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) | 
|  | 1223 | if (col[uc] != 0) | 
|  | 1224 | return(1); | 
|  | 1225 | return(0); | 
|  | 1226 | } | 
|  | 1227 |  | 
|  | 1228 | /* | 
|  | 1229 | - samesets - are these two characters in exactly the same sets? | 
|  | 1230 | */ | 
|  | 1231 | static int			/* predicate */ | 
|  | 1232 | samesets(struct re_guts *g, int c1, int c2) | 
|  | 1233 | { | 
|  | 1234 | uch *col; | 
|  | 1235 | int i; | 
|  | 1236 | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; | 
|  | 1237 | unsigned uc1 = (uch)c1; | 
|  | 1238 | unsigned uc2 = (uch)c2; | 
|  | 1239 |  | 
|  | 1240 | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) | 
|  | 1241 | if (col[uc1] != col[uc2]) | 
|  | 1242 | return(0); | 
|  | 1243 | return(1); | 
|  | 1244 | } | 
|  | 1245 |  | 
|  | 1246 | /* | 
|  | 1247 | - categorize - sort out character categories | 
|  | 1248 | */ | 
|  | 1249 | static void | 
|  | 1250 | categorize(struct parse *p, struct re_guts *g) | 
|  | 1251 | { | 
|  | 1252 | cat_t *cats = g->categories; | 
|  | 1253 | int c; | 
|  | 1254 | int c2; | 
|  | 1255 | cat_t cat; | 
|  | 1256 |  | 
|  | 1257 | /* avoid making error situations worse */ | 
|  | 1258 | if (p->error != 0) | 
|  | 1259 | return; | 
|  | 1260 |  | 
|  | 1261 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | 
|  | 1262 | if (cats[c] == 0 && isinsets(g, c)) { | 
|  | 1263 | cat = g->ncategories++; | 
|  | 1264 | cats[c] = cat; | 
|  | 1265 | for (c2 = c+1; c2 <= CHAR_MAX; c2++) | 
|  | 1266 | if (cats[c2] == 0 && samesets(g, c, c2)) | 
|  | 1267 | cats[c2] = cat; | 
|  | 1268 | } | 
|  | 1269 | } | 
|  | 1270 |  | 
|  | 1271 | /* | 
|  | 1272 | - dupl - emit a duplicate of a bunch of sops | 
|  | 1273 | */ | 
|  | 1274 | static sopno			/* start of duplicate */ | 
|  | 1275 | dupl(struct parse *p, | 
|  | 1276 | sopno start,		/* from here */ | 
|  | 1277 | sopno finish)		/* to this less one */ | 
|  | 1278 | { | 
|  | 1279 | sopno ret = HERE(); | 
|  | 1280 | sopno len = finish - start; | 
|  | 1281 |  | 
|  | 1282 | assert(finish >= start); | 
|  | 1283 | if (len == 0) | 
|  | 1284 | return(ret); | 
|  | 1285 | enlarge(p, p->ssize + len);	/* this many unexpected additions */ | 
|  | 1286 | assert(p->ssize >= p->slen + len); | 
|  | 1287 | (void) memcpy((char *)(p->strip + p->slen), | 
|  | 1288 | (char *)(p->strip + start), (size_t)len*sizeof(sop)); | 
|  | 1289 | p->slen += len; | 
|  | 1290 | return(ret); | 
|  | 1291 | } | 
|  | 1292 |  | 
|  | 1293 | /* | 
|  | 1294 | - doemit - emit a strip operator | 
|  | 1295 | * | 
|  | 1296 | * It might seem better to implement this as a macro with a function as | 
|  | 1297 | * hard-case backup, but it's just too big and messy unless there are | 
|  | 1298 | * some changes to the data structures.  Maybe later. | 
|  | 1299 | */ | 
|  | 1300 | static void | 
|  | 1301 | doemit(struct parse *p, sop op, size_t opnd) | 
|  | 1302 | { | 
|  | 1303 | /* avoid making error situations worse */ | 
|  | 1304 | if (p->error != 0) | 
|  | 1305 | return; | 
|  | 1306 |  | 
|  | 1307 | /* deal with oversize operands ("can't happen", more or less) */ | 
|  | 1308 | assert(opnd < 1<<OPSHIFT); | 
|  | 1309 |  | 
|  | 1310 | /* deal with undersized strip */ | 
|  | 1311 | if (p->slen >= p->ssize) | 
|  | 1312 | enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */ | 
|  | 1313 | assert(p->slen < p->ssize); | 
|  | 1314 |  | 
|  | 1315 | /* finally, it's all reduced to the easy case */ | 
|  | 1316 | p->strip[p->slen++] = SOP(op, opnd); | 
|  | 1317 | } | 
|  | 1318 |  | 
|  | 1319 | /* | 
|  | 1320 | - doinsert - insert a sop into the strip | 
|  | 1321 | */ | 
|  | 1322 | static void | 
|  | 1323 | doinsert(struct parse *p, sop op, size_t opnd, sopno pos) | 
|  | 1324 | { | 
|  | 1325 | sopno sn; | 
|  | 1326 | sop s; | 
|  | 1327 | int i; | 
|  | 1328 |  | 
|  | 1329 | /* avoid making error situations worse */ | 
|  | 1330 | if (p->error != 0) | 
|  | 1331 | return; | 
|  | 1332 |  | 
|  | 1333 | sn = HERE(); | 
|  | 1334 | EMIT(op, opnd);		/* do checks, ensure space */ | 
|  | 1335 | assert(HERE() == sn+1); | 
|  | 1336 | s = p->strip[sn]; | 
|  | 1337 |  | 
|  | 1338 | /* adjust paren pointers */ | 
|  | 1339 | assert(pos > 0); | 
|  | 1340 | for (i = 1; i < NPAREN; i++) { | 
|  | 1341 | if (p->pbegin[i] >= pos) { | 
|  | 1342 | p->pbegin[i]++; | 
|  | 1343 | } | 
|  | 1344 | if (p->pend[i] >= pos) { | 
|  | 1345 | p->pend[i]++; | 
|  | 1346 | } | 
|  | 1347 | } | 
|  | 1348 |  | 
|  | 1349 | memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], | 
|  | 1350 | (HERE()-pos-1)*sizeof(sop)); | 
|  | 1351 | p->strip[pos] = s; | 
|  | 1352 | } | 
|  | 1353 |  | 
|  | 1354 | /* | 
|  | 1355 | - dofwd - complete a forward reference | 
|  | 1356 | */ | 
|  | 1357 | static void | 
|  | 1358 | dofwd(struct parse *p, sopno pos, sop value) | 
|  | 1359 | { | 
|  | 1360 | /* avoid making error situations worse */ | 
|  | 1361 | if (p->error != 0) | 
|  | 1362 | return; | 
|  | 1363 |  | 
|  | 1364 | assert(value < 1<<OPSHIFT); | 
|  | 1365 | p->strip[pos] = OP(p->strip[pos]) | value; | 
|  | 1366 | } | 
|  | 1367 |  | 
|  | 1368 | /* | 
|  | 1369 | - enlarge - enlarge the strip | 
|  | 1370 | */ | 
|  | 1371 | static void | 
|  | 1372 | enlarge(struct parse *p, sopno size) | 
|  | 1373 | { | 
|  | 1374 | sop *sp; | 
|  | 1375 |  | 
|  | 1376 | if (p->ssize >= size) | 
|  | 1377 | return; | 
|  | 1378 |  | 
|  | 1379 | sp = (sop *)realloc(p->strip, size*sizeof(sop)); | 
|  | 1380 | if (sp == NULL) { | 
|  | 1381 | SETERROR(REG_ESPACE); | 
|  | 1382 | return; | 
|  | 1383 | } | 
|  | 1384 | p->strip = sp; | 
|  | 1385 | p->ssize = size; | 
|  | 1386 | } | 
|  | 1387 |  | 
|  | 1388 | /* | 
|  | 1389 | - stripsnug - compact the strip | 
|  | 1390 | */ | 
|  | 1391 | static void | 
|  | 1392 | stripsnug(struct parse *p, struct re_guts *g) | 
|  | 1393 | { | 
|  | 1394 | g->nstates = p->slen; | 
|  | 1395 | g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); | 
|  | 1396 | if (g->strip == NULL) { | 
|  | 1397 | SETERROR(REG_ESPACE); | 
|  | 1398 | g->strip = p->strip; | 
|  | 1399 | } | 
|  | 1400 | } | 
|  | 1401 |  | 
|  | 1402 | /* | 
|  | 1403 | - findmust - fill in must and mlen with longest mandatory literal string | 
|  | 1404 | * | 
|  | 1405 | * This algorithm could do fancy things like analyzing the operands of | | 
|  | 1406 | * for common subsequences.  Someday.  This code is simple and finds most | 
|  | 1407 | * of the interesting cases. | 
|  | 1408 | * | 
|  | 1409 | * Note that must and mlen got initialized during setup. | 
|  | 1410 | */ | 
|  | 1411 | static void | 
|  | 1412 | findmust(struct parse *p, struct re_guts *g) | 
|  | 1413 | { | 
|  | 1414 | sop *scan; | 
| David 'Digit' Turner | 50ace4f | 2010-06-16 16:36:41 -0700 | [diff] [blame] | 1415 | sop *start = NULL;    /* start initialized in the default case, after that */ | 
| Colin Cross | 4fa7b10 | 2010-01-12 18:59:25 -0800 | [diff] [blame] | 1416 | sop *newstart; /* newstart was initialized in the OCHAR case */ | 
|  | 1417 | sopno newlen; | 
|  | 1418 | sop s; | 
|  | 1419 | char *cp; | 
|  | 1420 | sopno i; | 
|  | 1421 |  | 
|  | 1422 | /* avoid making error situations worse */ | 
|  | 1423 | if (p->error != 0) | 
|  | 1424 | return; | 
|  | 1425 |  | 
|  | 1426 | /* find the longest OCHAR sequence in strip */ | 
|  | 1427 | newlen = 0; | 
|  | 1428 | scan = g->strip + 1; | 
|  | 1429 | do { | 
|  | 1430 | s = *scan++; | 
|  | 1431 | switch (OP(s)) { | 
|  | 1432 | case OCHAR:		/* sequence member */ | 
|  | 1433 | if (newlen == 0)		/* new sequence */ | 
|  | 1434 | newstart = scan - 1; | 
|  | 1435 | newlen++; | 
|  | 1436 | break; | 
|  | 1437 | case OPLUS_:		/* things that don't break one */ | 
|  | 1438 | case OLPAREN: | 
|  | 1439 | case ORPAREN: | 
|  | 1440 | break; | 
|  | 1441 | case OQUEST_:		/* things that must be skipped */ | 
|  | 1442 | case OCH_: | 
|  | 1443 | scan--; | 
|  | 1444 | do { | 
|  | 1445 | scan += OPND(s); | 
|  | 1446 | s = *scan; | 
|  | 1447 | /* assert() interferes w debug printouts */ | 
|  | 1448 | if (OP(s) != O_QUEST && OP(s) != O_CH && | 
|  | 1449 | OP(s) != OOR2) { | 
|  | 1450 | g->iflags |= BAD; | 
|  | 1451 | return; | 
|  | 1452 | } | 
|  | 1453 | } while (OP(s) != O_QUEST && OP(s) != O_CH); | 
|  | 1454 | /* fallthrough */ | 
|  | 1455 | default:		/* things that break a sequence */ | 
|  | 1456 | if (newlen > g->mlen) {		/* ends one */ | 
|  | 1457 | start = newstart; | 
|  | 1458 | g->mlen = newlen; | 
|  | 1459 | } | 
|  | 1460 | newlen = 0; | 
|  | 1461 | break; | 
|  | 1462 | } | 
|  | 1463 | } while (OP(s) != OEND); | 
|  | 1464 |  | 
|  | 1465 | if (g->mlen == 0)		/* there isn't one */ | 
|  | 1466 | return; | 
|  | 1467 |  | 
|  | 1468 | /* turn it into a character string */ | 
|  | 1469 | g->must = malloc((size_t)g->mlen + 1); | 
|  | 1470 | if (g->must == NULL) {		/* argh; just forget it */ | 
|  | 1471 | g->mlen = 0; | 
|  | 1472 | return; | 
|  | 1473 | } | 
|  | 1474 | cp = g->must; | 
|  | 1475 | scan = start; | 
|  | 1476 | for (i = g->mlen; i > 0; i--) { | 
|  | 1477 | while (OP(s = *scan++) != OCHAR) | 
|  | 1478 | continue; | 
|  | 1479 | assert(cp < g->must + g->mlen); | 
|  | 1480 | *cp++ = (char)OPND(s); | 
|  | 1481 | } | 
|  | 1482 | assert(cp == g->must + g->mlen); | 
|  | 1483 | *cp++ = '\0';		/* just on general principles */ | 
|  | 1484 | } | 
|  | 1485 |  | 
|  | 1486 | /* | 
|  | 1487 | - pluscount - count + nesting | 
|  | 1488 | */ | 
|  | 1489 | static sopno			/* nesting depth */ | 
|  | 1490 | pluscount(struct parse *p, struct re_guts *g) | 
|  | 1491 | { | 
|  | 1492 | sop *scan; | 
|  | 1493 | sop s; | 
|  | 1494 | sopno plusnest = 0; | 
|  | 1495 | sopno maxnest = 0; | 
|  | 1496 |  | 
|  | 1497 | if (p->error != 0) | 
|  | 1498 | return(0);	/* there may not be an OEND */ | 
|  | 1499 |  | 
|  | 1500 | scan = g->strip + 1; | 
|  | 1501 | do { | 
|  | 1502 | s = *scan++; | 
|  | 1503 | switch (OP(s)) { | 
|  | 1504 | case OPLUS_: | 
|  | 1505 | plusnest++; | 
|  | 1506 | break; | 
|  | 1507 | case O_PLUS: | 
|  | 1508 | if (plusnest > maxnest) | 
|  | 1509 | maxnest = plusnest; | 
|  | 1510 | plusnest--; | 
|  | 1511 | break; | 
|  | 1512 | } | 
|  | 1513 | } while (OP(s) != OEND); | 
|  | 1514 | if (plusnest != 0) | 
|  | 1515 | g->iflags |= BAD; | 
|  | 1516 | return(maxnest); | 
|  | 1517 | } |