| Jeff Sharkey | 3e8b158 | 2012-07-13 16:37:13 -0700 | [diff] [blame] | 1 | /*	$OpenBSD: util.c,v 1.36 2007/10/02 17:59:18 otto Exp $	*/ | 
 | 2 | /*	$FreeBSD: head/usr.bin/grep/fastgrep.c 211496 2010-08-19 09:28:59Z des $ */ | 
 | 3 |  | 
 | 4 | /*- | 
 | 5 |  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav | 
 | 6 |  * Copyright (C) 2008 Gabor Kovesdan <gabor@FreeBSD.org> | 
 | 7 |  * All rights reserved. | 
 | 8 |  * | 
 | 9 |  * Redistribution and use in source and binary forms, with or without | 
 | 10 |  * modification, are permitted provided that the following conditions | 
 | 11 |  * are met: | 
 | 12 |  * 1. Redistributions of source code must retain the above copyright | 
 | 13 |  *    notice, this list of conditions and the following disclaimer. | 
 | 14 |  * 2. Redistributions in binary form must reproduce the above copyright | 
 | 15 |  *    notice, this list of conditions and the following disclaimer in the | 
 | 16 |  *    documentation and/or other materials provided with the distribution. | 
 | 17 |  * | 
 | 18 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | 
 | 19 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
 | 20 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
 | 21 |  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | 
 | 22 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | 
 | 23 |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | 
 | 24 |  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | 
 | 25 |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | 
 | 26 |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | 
 | 27 |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
 | 28 |  * SUCH DAMAGE. | 
 | 29 |  */ | 
 | 30 |  | 
 | 31 | /* | 
 | 32 |  * XXX: This file is a speed up for grep to cover the defects of the | 
 | 33 |  * regex library.  These optimizations should practically be implemented | 
 | 34 |  * there keeping this code clean.  This is a future TODO, but for the | 
 | 35 |  * meantime, we need to use this workaround. | 
 | 36 |  */ | 
 | 37 |  | 
 | 38 | #if HAVE_NBTOOL_CONFIG_H | 
 | 39 | #include "nbtool_config.h" | 
 | 40 | #endif | 
 | 41 |  | 
 | 42 | #include <sys/cdefs.h> | 
 | 43 | __RCSID("$NetBSD: fastgrep.c,v 1.5 2011/04/18 03:27:40 joerg Exp $"); | 
 | 44 |  | 
 | 45 | #include <limits.h> | 
 | 46 | #include <stdbool.h> | 
 | 47 | #include <stdlib.h> | 
 | 48 | #include <string.h> | 
 | 49 | #include <wchar.h> | 
 | 50 | #include <wctype.h> | 
 | 51 |  | 
 | 52 | #include "grep.h" | 
 | 53 |  | 
 | 54 | static inline int	grep_cmp(const unsigned char *, const unsigned char *, size_t); | 
 | 55 | static inline void	grep_revstr(unsigned char *, int); | 
 | 56 |  | 
 | 57 | void | 
 | 58 | fgrepcomp(fastgrep_t *fg, const char *pat) | 
 | 59 | { | 
 | 60 | 	unsigned int i; | 
 | 61 |  | 
 | 62 | 	/* Initialize. */ | 
 | 63 | 	fg->len = strlen(pat); | 
 | 64 | 	fg->bol = false; | 
 | 65 | 	fg->eol = false; | 
 | 66 | 	fg->reversed = false; | 
 | 67 |  | 
 | 68 | 	fg->pattern = (unsigned char *)grep_strdup(pat); | 
 | 69 |  | 
 | 70 | 	/* Preprocess pattern. */ | 
 | 71 | 	for (i = 0; i <= UCHAR_MAX; i++) | 
 | 72 | 		fg->qsBc[i] = fg->len; | 
 | 73 | 	for (i = 1; i < fg->len; i++) | 
 | 74 | 		fg->qsBc[fg->pattern[i]] = fg->len - i; | 
 | 75 | } | 
 | 76 |  | 
 | 77 | /* | 
 | 78 |  * Returns: -1 on failure, 0 on success | 
 | 79 |  */ | 
 | 80 | int | 
 | 81 | fastcomp(fastgrep_t *fg, const char *pat) | 
 | 82 | { | 
 | 83 | 	unsigned int i; | 
 | 84 | 	int firstHalfDot = -1; | 
 | 85 | 	int firstLastHalfDot = -1; | 
 | 86 | 	int hasDot = 0; | 
 | 87 | 	int lastHalfDot = 0; | 
 | 88 | 	int shiftPatternLen; | 
 | 89 |  | 
 | 90 | 	/* Initialize. */ | 
 | 91 | 	fg->len = strlen(pat); | 
 | 92 | 	fg->bol = false; | 
 | 93 | 	fg->eol = false; | 
 | 94 | 	fg->reversed = false; | 
 | 95 | 	fg->word = wflag; | 
 | 96 |  | 
 | 97 | 	/* Remove end-of-line character ('$'). */ | 
 | 98 | 	if (fg->len > 0 && pat[fg->len - 1] == '$') { | 
 | 99 | 		fg->eol = true; | 
 | 100 | 		fg->len--; | 
 | 101 | 	} | 
 | 102 |  | 
 | 103 | 	/* Remove beginning-of-line character ('^'). */ | 
 | 104 | 	if (pat[0] == '^') { | 
 | 105 | 		fg->bol = true; | 
 | 106 | 		fg->len--; | 
 | 107 | 		pat++; | 
 | 108 | 	} | 
 | 109 |  | 
 | 110 | 	if (fg->len >= 14 && | 
 | 111 | 	    memcmp(pat, "[[:<:]]", 7) == 0 && | 
 | 112 | 	    memcmp(pat + fg->len - 7, "[[:>:]]", 7) == 0) { | 
 | 113 | 		fg->len -= 14; | 
 | 114 | 		pat += 7; | 
 | 115 | 		/* Word boundary is handled separately in util.c */ | 
 | 116 | 		fg->word = true; | 
 | 117 | 	} | 
 | 118 |  | 
 | 119 | 	/* | 
 | 120 | 	 * pat has been adjusted earlier to not include '^', '$' or | 
 | 121 | 	 * the word match character classes at the beginning and ending | 
 | 122 | 	 * of the string respectively. | 
 | 123 | 	 */ | 
 | 124 | 	fg->pattern = grep_malloc(fg->len + 1); | 
 | 125 | 	memcpy(fg->pattern, pat, fg->len); | 
 | 126 | 	fg->pattern[fg->len] = '\0'; | 
 | 127 |  | 
 | 128 | 	/* Look for ways to cheat...er...avoid the full regex engine. */ | 
 | 129 | 	for (i = 0; i < fg->len; i++) { | 
 | 130 | 		/* Can still cheat? */ | 
 | 131 | 		if (fg->pattern[i] == '.') { | 
 | 132 | 			hasDot = i; | 
 | 133 | 			if (i < fg->len / 2) { | 
 | 134 | 				if (firstHalfDot < 0) | 
 | 135 | 					/* Closest dot to the beginning */ | 
 | 136 | 					firstHalfDot = i; | 
 | 137 | 			} else { | 
 | 138 | 				/* Closest dot to the end of the pattern. */ | 
 | 139 | 				lastHalfDot = i; | 
 | 140 | 				if (firstLastHalfDot < 0) | 
 | 141 | 					firstLastHalfDot = i; | 
 | 142 | 			} | 
 | 143 | 		} else { | 
 | 144 | 			/* Free memory and let others know this is empty. */ | 
 | 145 | 			free(fg->pattern); | 
 | 146 | 			fg->pattern = NULL; | 
 | 147 | 			return (-1); | 
 | 148 | 		} | 
 | 149 | 	} | 
 | 150 |  | 
 | 151 | 	/* | 
 | 152 | 	 * Determine if a reverse search would be faster based on the placement | 
 | 153 | 	 * of the dots. | 
 | 154 | 	 */ | 
 | 155 | 	if ((!(lflag || cflag)) && ((!(fg->bol || fg->eol)) && | 
 | 156 | 	    ((lastHalfDot) && ((firstHalfDot < 0) || | 
 | 157 | 	    ((fg->len - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) && | 
 | 158 | 	    !oflag && !color) { | 
 | 159 | 		fg->reversed = true; | 
 | 160 | 		hasDot = fg->len - (firstHalfDot < 0 ? | 
 | 161 | 		    firstLastHalfDot : firstHalfDot) - 1; | 
 | 162 | 		grep_revstr(fg->pattern, fg->len); | 
 | 163 | 	} | 
 | 164 |  | 
 | 165 | 	/* | 
 | 166 | 	 * Normal Quick Search would require a shift based on the position the | 
 | 167 | 	 * next character after the comparison is within the pattern.  With | 
 | 168 | 	 * wildcards, the position of the last dot effects the maximum shift | 
 | 169 | 	 * distance. | 
 | 170 | 	 * The closer to the end the wild card is the slower the search.  A | 
 | 171 | 	 * reverse version of this algorithm would be useful for wildcards near | 
 | 172 | 	 * the end of the string. | 
 | 173 | 	 * | 
 | 174 | 	 * Examples: | 
 | 175 | 	 * Pattern	Max shift | 
 | 176 | 	 * -------	--------- | 
 | 177 | 	 * this		5 | 
 | 178 | 	 * .his		4 | 
 | 179 | 	 * t.is		3 | 
 | 180 | 	 * th.s		2 | 
 | 181 | 	 * thi.		1 | 
 | 182 | 	 */ | 
 | 183 |  | 
 | 184 | 	/* Adjust the shift based on location of the last dot ('.'). */ | 
 | 185 | 	shiftPatternLen = fg->len - hasDot; | 
 | 186 |  | 
 | 187 | 	/* Preprocess pattern. */ | 
 | 188 | 	for (i = 0; i <= (signed)UCHAR_MAX; i++) | 
 | 189 | 		fg->qsBc[i] = shiftPatternLen; | 
 | 190 | 	for (i = hasDot + 1; i < fg->len; i++) { | 
 | 191 | 		fg->qsBc[fg->pattern[i]] = fg->len - i; | 
 | 192 | 	} | 
 | 193 |  | 
 | 194 | 	/* | 
 | 195 | 	 * Put pattern back to normal after pre-processing to allow for easy | 
 | 196 | 	 * comparisons later. | 
 | 197 | 	 */ | 
 | 198 | 	if (fg->reversed) | 
 | 199 | 		grep_revstr(fg->pattern, fg->len); | 
 | 200 |  | 
 | 201 | 	return (0); | 
 | 202 | } | 
 | 203 |  | 
 | 204 | int | 
 | 205 | grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch) | 
 | 206 | { | 
 | 207 | 	unsigned int j; | 
 | 208 | 	int ret = REG_NOMATCH; | 
 | 209 |  | 
 | 210 | 	if (pmatch->rm_so == (ssize_t)len) | 
 | 211 | 		return (ret); | 
 | 212 |  | 
 | 213 | 	if (fg->bol && pmatch->rm_so != 0) { | 
 | 214 | 		pmatch->rm_so = len; | 
 | 215 | 		pmatch->rm_eo = len; | 
 | 216 | 		return (ret); | 
 | 217 | 	} | 
 | 218 |  | 
 | 219 | 	/* No point in going farther if we do not have enough data. */ | 
 | 220 | 	if (len < fg->len) | 
 | 221 | 		return (ret); | 
 | 222 |  | 
 | 223 | 	/* Only try once at the beginning or ending of the line. */ | 
 | 224 | 	if (fg->bol || fg->eol) { | 
 | 225 | 		/* Simple text comparison. */ | 
 | 226 | 		/* Verify data is >= pattern length before searching on it. */ | 
 | 227 | 		if (len >= fg->len) { | 
 | 228 | 			/* Determine where in data to start search at. */ | 
 | 229 | 			j = fg->eol ? len - fg->len : 0; | 
 | 230 | 			if (!((fg->bol && fg->eol) && (len != fg->len))) | 
 | 231 | 				if (grep_cmp(fg->pattern, data + j, | 
 | 232 | 				    fg->len) == -1) { | 
 | 233 | 					pmatch->rm_so = j; | 
 | 234 | 					pmatch->rm_eo = j + fg->len; | 
 | 235 | 						ret = 0; | 
 | 236 | 				} | 
 | 237 | 		} | 
 | 238 | 	} else if (fg->reversed) { | 
 | 239 | 		/* Quick Search algorithm. */ | 
 | 240 | 		j = len; | 
 | 241 | 		do { | 
 | 242 | 			if (grep_cmp(fg->pattern, data + j - fg->len, | 
 | 243 | 				fg->len) == -1) { | 
 | 244 | 				pmatch->rm_so = j - fg->len; | 
 | 245 | 				pmatch->rm_eo = j; | 
 | 246 | 				ret = 0; | 
 | 247 | 				break; | 
 | 248 | 			} | 
 | 249 | 			/* Shift if within bounds, otherwise, we are done. */ | 
 | 250 | 			if (j == fg->len) | 
 | 251 | 				break; | 
 | 252 | 			j -= fg->qsBc[data[j - fg->len - 1]]; | 
 | 253 | 		} while (j >= fg->len); | 
 | 254 | 	} else { | 
 | 255 | 		/* Quick Search algorithm. */ | 
 | 256 | 		j = pmatch->rm_so; | 
 | 257 | 		do { | 
 | 258 | 			if (grep_cmp(fg->pattern, data + j, fg->len) == -1) { | 
 | 259 | 				pmatch->rm_so = j; | 
 | 260 | 				pmatch->rm_eo = j + fg->len; | 
 | 261 | 				ret = 0; | 
 | 262 | 				break; | 
 | 263 | 			} | 
 | 264 |  | 
 | 265 | 			/* Shift if within bounds, otherwise, we are done. */ | 
 | 266 | 			if (j + fg->len == len) | 
 | 267 | 				break; | 
 | 268 | 			else | 
 | 269 | 				j += fg->qsBc[data[j + fg->len]]; | 
 | 270 | 		} while (j <= (len - fg->len)); | 
 | 271 | 	} | 
 | 272 |  | 
 | 273 | 	return (ret); | 
 | 274 | } | 
 | 275 |  | 
 | 276 | /* | 
 | 277 |  * Returns:	i >= 0 on failure (position that it failed) | 
 | 278 |  *		-1 on success | 
 | 279 |  */ | 
 | 280 | static inline int | 
 | 281 | grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len) | 
 | 282 | { | 
 | 283 | 	size_t size; | 
 | 284 | 	wchar_t *wdata, *wpat; | 
 | 285 | 	unsigned int i; | 
 | 286 |  | 
 | 287 | 	if (iflag) { | 
 | 288 | 		if ((size = mbstowcs(NULL, (const char *)data, 0)) == | 
 | 289 | 		    ((size_t) - 1)) | 
 | 290 | 			return (-1); | 
 | 291 |  | 
 | 292 | 		wdata = grep_malloc(size * sizeof(wint_t)); | 
 | 293 |  | 
 | 294 | 		if (mbstowcs(wdata, (const char *)data, size) == | 
 | 295 | 		    ((size_t) - 1)) | 
 | 296 | 			return (-1); | 
 | 297 |  | 
 | 298 | 		if ((size = mbstowcs(NULL, (const char *)pat, 0)) == | 
 | 299 | 		    ((size_t) - 1)) | 
 | 300 | 			return (-1); | 
 | 301 |  | 
 | 302 | 		wpat = grep_malloc(size * sizeof(wint_t)); | 
 | 303 |  | 
 | 304 | 		if (mbstowcs(wpat, (const char *)pat, size) == ((size_t) - 1)) | 
 | 305 | 			return (-1); | 
 | 306 | 		for (i = 0; i < len; i++) { | 
 | 307 | 			if ((towlower(wpat[i]) == towlower(wdata[i])) || | 
 | 308 | 			    ((grepbehave != GREP_FIXED) && wpat[i] == L'.')) | 
 | 309 | 				continue; | 
 | 310 | 			free(wpat); | 
 | 311 | 			free(wdata); | 
 | 312 | 				return (i); | 
 | 313 | 		} | 
 | 314 | 	} else { | 
 | 315 | 		for (i = 0; i < len; i++) { | 
 | 316 | 			if ((pat[i] == data[i]) || ((grepbehave != GREP_FIXED) && | 
 | 317 | 			    pat[i] == '.')) | 
 | 318 | 				continue; | 
 | 319 | 			return (i); | 
 | 320 | 		} | 
 | 321 | 	} | 
 | 322 | 	return (-1); | 
 | 323 | } | 
 | 324 |  | 
 | 325 | static inline void | 
 | 326 | grep_revstr(unsigned char *str, int len) | 
 | 327 | { | 
 | 328 | 	int i; | 
 | 329 | 	char c; | 
 | 330 |  | 
 | 331 | 	for (i = 0; i < len / 2; i++) { | 
 | 332 | 		c = str[i]; | 
 | 333 | 		str[i] = str[len - i - 1]; | 
 | 334 | 		str[len - i - 1] = c; | 
 | 335 | 	} | 
 | 336 | } |