Sync with upstream NetBSD.
Test: treehugger
Change-Id: Ia82ba4a0c00a47377240e5d9a85dc30deb5ce3b7
diff --git a/libc/upstream-netbsd/android/include/netbsd-compat.h b/libc/upstream-netbsd/android/include/netbsd-compat.h
index 5dd086e..a625f06 100644
--- a/libc/upstream-netbsd/android/include/netbsd-compat.h
+++ b/libc/upstream-netbsd/android/include/netbsd-compat.h
@@ -43,6 +43,8 @@
#include <stddef.h>
int reallocarr(void*, size_t, size_t);
+#define __arraycount(a) (sizeof(a) / sizeof((a)[0])) /* element count; a must be a true array, not a pointer */
+
/* Use appropriate shell depending on process's executable. */
__LIBC_HIDDEN__ extern const char* __bionic_get_shell_path();
#define _PATH_BSHELL __bionic_get_shell_path()
diff --git a/libc/upstream-netbsd/lib/libc/include/isc/list.h b/libc/upstream-netbsd/lib/libc/include/isc/list.h
index 46f2e79..76dc097 100644
--- a/libc/upstream-netbsd/lib/libc/include/isc/list.h
+++ b/libc/upstream-netbsd/lib/libc/include/isc/list.h
@@ -1,4 +1,4 @@
-/* $NetBSD: list.h,v 1.5 2009/04/12 17:07:16 christos Exp $ */
+/* $NetBSD: list.h,v 1.6 2022/04/19 20:32:15 rillig Exp $ */
/*
* Copyright (c) 2004 by Internet Systems Consortium, Inc. ("ISC")
@@ -23,14 +23,14 @@
#define LIST(type) struct { type *head, *tail; }
#define INIT_LIST(list) \
- do { (list).head = NULL; (list).tail = NULL; } while (/*CONSTCOND*/0)
+ do { (list).head = NULL; (list).tail = NULL; } while (0)
#define LINK(type) struct { type *prev, *next; }
#define INIT_LINK_TYPE(elt, link, type) \
do { \
(elt)->link.prev = (type *)(-1); \
(elt)->link.next = (type *)(-1); \
- } while (/*CONSTCOND*/0)
+ } while (0)
#define INIT_LINK(elt, link) \
INIT_LINK_TYPE(elt, link, void)
#define LINKED(elt, link) ((void *)((elt)->link.prev) != (void *)(-1) && \
@@ -50,7 +50,7 @@
(elt)->link.prev = NULL; \
(elt)->link.next = (list).head; \
(list).head = (elt); \
- } while (/*CONSTCOND*/0)
+ } while (0)
#define APPEND(list, elt, link) \
do { \
@@ -62,7 +62,7 @@
(elt)->link.prev = (list).tail; \
(elt)->link.next = NULL; \
(list).tail = (elt); \
- } while (/*CONSTCOND*/0)
+ } while (0)
#define UNLINK_TYPE(list, elt, link, type) \
do { \
@@ -80,7 +80,7 @@
(list).head = (elt)->link.next; \
} \
INIT_LINK_TYPE(elt, link, type); \
- } while (/*CONSTCOND*/0)
+ } while (0)
#define UNLINK(list, elt, link) \
UNLINK_TYPE(list, elt, link, void)
@@ -98,7 +98,7 @@
(elt)->link.prev->link.next = (elt); \
(elt)->link.next = (before); \
} \
- } while (/*CONSTCOND*/0)
+ } while (0)
#define INSERT_AFTER(list, after, elt, link) \
do { \
@@ -111,7 +111,7 @@
(elt)->link.next->link.prev = (elt); \
(elt)->link.prev = (after); \
} \
- } while (/*CONSTCOND*/0)
+ } while (0)
#define ENQUEUE(list, elt, link) APPEND(list, elt, link)
#define DEQUEUE(list, elt, link) UNLINK(list, elt, link)
diff --git a/libc/upstream-netbsd/lib/libc/regex/cclass.h b/libc/upstream-netbsd/lib/libc/regex/cclass.h
deleted file mode 100644
index 3ab2ccb..0000000
--- a/libc/upstream-netbsd/lib/libc/regex/cclass.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* $NetBSD: cclass.h,v 1.7 2003/08/07 16:43:19 agc Exp $ */
-
-/*-
- * Copyright (c) 1992, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)cclass.h 8.3 (Berkeley) 3/20/94
- */
-
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)cclass.h 8.3 (Berkeley) 3/20/94
- */
-
-/* character-class table */
-static const struct cclass {
- const char *name;
- const char *chars;
- const char *multis;
-} cclasses[] = {
- { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
-0123456789", "" },
- { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
- "" },
- { "blank", " \t", "" },
- { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
-\25\26\27\30\31\32\33\34\35\36\37\177", "" },
- { "digit", "0123456789", "" },
- { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
-0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
- "" },
- { "lower", "abcdefghijklmnopqrstuvwxyz",
- "" },
- { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
-0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
- "" },
- { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
- "" },
- { "space", "\t\n\v\f\r ", "" },
- { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
- "" },
- { "xdigit", "0123456789ABCDEFabcdef",
- "" },
- { NULL, 0, "" }
-};
diff --git a/libc/upstream-netbsd/lib/libc/regex/cname.h b/libc/upstream-netbsd/lib/libc/regex/cname.h
index 4b9ef39..47e57ac 100644
--- a/libc/upstream-netbsd/lib/libc/regex/cname.h
+++ b/libc/upstream-netbsd/lib/libc/regex/cname.h
@@ -1,6 +1,9 @@
-/* $NetBSD: cname.h,v 1.7 2003/08/07 16:43:19 agc Exp $ */
+/* $NetBSD: cname.h,v 1.8 2021/02/23 22:14:59 christos Exp $ */
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
@@ -32,144 +35,108 @@
* SUCH DAMAGE.
*
* @(#)cname.h 8.3 (Berkeley) 3/20/94
- */
-
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)cname.h 8.3 (Berkeley) 3/20/94
+ * $FreeBSD: head/lib/libc/regex/cname.h 326025 2017-11-20 19:49:47Z pfg $
*/
/* character-name table */
-static const struct cname {
+static struct cname {
const char *name;
char code;
} cnames[] = {
- { "NUL", '\0' },
- { "SOH", '\001' },
- { "STX", '\002' },
- { "ETX", '\003' },
- { "EOT", '\004' },
- { "ENQ", '\005' },
- { "ACK", '\006' },
- { "BEL", '\007' },
- { "alert", '\007' },
- { "BS", '\010' },
- { "backspace", '\b' },
- { "HT", '\011' },
- { "tab", '\t' },
- { "LF", '\012' },
- { "newline", '\n' },
- { "VT", '\013' },
- { "vertical-tab", '\v' },
- { "FF", '\014' },
- { "form-feed", '\f' },
- { "CR", '\015' },
- { "carriage-return", '\r' },
- { "SO", '\016' },
- { "SI", '\017' },
- { "DLE", '\020' },
- { "DC1", '\021' },
- { "DC2", '\022' },
- { "DC3", '\023' },
- { "DC4", '\024' },
- { "NAK", '\025' },
- { "SYN", '\026' },
- { "ETB", '\027' },
- { "CAN", '\030' },
- { "EM", '\031' },
- { "SUB", '\032' },
- { "ESC", '\033' },
- { "IS4", '\034' },
- { "FS", '\034' },
- { "IS3", '\035' },
- { "GS", '\035' },
- { "IS2", '\036' },
- { "RS", '\036' },
- { "IS1", '\037' },
- { "US", '\037' },
- { "space", ' ' },
- { "exclamation-mark", '!' },
- { "quotation-mark", '"' },
- { "number-sign", '#' },
- { "dollar-sign", '$' },
- { "percent-sign", '%' },
- { "ampersand", '&' },
- { "apostrophe", '\'' },
- { "left-parenthesis", '(' },
- { "right-parenthesis", ')' },
- { "asterisk", '*' },
- { "plus-sign", '+' },
- { "comma", ',' },
- { "hyphen", '-' },
- { "hyphen-minus", '-' },
- { "period", '.' },
- { "full-stop", '.' },
- { "slash", '/' },
- { "solidus", '/' },
- { "zero", '0' },
- { "one", '1' },
- { "two", '2' },
- { "three", '3' },
- { "four", '4' },
- { "five", '5' },
- { "six", '6' },
- { "seven", '7' },
- { "eight", '8' },
- { "nine", '9' },
- { "colon", ':' },
- { "semicolon", ';' },
- { "less-than-sign", '<' },
- { "equals-sign", '=' },
- { "greater-than-sign", '>' },
- { "question-mark", '?' },
- { "commercial-at", '@' },
- { "left-square-bracket", '[' },
- { "backslash", '\\' },
- { "reverse-solidus", '\\' },
- { "right-square-bracket", ']' },
- { "circumflex", '^' },
- { "circumflex-accent", '^' },
- { "underscore", '_' },
- { "low-line", '_' },
- { "grave-accent", '`' },
- { "left-brace", '{' },
- { "left-curly-bracket", '{' },
- { "vertical-line", '|' },
- { "right-brace", '}' },
- { "right-curly-bracket", '}' },
- { "tilde", '~' },
- { "DEL", '\177' },
- { NULL, 0 },
+ {"NUL", '\0'},
+ {"SOH", '\001'},
+ {"STX", '\002'},
+ {"ETX", '\003'},
+ {"EOT", '\004'},
+ {"ENQ", '\005'},
+ {"ACK", '\006'},
+ {"BEL", '\007'},
+ {"alert", '\007'},
+ {"BS", '\010'},
+ {"backspace", '\b'},
+ {"HT", '\011'},
+ {"tab", '\t'},
+ {"LF", '\012'},
+ {"newline", '\n'},
+ {"VT", '\013'},
+ {"vertical-tab", '\v'},
+ {"FF", '\014'},
+ {"form-feed", '\f'},
+ {"CR", '\015'},
+ {"carriage-return", '\r'},
+ {"SO", '\016'},
+ {"SI", '\017'},
+ {"DLE", '\020'},
+ {"DC1", '\021'},
+ {"DC2", '\022'},
+ {"DC3", '\023'},
+ {"DC4", '\024'},
+ {"NAK", '\025'},
+ {"SYN", '\026'},
+ {"ETB", '\027'},
+ {"CAN", '\030'},
+ {"EM", '\031'},
+ {"SUB", '\032'},
+ {"ESC", '\033'},
+ {"IS4", '\034'},
+ {"FS", '\034'},
+ {"IS3", '\035'},
+ {"GS", '\035'},
+ {"IS2", '\036'},
+ {"RS", '\036'},
+ {"IS1", '\037'},
+ {"US", '\037'},
+ {"space", ' '},
+ {"exclamation-mark", '!'},
+ {"quotation-mark", '"'},
+ {"number-sign", '#'},
+ {"dollar-sign", '$'},
+ {"percent-sign", '%'},
+ {"ampersand", '&'},
+ {"apostrophe", '\''},
+ {"left-parenthesis", '('},
+ {"right-parenthesis", ')'},
+ {"asterisk", '*'},
+ {"plus-sign", '+'},
+ {"comma", ','},
+ {"hyphen", '-'},
+ {"hyphen-minus", '-'},
+ {"period", '.'},
+ {"full-stop", '.'},
+ {"slash", '/'},
+ {"solidus", '/'},
+ {"zero", '0'},
+ {"one", '1'},
+ {"two", '2'},
+ {"three", '3'},
+ {"four", '4'},
+ {"five", '5'},
+ {"six", '6'},
+ {"seven", '7'},
+ {"eight", '8'},
+ {"nine", '9'},
+ {"colon", ':'},
+ {"semicolon", ';'},
+ {"less-than-sign", '<'},
+ {"equals-sign", '='},
+ {"greater-than-sign", '>'},
+ {"question-mark", '?'},
+ {"commercial-at", '@'},
+ {"left-square-bracket", '['},
+ {"backslash", '\\'},
+ {"reverse-solidus", '\\'},
+ {"right-square-bracket",']'},
+ {"circumflex", '^'},
+ {"circumflex-accent", '^'},
+ {"underscore", '_'},
+ {"low-line", '_'},
+ {"grave-accent", '`'},
+ {"left-brace", '{'},
+ {"left-curly-bracket", '{'},
+ {"vertical-line", '|'},
+ {"right-brace", '}'},
+ {"right-curly-bracket", '}'},
+ {"tilde", '~'},
+ {"DEL", '\177'},
+ {NULL, 0}
};
diff --git a/libc/upstream-netbsd/lib/libc/regex/engine.c b/libc/upstream-netbsd/lib/libc/regex/engine.c
index 2a800d4..ca8b24d 100644
--- a/libc/upstream-netbsd/lib/libc/regex/engine.c
+++ b/libc/upstream-netbsd/lib/libc/regex/engine.c
@@ -1,6 +1,9 @@
-/* $NetBSD: engine.c,v 1.24 2012/03/13 21:13:42 christos Exp $ */
+/* $NetBSD: engine.c,v 1.29 2021/02/25 21:47:46 christos Exp $ */
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
@@ -34,42 +37,13 @@
* @(#)engine.c 8.5 (Berkeley) 3/20/94
*/
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)engine.c 8.5 (Berkeley) 3/20/94
- */
+#include <sys/cdefs.h>
+#ifdef __FBSDID
+__FBSDID("$FreeBSD: head/lib/libc/regex/engine.c 368358 2020-12-05 03:16:05Z kevans $");
+#endif
+__RCSID("$NetBSD: engine.c,v 1.29 2021/02/25 21:47:46 christos Exp $");
+
+#include <stdbool.h>
/*
* The matching engine and friends. This file is #included by regexec.c
@@ -79,28 +53,37 @@
*/
#ifdef SNAMES
+#define stepback sstepback
#define matcher smatcher
-#define fast sfast
-#define slow sslow
+#define walk swalk
#define dissect sdissect
#define backref sbackref
#define step sstep
#define print sprint
#define at sat
#define match smat
-#define nope snope
#endif
#ifdef LNAMES
+#define stepback lstepback
#define matcher lmatcher
-#define fast lfast
-#define slow lslow
+#define walk lwalk
#define dissect ldissect
#define backref lbackref
#define step lstep
#define print lprint
#define at lat
#define match lmat
-#define nope lnope
+#endif
+#ifdef MNAMES
+#define stepback mstepback
+#define matcher mmatcher
+#define walk mwalk
+#define dissect mdissect
+#define backref mbackref
+#define step mstep
+#define print mprint
+#define at mat
+#define match mmat
#endif
/* another structure passed up and down to avoid zillions of parameters */
@@ -118,6 +101,7 @@
states fresh; /* states for a fresh start */
states tmp; /* temporary */
states empty; /* empty set of states */
+ mbstate_t mbs; /* multibyte conversion state */
};
/* ========= begin header generated by ./mkh ========= */
@@ -128,27 +112,31 @@
/* === engine.c === */
static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
-static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev);
-static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
-static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
-static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
-#define BOL (OUT+1)
-#define EOL (BOL+1)
-#define BOLEOL (BOL+2)
-#define NOTHING (BOL+3)
-#define BOW (BOL+4)
-#define EOW (BOL+5)
-#define CODEMAX (BOL+5) /* highest code used */
-#define NONCHAR(c) ((c) > CHAR_MAX)
-#define NNONCHAR (CODEMAX-CHAR_MAX)
+static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
+static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
+static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags);
+#define MAX_RECURSION 100
+#define BOL (OUT-1)
+#define EOL (BOL-1)
+#define BOLEOL (BOL-2)
+#define NOTHING (BOL-3)
+#define BOW (BOL-4)
+#define EOW (BOL-5)
+#define BADCHAR (BOL-6)
+#define NWBND (BOL-7)
+#define NONCHAR(c) ((c) <= OUT)
+/* sflags */
+#define SBOS 0x0001
+#define SEOS 0x0002
+
#ifdef REDEBUG
-static void print(struct match *m, char *caption, states st, int ch, FILE *d);
+static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
#endif
#ifdef REDEBUG
-static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
+static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst);
#endif
#ifdef REDEBUG
-static char *pchar(int ch);
+static const char *pchar(int ch);
#endif
#ifdef __cplusplus
@@ -160,7 +148,6 @@
#define SP(t, s, c) print(m, t, s, c, stdout)
#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
#define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
-static int nope = 0;
#else
#define SP(t, s, c) /* nothing */
#define AT(t, p1, p2, s1, s2) /* nothing */
@@ -168,27 +155,70 @@
#endif
/*
+ * Step back nchar characters from cur within the multibyte string that begins
+ * at start. Returns the new position, or NULL if it cannot be reached.
+ */
+static const char *
+stepback(const char *start, const char *cur, int nchar)
+{
+#ifdef NLS
+	const char *ret;
+	size_t wc, mbc;
+	mbstate_t mbs;
+	size_t clen;
+
+	if (MB_CUR_MAX == 1)
+		goto out;	/* one byte per character: use the plain pointer path below */
+
+	ret = cur;
+	for (wc = nchar; wc > 0; wc--) {
+		for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) {	/* try 1..MB_CUR_MAX bytes ending at ret */
+			if ((ret - start) < (ptrdiff_t)mbc)	/* compare distances; never form a pointer before start */
+				return (NULL);
+			memset(&mbs, 0, sizeof(mbs));	/* fresh conversion state for every probe */
+			clen = mbrtowc(NULL, ret - mbc, mbc, &mbs);
+			if (clen != (size_t)-1 && clen != (size_t)-2)
+				break;	/* these mbc bytes decode as one character */
+		}
+		if (mbc > MB_CUR_MAX)
+			return (NULL);	/* no length up to MB_CUR_MAX decoded */
+		ret -= mbc;
+	}
+
+	return (ret);
+out:
+#endif
+	return (cur - start) > nchar ? cur - nchar : NULL;	/* subtract only once known to stay after start */
+}
+
+/*
- matcher - the actual matching engine
- == static int matcher(struct re_guts *g, char *string, \
+ == static int matcher(struct re_guts *g, const char *string, \
== size_t nmatch, regmatch_t pmatch[], int eflags);
*/
static int /* 0 success, REG_NOMATCH failure */
-matcher(
- struct re_guts *g,
- const char *string,
- size_t nmatch,
- regmatch_t pmatch[],
- int eflags)
+matcher(struct re_guts *g,
+ const char *string,
+ size_t nmatch,
+ regmatch_t pmatch[],
+ int eflags)
{
const char *endp;
size_t i;
struct match mv;
struct match *m = &mv;
- const char *dp;
+ const char *dp = NULL;
const sopno gf = g->firststate+1; /* +1 for OEND */
const sopno gl = g->laststate;
const char *start;
const char *stop;
+ /* Boyer-Moore algorithms variables */
+ const char *pp;
+ size_t cj, mj;
+ const char *mustfirst;
+ const char *mustlast;
+ size_t *matchjump;
+ size_t *charjump;
int error = 0;
_DIAGASSERT(g != NULL);
@@ -211,12 +241,46 @@
/* prescreening; this does wonders for this rather slow code */
if (g->must != NULL) {
- for (dp = start; dp < stop; dp++)
- if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen &&
- memcmp(dp, g->must, g->mlen) == 0)
- break;
- if (dp == stop) /* we didn't find g->must */
- return(REG_NOMATCH);
+ if (g->charjump != NULL && g->matchjump != NULL) {
+ mustfirst = g->must;
+ mustlast = g->must + g->mlen - 1;
+ charjump = g->charjump;
+ matchjump = g->matchjump;
+ pp = mustlast;
+ for (dp = start+g->mlen-1; dp < stop;) {
+ /* Fast skip non-matches */
+ while (dp < stop && charjump[(int)*dp])
+ dp += charjump[(int)*dp];
+
+ if (dp >= stop)
+ break;
+
+ /* Greedy matcher */
+ /* We depend on not being used for
+ * strings of length 1
+ */
+ while (*--dp == *--pp && pp != mustfirst);
+
+ if (*dp == *pp)
+ break;
+
+ /* Jump to next possible match */
+ mj = matchjump[pp - mustfirst];
+ cj = charjump[(int)*dp];
+ dp += (cj < mj ? mj : cj);
+ pp = mustlast;
+ }
+ if (pp != mustfirst)
+ return(REG_NOMATCH);
+ } else {
+ for (dp = start; dp < stop; dp++)
+ if (*dp == g->must[0] &&
+ (size_t)(stop - dp) >= g->mlen &&
+ memcmp(dp, g->must, (size_t)g->mlen) == 0)
+ break;
+ if (dp == stop) /* we didn't find g->must */
+ return(REG_NOMATCH);
+ }
}
/* match struct setup */
@@ -233,10 +297,22 @@
SETUP(m->tmp);
SETUP(m->empty);
CLEAR(m->empty);
+ ZAPSTATE(&m->mbs);
+
+ /* Adjust start according to moffset, to speed things up */
+ if (dp != NULL && g->moffset > -1) {
+ const char *nstart;
+
+ nstart = stepback(start, dp, g->moffset);
+ if (nstart != NULL)
+ start = nstart;
+ }
+
+ SP("mloop", m->st, *start);
/* this loop does only one repetition except for backrefs */
for (;;) {
- endp = fast(m, start, stop, gf, gl);
+ endp = walk(m, start, stop, gf, gl, true);
if (endp == NULL) { /* a miss */
error = REG_NOMATCH;
goto done;
@@ -248,11 +324,12 @@
assert(m->coldp != NULL);
for (;;) {
NOTE("finding start");
- endp = slow(m, m->coldp, stop, gf, gl);
+ endp = walk(m, m->coldp, stop, gf, gl, false);
if (endp != NULL)
break;
assert(m->coldp < m->endp);
- m->coldp++;
+ m->coldp += XMBRTOWC(NULL, m->coldp,
+ (size_t)(m->endp - m->coldp), &m->mbs, 0);
}
if (nmatch == 1 && !g->backrefs)
break; /* no further info needed */
@@ -266,20 +343,20 @@
goto done;
}
for (i = 1; i <= m->g->nsub; i++)
- m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1;
+ m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
if (!g->backrefs && !(m->eflags&REG_BACKR)) {
NOTE("dissecting");
dp = dissect(m, m->coldp, endp, gf, gl);
} else {
if (g->nplus > 0 && m->lastpos == NULL)
m->lastpos = malloc((g->nplus+1) *
- sizeof(const char *));
+ sizeof(const char *));
if (g->nplus > 0 && m->lastpos == NULL) {
error = REG_ESPACE;
goto done;
}
NOTE("backref dissect");
- dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
+ dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
}
if (dp != NULL)
break;
@@ -291,7 +368,7 @@
if (dp != NULL || endp <= m->coldp)
break; /* defeat */
NOTE("backoff");
- endp = slow(m, m->coldp, endp-1, gf, gl);
+ endp = walk(m, m->coldp, endp-1, gf, gl, false);
if (endp == NULL)
break; /* defeat */
/* try it on a shorter possibility */
@@ -302,7 +379,7 @@
}
#endif
NOTE("backoff dissect");
- dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
+ dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
}
assert(dp == NULL || dp == endp);
if (dp != NULL) /* found a shorter one */
@@ -310,7 +387,9 @@
/* despite initial appearances, there is no match here */
NOTE("false alarm");
- start = m->coldp + 1; /* recycle starting later */
+ /* recycle starting later */
+ start = m->coldp + XMBRTOWC(NULL, m->coldp,
+ (size_t)(stop - m->coldp), &m->mbs, 0);
assert(start <= stop);
}
@@ -337,7 +416,7 @@
m->pmatch = NULL;
}
if (m->lastpos != NULL) {
- free(m->lastpos);
+ free(__UNCONST(m->lastpos));
m->lastpos = NULL;
}
STATETEARDOWN(m);
@@ -349,29 +428,27 @@
== static const char *dissect(struct match *m, const char *start, \
== const char *stop, sopno startst, sopno stopst);
*/
-static const char * /* == stop (success) always */
+static const char * /* == stop (success) always */
dissect(
- struct match *m,
- const char *start,
- const char *stop,
- sopno startst,
- sopno stopst)
+ struct match *m,
+ const char *start,
+ const char *stop,
+ sopno startst,
+ sopno stopst)
{
int i;
- sopno ss; /* start sop of current subRE */
- sopno es; /* end sop of current subRE */
- const char *sp; /* start of string matched by it */
- const char *stp; /* string matched by it cannot pass here */
- const char *rest; /* start of rest of string */
- const char *tail; /* string unmatched by rest of RE */
- sopno ssub; /* start sop of subsubRE */
- sopno esub; /* end sop of subsubRE */
- const char *ssp; /* start of string matched by subsubRE */
- const char *sep; /* end of string matched by subsubRE */
- const char *oldssp; /* previous ssp */
-#ifndef NDEBUG
- const char *dp;
-#endif
+ sopno ss; /* start sop of current subRE */
+ sopno es; /* end sop of current subRE */
+ const char *sp; /* start of string matched by it */
+ const char *stp; /* string matched by it cannot pass here */
+ const char *rest; /* start of rest of string */
+ const char *tail; /* string unmatched by rest of RE */
+ sopno ssub; /* start sop of subsubRE */
+ sopno esub; /* end sop of subsubRE */
+ const char *ssp; /* start of string matched by subsubRE */
+ const char *sep; /* end of string matched by subsubRE */
+ const char *oldssp; /* previous ssp */
+ const char *dp __unused;
_DIAGASSERT(m != NULL);
_DIAGASSERT(start != NULL);
@@ -400,16 +477,22 @@
assert(nope);
break;
case OCHAR:
- sp++;
+ sp += XMBRTOWC(NULL, sp, (size_t)(stop - start),
+ &m->mbs, 0);
break;
case OBOL:
case OEOL:
case OBOW:
case OEOW:
+ case OBOS:
+ case OEOS:
+ case OWBND:
+ case ONWBND:
break;
case OANY:
case OANYOF:
- sp++;
+ sp += XMBRTOWC(NULL, sp, (size_t)(stop - start),
+ &m->mbs, 0);
break;
case OBACK_:
case O_BACK:
@@ -420,10 +503,10 @@
stp = stop;
for (;;) {
/* how long could this one be? */
- rest = slow(m, sp, stp, ss, es);
+ rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
- tail = slow(m, rest, stop, es, stopst);
+ tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
@@ -433,13 +516,8 @@
ssub = ss + 1;
esub = es - 1;
/* did innards match? */
- if (slow(m, sp, rest, ssub, esub) != NULL) {
-#ifdef NDEBUG
- (void)
-#else
- dp =
-#endif
- dissect(m, sp, rest, ssub, esub);
+ if (walk(m, sp, rest, ssub, esub, false) != NULL) {
+ dp = dissect(m, sp, rest, ssub, esub);
assert(dp == rest);
} else /* no */
assert(sp == rest);
@@ -449,10 +527,10 @@
stp = stop;
for (;;) {
/* how long could this one be? */
- rest = slow(m, sp, stp, ss, es);
+ rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
- tail = slow(m, rest, stop, es, stopst);
+ tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
@@ -464,7 +542,7 @@
ssp = sp;
oldssp = ssp;
for (;;) { /* find last match of innards */
- sep = slow(m, ssp, rest, ssub, esub);
+ sep = walk(m, ssp, rest, ssub, esub, false);
if (sep == NULL || sep == ssp)
break; /* failed or matched null */
oldssp = ssp; /* on to next try */
@@ -476,13 +554,8 @@
ssp = oldssp;
}
assert(sep == rest); /* must exhaust substring */
- assert(slow(m, ssp, sep, ssub, esub) == rest);
-#ifdef NDEBUG
- (void)
-#else
- dp =
-#endif
- dissect(m, ssp, sep, ssub, esub);
+ assert(walk(m, ssp, sep, ssub, esub, false) == rest);
+ dp = dissect(m, ssp, sep, ssub, esub);
assert(dp == sep);
sp = rest;
break;
@@ -490,10 +563,10 @@
stp = stop;
for (;;) {
/* how long could this one be? */
- rest = slow(m, sp, stp, ss, es);
+ rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
- tail = slow(m, rest, stop, es, stopst);
+ tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
@@ -504,7 +577,7 @@
esub = ss + OPND(m->g->strip[ss]) - 1;
assert(OP(m->g->strip[esub]) == OOR1);
for (;;) { /* find first matching branch */
- if (slow(m, sp, rest, ssub, esub) == rest)
+ if (walk(m, sp, rest, ssub, esub, false) == rest)
break; /* it matched all of it */
/* that one missed, try next one */
assert(OP(m->g->strip[esub]) == OOR1);
@@ -517,12 +590,7 @@
else
assert(OP(m->g->strip[esub]) == O_CH);
}
-#ifdef NDEBUG
- (void)
-#else
- dp =
-#endif
- dissect(m, sp, rest, ssub, esub);
+ dp = dissect(m, sp, rest, ssub, esub);
assert(dp == rest);
sp = rest;
break;
@@ -553,6 +621,17 @@
return(sp);
}
+#define ISBOW(m, sp) \
+ (sp < m->endp && ISWORD(*sp) && \
+ ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) || \
+ (sp > m->offp && !ISWORD(*(sp-1)))))
+#define ISEOW(m, sp) \
+ (((sp == m->endp && !(m->eflags&REG_NOTEOL)) || \
+ (sp < m->endp && *sp == '\n' && \
+ (m->g->cflags&REG_NEWLINE)) || \
+ (sp < m->endp && !ISWORD(*sp)) ) && \
+ (sp > m->beginp && ISWORD(*(sp-1)))) \
+
/*
- backref - figure out what matched what, figuring in back references
== static const char *backref(struct match *m, const char *start, \
@@ -560,25 +639,27 @@
*/
static const char * /* == stop (success) or NULL (failure) */
backref(
- struct match *m,
- const char *start,
- const char *stop,
- sopno startst,
- sopno stopst,
- sopno lev) /* PLUS nesting level */
+ struct match *m,
+ const char *start,
+ const char *stop,
+ sopno startst,
+ sopno stopst,
+ sopno lev, /* PLUS nesting level */
+ int rec)
{
int i;
- sopno ss; /* start sop of current subRE */
- const char *sp; /* start of string matched by it */
- sopno ssub; /* start sop of subsubRE */
- sopno esub; /* end sop of subsubRE */
- const char *ssp; /* start of string matched by subsubRE */
+ sopno ss; /* start sop of current subRE */
+ const char *sp; /* start of string matched by it */
+ sopno ssub; /* start sop of subsubRE */
+ sopno esub; /* end sop of subsubRE */
+ const char *ssp; /* start of string matched by subsubRE */
const char *dp;
size_t len;
int hard;
sop s;
regoff_t offsave;
cset *cs;
+ wint_t wc;
_DIAGASSERT(m != NULL);
_DIAGASSERT(start != NULL);
@@ -592,23 +673,46 @@
for (ss = startst; !hard && ss < stopst; ss++)
switch (OP(s = m->g->strip[ss])) {
case OCHAR:
- if (sp == stop || *sp++ != (char)OPND(s))
+ if (sp == stop)
+ return(NULL);
+ sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp),
+ &m->mbs, BADCHAR);
+ if (wc != (wint_t)OPND(s))
return(NULL);
break;
case OANY:
if (sp == stop)
return(NULL);
- sp++;
+ sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp),
+ &m->mbs, BADCHAR);
+ if (wc == BADCHAR)
+ return (NULL);
break;
case OANYOF:
+ if (sp == stop)
+ return (NULL);
cs = &m->g->sets[OPND(s)];
- if (sp == stop || !CHIN(cs, *sp++))
+ sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp),
+ &m->mbs, BADCHAR);
+ if (wc == BADCHAR || !CHIN(cs, wc))
+ return(NULL);
+ break;
+ case OBOS:
+ if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0)
+ { /* yes */ }
+ else
+ return(NULL);
+ break;
+ case OEOS:
+ if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0)
+ { /* yes */ }
+ else
return(NULL);
break;
case OBOL:
- if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
- (sp < m->endp && *(sp-1) == '\n' &&
- (m->g->cflags&REG_NEWLINE)) )
+ if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
+ (sp > m->offp && sp < m->endp &&
+ *(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE)))
{ /* yes */ }
else
return(NULL);
@@ -621,23 +725,29 @@
else
return(NULL);
break;
+ case OWBND:
+ if (ISBOW(m, sp) || ISEOW(m, sp))
+ { /* yes */ }
+ else
+ return(NULL);
+ break;
+ case ONWBND:
+ if (((sp == m->beginp) && !ISWORD(*sp)) ||
+ (sp == m->endp && !ISWORD(*(sp - 1))))
+ { /* yes, beginning/end of subject */ }
+ else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
+ { /* yes, beginning/end of subject */ }
+ else
+ return(NULL);
+ break;
case OBOW:
- if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
- (sp < m->endp && *(sp-1) == '\n' &&
- (m->g->cflags&REG_NEWLINE)) ||
- (sp > m->beginp &&
- !ISWORD(*(sp-1))) ) &&
- (sp < m->endp && ISWORD(*sp)) )
+ if (ISBOW(m, sp))
{ /* yes */ }
else
return(NULL);
break;
case OEOW:
- if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
- (sp < m->endp && *sp == '\n' &&
- (m->g->cflags&REG_NEWLINE)) ||
- (sp < m->endp && !ISWORD(*sp)) ) &&
- (sp > m->beginp && ISWORD(*(sp-1))) )
+ if (ISEOW(m, sp))
{ /* yes */ }
else
return(NULL);
@@ -671,50 +781,47 @@
case OBACK_: /* the vilest depths */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
- if (m->pmatch[i].rm_eo == (regoff_t)-1)
+ if (m->pmatch[i].rm_eo == -1)
return(NULL);
- assert(m->pmatch[i].rm_so != (regoff_t)-1);
- len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
- if (len == 0)
+ assert(m->pmatch[i].rm_so != -1);
+ len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
+ if (len == 0 && rec++ > MAX_RECURSION)
return(NULL);
assert(stop - m->beginp >= len);
if (sp > stop - len)
return(NULL); /* not enough left to match */
- ssp = m->offp + (size_t)m->pmatch[i].rm_so;
+ ssp = m->offp + m->pmatch[i].rm_so;
if (memcmp(sp, ssp, len) != 0)
return(NULL);
while (m->g->strip[ss] != SOP(O_BACK, i))
ss++;
- return(backref(m, sp+len, stop, ss+1, stopst, lev));
-
+ return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
case OQUEST_: /* to null or not */
- dp = backref(m, sp, stop, ss+1, stopst, lev);
+ dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp); /* not */
- return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
-
+ return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
case OPLUS_:
assert(m->lastpos != NULL);
assert(lev+1 <= m->g->nplus);
m->lastpos[lev+1] = sp;
- return(backref(m, sp, stop, ss+1, stopst, lev+1));
-
+ return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
case O_PLUS:
if (sp == m->lastpos[lev]) /* last pass matched null */
- return(backref(m, sp, stop, ss+1, stopst, lev-1));
+ return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
/* try another pass */
m->lastpos[lev] = sp;
- dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
+ dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
if (dp == NULL)
- dp = backref(m, sp, stop, ss+1, stopst, lev-1);
- return(dp);
-
+ return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
+ else
+ return(dp);
case OCH_: /* find the right one, if any */
ssub = ss + 1;
esub = ss + OPND(s) - 1;
assert(OP(m->g->strip[esub]) == OOR1);
for (;;) { /* find first matching branch */
- dp = backref(m, sp, stop, ssub, esub, lev);
+ dp = backref(m, sp, stop, ssub, esub, lev, rec);
if (dp != NULL)
return(dp);
/* that one missed, try next one */
@@ -729,29 +836,28 @@
else
assert(OP(m->g->strip[esub]) == O_CH);
}
-
+ /* NOTREACHED */
+ break;
case OLPAREN: /* must undo assignment if rest fails */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
offsave = m->pmatch[i].rm_so;
m->pmatch[i].rm_so = sp - m->offp;
- dp = backref(m, sp, stop, ss+1, stopst, lev);
+ dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp);
m->pmatch[i].rm_so = offsave;
return(NULL);
-
case ORPAREN: /* must undo assignment if rest fails */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
offsave = m->pmatch[i].rm_eo;
m->pmatch[i].rm_eo = sp - m->offp;
- dp = backref(m, sp, stop, ss+1, stopst, lev);
+ dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp);
m->pmatch[i].rm_eo = offsave;
return(NULL);
-
default: /* uh oh */
assert(nope);
break;
@@ -760,141 +866,66 @@
/* "can't happen" */
assert(nope);
/* NOTREACHED */
- return NULL;
+ return "shut up gcc";
}
/*
- - fast - step through the string at top speed
- == static const char *fast(struct match *m, const char *start, \
- == const char *stop, sopno startst, sopno stopst);
+ - walk - step through the string either quickly or slowly
+ == static const char *walk(struct match *m, const char *start, \
+ == const char *stop, sopno startst, sopno stopst, bool fast);
*/
-static const char * /* where tentative match ended, or NULL */
-fast(
- struct match *m,
- const char *start,
- const char *stop,
- sopno startst,
- sopno stopst)
+static const char * /* where it ended, or NULL */
+walk(struct match *m, const char *start, const char *stop, sopno startst,
+ sopno stopst, bool fast)
{
states st = m->st;
states fresh = m->fresh;
- states tmp = m->tmp;
- const char *p = start;
- int c = (start == m->beginp) ? OUT : *(start-1);
- int lastc; /* previous c */
- int flagch;
- size_t i;
- const char *coldp; /* last p after which no match was underway */
-
- _DIAGASSERT(m != NULL);
- _DIAGASSERT(start != NULL);
- _DIAGASSERT(stop != NULL);
-
- CLEAR(st);
- SET1(st, startst);
- st = step(m->g, startst, stopst, st, NOTHING, st);
- ASSIGN(fresh, st);
- SP("start", st, *p);
- coldp = NULL;
- for (;;) {
- /* next character */
- lastc = c;
- c = (p == m->endp) ? OUT : *p;
- if (EQ(st, fresh))
- coldp = p;
-
- /* is there an EOL and/or BOL between lastc and c? */
- flagch = '\0';
- i = 0;
- if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
- (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
- flagch = BOL;
- i = m->g->nbol;
- }
- if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
- (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
- flagch = (flagch == BOL) ? BOLEOL : EOL;
- i += m->g->neol;
- }
- if (i != 0) {
- for (; i > 0; i--)
- st = step(m->g, startst, stopst, st, flagch, st);
- SP("boleol", st, c);
- }
-
- /* how about a word boundary? */
- if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
- (c != OUT && ISWORD(c)) ) {
- flagch = BOW;
- }
- if ( (lastc != OUT && ISWORD(lastc)) &&
- (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
- flagch = EOW;
- }
- if (flagch == BOW || flagch == EOW) {
- st = step(m->g, startst, stopst, st, flagch, st);
- SP("boweow", st, c);
- }
-
- /* are we done? */
- if (ISSET(st, stopst) || p == stop)
- break; /* NOTE BREAK OUT */
-
- /* no, we must deal with this character */
- ASSIGN(tmp, st);
- ASSIGN(st, fresh);
- assert(c != OUT);
- st = step(m->g, startst, stopst, tmp, c, st);
- SP("aft", st, c);
- assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p++;
- }
-
- assert(coldp != NULL);
- m->coldp = coldp;
- if (ISSET(st, stopst))
- return(p+1);
- else
- return(NULL);
-}
-
-/*
- - slow - step through the string more deliberately
- == static const char *slow(struct match *m, const char *start, \
- == const char *stop, sopno startst, sopno stopst);
- */
-static const char * /* where it ended */
-slow(
- struct match *m,
- const char *start,
- const char *stop,
- sopno startst,
- sopno stopst)
-{
- states st = m->st;
states empty = m->empty;
states tmp = m->tmp;
const char *p = start;
- int c = (start == m->beginp) ? OUT : *(start-1);
- int lastc; /* previous c */
- int flagch;
- size_t i;
+ wint_t c;
+ wint_t lastc; /* previous c */
+ wint_t flagch;
+ int sflags;
const char *matchp; /* last p at which a match ended */
+ size_t i, clen;
_DIAGASSERT(m != NULL);
_DIAGASSERT(start != NULL);
_DIAGASSERT(stop != NULL);
- AT("slow", start, stop, startst, stopst);
+ sflags = 0;
+ AT("walk", start, stop, startst, stopst);
CLEAR(st);
SET1(st, startst);
SP("sstart", st, *p);
- st = step(m->g, startst, stopst, st, NOTHING, st);
+ st = step(m->g, startst, stopst, st, NOTHING, st, sflags);
+ if (fast)
+ ASSIGN(fresh, st);
matchp = NULL;
+ if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
+ c = OUT;
+ else {
+ /*
+ * XXX Wrong if the previous character was multi-byte.
+ * Newline never is (in encodings supported by FreeBSD),
+ * so this only breaks the ISWORD tests below.
+ */
+ c = (uch)*(start - 1);
+ }
for (;;) {
/* next character */
lastc = c;
- c = (p == m->endp) ? OUT : *p;
+ sflags = 0;
+ if (p == m->endp) {
+ c = OUT;
+ clen = 0;
+ } else
+ clen = XMBRTOWC(&c, p, (size_t)(m->endp - p),
+ &m->mbs, BADCHAR);
+
+ if (fast && EQ(st, fresh))
+ matchp = p;
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
@@ -909,9 +940,20 @@
flagch = (flagch == BOL) ? BOLEOL : EOL;
i += m->g->neol;
}
+ if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) {
+ sflags |= SBOS;
+ /* Step one more for BOS. */
+ i++;
+ }
+ if (c == OUT && (m->eflags & REG_NOTEOL) == 0) {
+ sflags |= SEOS;
+ /* Step one more for EOS. */
+ i++;
+ }
if (i != 0) {
for (; i > 0; i--)
- st = step(m->g, startst, stopst, st, flagch, st);
+ st = step(m->g, startst, stopst, st, flagch, st,
+ sflags);
SP("sboleol", st, c);
}
@@ -925,52 +967,78 @@
flagch = EOW;
}
if (flagch == BOW || flagch == EOW) {
- st = step(m->g, startst, stopst, st, flagch, st);
+ st = step(m->g, startst, stopst, st, flagch, st, sflags);
SP("sboweow", st, c);
}
+ if (lastc != OUT && c != OUT &&
+ ISWORD(lastc) == ISWORD(c)) {
+ flagch = NWBND;
+ } else if ((lastc == OUT && !ISWORD(c)) ||
+ (c == OUT && !ISWORD(lastc))) {
+ flagch = NWBND;
+ }
+ if (flagch == NWBND) {
+ st = step(m->g, startst, stopst, st, flagch, st, sflags);
+ SP("snwbnd", st, c);
+ }
/* are we done? */
- if (ISSET(st, stopst))
- matchp = p;
- if (EQ(st, empty) || p == stop)
+ if (ISSET(st, stopst)) {
+ if (fast)
+ break;
+ else
+ matchp = p;
+ }
+ if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p))
break; /* NOTE BREAK OUT */
/* no, we must deal with this character */
ASSIGN(tmp, st);
- ASSIGN(st, empty);
+ if (fast)
+ ASSIGN(st, fresh);
+ else
+ ASSIGN(st, empty);
assert(c != OUT);
- st = step(m->g, startst, stopst, tmp, c, st);
+ st = step(m->g, startst, stopst, tmp, c, st, sflags);
SP("saft", st, c);
- assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p++;
+ assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags),
+ st));
+ p += clen;
}
- return(matchp);
+ if (fast) {
+ assert(matchp != NULL);
+ m->coldp = matchp;
+ if (ISSET(st, stopst))
+ return (p + XMBRTOWC(NULL, p, (size_t)(stop - p),
+ &m->mbs, 0));
+ else
+ return (NULL);
+ } else
+ return (matchp);
}
-
/*
- step - map set of states reachable before char to set reachable after
== static states step(struct re_guts *g, sopno start, sopno stop, \
== states bef, int ch, states aft);
- == #define BOL (OUT+1)
- == #define EOL (BOL+1)
- == #define BOLEOL (BOL+2)
- == #define NOTHING (BOL+3)
- == #define BOW (BOL+4)
- == #define EOW (BOL+5)
- == #define CODEMAX (BOL+5) // highest code used
- == #define NONCHAR(c) ((c) > CHAR_MAX)
- == #define NNONCHAR (CODEMAX-CHAR_MAX)
+ == #define BOL (OUT-1)
+ == #define EOL (BOL-1)
+ == #define BOLEOL (BOL-2)
+ == #define NOTHING (BOL-3)
+ == #define BOW (BOL-4)
+ == #define EOW (BOL-5)
+ == #define BADCHAR (BOL-6)
+ == #define NONCHAR(c) ((c) <= OUT)
*/
static states
-step(
- struct re_guts *g,
- sopno start, /* start state within strip */
- sopno stop, /* state after stop state within strip */
- states bef, /* states reachable before */
- int ch, /* character or NONCHAR code */
- states aft) /* states already known reachable after */
+step(struct re_guts *g,
+ sopno start, /* start state within strip */
+ sopno stop, /* state after stop state within strip */
+ states bef, /* states reachable before */
+ wint_t ch, /* character or NONCHAR code */
+ states aft, /* states already known reachable after */
+ int sflags) /* state flags */
{
cset *cs;
sop s;
@@ -989,8 +1057,16 @@
break;
case OCHAR:
/* only characters can match */
- assert(!NONCHAR(ch) || ch != (char)OPND(s));
- if (ch == (char)OPND(s))
+ assert(!NONCHAR(ch) || ch != OPND(s));
+ if (ch == (wint_t)OPND(s))
+ FWD(aft, bef, 1);
+ break;
+ case OBOS:
+ if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0)
+ FWD(aft, bef, 1);
+ break;
+ case OEOS:
+ if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0)
FWD(aft, bef, 1);
break;
case OBOL:
@@ -1009,6 +1085,14 @@
if (ch == EOW)
FWD(aft, bef, 1);
break;
+ case OWBND:
+ if (ch == BOW || ch == EOW)
+ FWD(aft, bef, 1);
+ break;
+ case ONWBND:
+ if (ch == NWBND)
+ FWD(aft, aft, 1);
+ break;
case OANY:
if (!NONCHAR(ch))
FWD(aft, bef, 1);
@@ -1054,10 +1138,10 @@
case OOR1: /* done a branch, find the O_CH */
if (ISSTATEIN(aft, here)) {
for (look = 1;
- OP(s = g->strip[pc+look]) != O_CH;
- look += OPND(s))
+ OP(s = g->strip[pc+look]) != O_CH;
+ look += OPND(s))
assert(OP(s) == OOR2);
- FWD(aft, aft, look);
+ FWD(aft, aft, look + 1);
}
break;
case OOR2: /* propagate OCH_'s marking */
@@ -1083,20 +1167,19 @@
/*
- print - print a set of states
== #ifdef REDEBUG
- == static void print(struct match *m, char *caption, states st, \
+ == static void print(struct match *m, const char *caption, states st, \
== int ch, FILE *d);
== #endif
*/
static void
-print(
- struct match *m,
- char *caption,
- states st,
- int ch,
- FILE *d)
+print(struct match *m,
+ const char *caption,
+ states st,
+ int ch,
+ FILE *d)
{
struct re_guts *g = m->g;
- int i;
+ sopno i;
int first = 1;
_DIAGASSERT(m != NULL);
@@ -1112,27 +1195,26 @@
fprintf(d, " %s", pchar(ch));
for (i = 0; i < g->nstates; i++)
if (ISSET(st, i)) {
- fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
+ fprintf(d, "%s%lu", (first) ? "\t" : ", ", i);
first = 0;
}
fprintf(d, "\n");
}
-/*
+/*
- at - print current situation
== #ifdef REDEBUG
- == static void at(struct match *m, char *title, char *start, char *stop, \
- == sopno startst, sopno stopst);
+ == static void at(struct match *m, const char *title, const char *start, \
+ == const char *stop, sopno startst, sopno stopst);
== #endif
*/
static void
-at(
- struct match *m,
- char *title,
- char *start,
- char *stop,
- sopno startst,
- sopno stopst)
+at( struct match *m,
+ const char *title,
+ const char *start,
+ const char *stop,
+ sopno startst,
+ sopno stopst)
{
_DIAGASSERT(m != NULL);
@@ -1153,7 +1235,7 @@
/*
- pchar - make a character printable
== #ifdef REDEBUG
- == static char *pchar(int ch);
+ == static const char *pchar(int ch);
== #endif
*
* Is this identical to regchar() over in debug.c? Well, yes. But a
@@ -1161,28 +1243,26 @@
* a matching debug.o, and this is convenient. It all disappears in
* the non-debug compilation anyway, so it doesn't matter much.
*/
-static char * /* -> representation */
-pchar(
- int ch)
+static const char * /* -> representation */
+pchar(int ch)
{
static char pbuf[10];
- if (isprint(ch) || ch == ' ')
- (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
+ if (isprint((uch)ch) || ch == ' ')
+ snprintf(pbuf, sizeof(pbuf), "%c", ch);
else
- (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
+ snprintf(pbuf, sizeof(pbuf), "\\%o", ch);
return(pbuf);
}
#endif
#endif
+#undef stepback
#undef matcher
-#undef fast
-#undef slow
+#undef walk
#undef dissect
#undef backref
#undef step
#undef print
#undef at
#undef match
-#undef nope
diff --git a/libc/upstream-netbsd/lib/libc/regex/regcomp.c b/libc/upstream-netbsd/lib/libc/regex/regcomp.c
index 4a0d99a..957f8ac 100644
--- a/libc/upstream-netbsd/lib/libc/regex/regcomp.c
+++ b/libc/upstream-netbsd/lib/libc/regex/regcomp.c
@@ -1,9 +1,17 @@
-/* $NetBSD: regcomp.c,v 1.38 2019/02/07 22:22:31 christos Exp $ */
+/* $NetBSD: regcomp.c,v 1.46 2021/03/11 15:00:29 christos Exp $ */
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ * Portions of this software were developed by David Chisnall
+ * under sponsorship from the FreeBSD Foundation.
+ *
* This code is derived from software contributed to Berkeley by
* Henry Spencer.
*
@@ -34,74 +42,65 @@
* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
*/
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
- */
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
-#else
-__RCSID("$NetBSD: regcomp.c,v 1.38 2019/02/07 22:22:31 christos Exp $");
+__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
#endif
-#endif /* LIBC_SCCS and not lint */
+__RCSID("$NetBSD: regcomp.c,v 1.46 2021/03/11 15:00:29 christos Exp $");
+
+#define _OPENBSD_SOURCE
+
+#ifndef LIBHACK
+#define REGEX_GNU_EXTENSIONS
#include "namespace.h"
+#endif
#include <sys/types.h>
-
-#include <assert.h>
+#include <stdio.h>
+#include <string.h>
#include <ctype.h>
#include <limits.h>
-#include <stdio.h>
#include <stdlib.h>
-#include <string.h>
#include <regex.h>
+#include <stdbool.h>
-#ifdef __weak_alias
+#if defined(__weak_alias) && !defined(LIBHACK)
__weak_alias(regcomp,_regcomp)
#endif
+#ifdef REGEX_LIBC_COLLATE
+#include "collate.h"
+#endif
+
#include "utils.h"
#include "regex2.h"
-#include "cclass.h"
#include "cname.h"
/*
+ * Branching context, used to keep track of branch state for all of the branch-
+ * aware functions. In addition to keeping track of branch positions for the
+ * p_branch_* functions, we use this to simplify some clumsiness in BREs for
+ * detection of whether ^ is acting as an anchor or being used erroneously and
+ * also for whether we're in a sub-expression or not.
+ */
+struct branchc {
+ sopno start;
+ sopno back;
+ sopno fwd;
+
+ int nbranch;
+ int nchain;
+ bool outer;
+ bool terminate;
+};
+
+/*
* parse structure, passed up and down to avoid global variables and
* other clumsinesses
*/
@@ -109,6 +108,7 @@
const char *next; /* next character in RE */
const char *end; /* end of string (-> NUL normally) */
int error; /* has an error been seen? */
+ int gnuext;
sop *strip; /* malloced strip */
sopno ssize; /* malloced strip size (allocated) */
sopno slen; /* malloced strip length (used) */
@@ -117,56 +117,70 @@
# define NPAREN 10 /* we need to remember () 1-9 for back refs */
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
sopno pend[NPAREN]; /* -> ) ([0] unused) */
+ bool allowbranch; /* can this expression branch? */
+ bool bre; /* convenience; is this a BRE? */
+ int pflags; /* other parsing flags -- legacy escapes? */
+ bool (*parse_expr)(struct parse *, struct branchc *);
+ void (*pre_parse)(struct parse *, struct branchc *);
+ void (*post_parse)(struct parse *, struct branchc *);
};
+#define PFLAG_LEGACY_ESC 0x00000001
+
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === regcomp.c === */
-static void p_ere(struct parse *p, int stop, size_t reclimit);
-static void p_ere_exp(struct parse *p, size_t reclimit);
+static bool p_ere_exp(struct parse *p, struct branchc *bc);
static void p_str(struct parse *p);
-static void p_bre(struct parse *p, int end1, int end2, size_t reclimit);
-static int p_simp_re(struct parse *p, int starordinary, size_t reclimit);
+static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
+static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
+static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
+static bool p_branch_empty(struct parse *p, struct branchc *bc);
+static bool p_branch_do(struct parse *p, struct branchc *bc);
+static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
+static void p_bre_post_parse(struct parse *p, struct branchc *bc);
+static void p_re(struct parse *p, int end1, int end2);
+static bool p_simp_re(struct parse *p, struct branchc *bc);
static int p_count(struct parse *p);
static void p_bracket(struct parse *p);
+static int p_range_cmp(wchar_t c1, wchar_t c2);
static void p_b_term(struct parse *p, cset *cs);
+#ifdef REGEX_GNU_EXTENSIONS
+static int p_b_pseudoclass(struct parse *p, char c);
+#endif
static void p_b_cclass(struct parse *p, cset *cs);
+static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
static void p_b_eclass(struct parse *p, cset *cs);
-static char p_b_symbol(struct parse *p);
-static char p_b_coll_elem(struct parse *p, int endc);
-static int othercase(int ch);
-static void bothcases(struct parse *p, int ch);
-static void ordinary(struct parse *p, int ch);
+static wint_t p_b_symbol(struct parse *p);
+static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
+static bool may_escape(struct parse *p, const wint_t ch);
+static wint_t othercase(wint_t ch);
+static void bothcases(struct parse *p, wint_t ch);
+static void ordinary(struct parse *p, wint_t ch);
static void nonnewline(struct parse *p);
-static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit);
+static void repeat(struct parse *p, sopno start, int from, int to);
static int seterr(struct parse *p, int e);
static cset *allocset(struct parse *p);
static void freeset(struct parse *p, cset *cs);
-static sopno freezeset(struct parse *p, cset *cs);
-static int firstch(struct parse *p, cset *cs);
-static int nch(struct parse *p, cset *cs);
-static void mcadd(struct parse *p, cset *cs, const char *cp);
-#if 0
-static void mcsub(cset *cs, char *cp);
-static int mcin(cset *cs, char *cp);
-static char *mcfind(cset *cs, char *cp);
-#endif
-static void mcinvert(struct parse *p, cset *cs);
-static void mccase(struct parse *p, cset *cs);
-static int isinsets(struct re_guts *g, int c);
-static int samesets(struct re_guts *g, int c1, int c2);
-static void categorize(struct parse *p, struct re_guts *g);
+static void CHadd(struct parse *p, cset *cs, wint_t ch);
+static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
+static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
+static wint_t singleton(cset *cs);
static sopno dupl(struct parse *p, sopno start, sopno finish);
-static void doemit(struct parse *p, sop op, sopno opnd);
-static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos);
-static void dofwd(struct parse *p, sopno pos, sopno value);
+static void doemit(struct parse *p, sop op, size_t opnd);
+static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
+static void dofwd(struct parse *p, sopno pos, sop value);
static int enlarge(struct parse *p, sopno size);
static void stripsnug(struct parse *p, struct re_guts *g);
static void findmust(struct parse *p, struct re_guts *g);
+static int altoffset(sop *scan, int offset);
+static void computejumps(struct parse *p, struct re_guts *g);
+static void computematchjumps(struct parse *p, struct re_guts *g);
static sopno pluscount(struct parse *p, struct re_guts *g);
+static wint_t wgetnext(struct parse *p);
#ifdef __cplusplus
}
@@ -185,19 +199,22 @@
#define MORE2() (p->next+1 < p->end)
#define SEE(c) (MORE() && PEEK() == (c))
#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
+#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
+#define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
#define NEXT() (p->next++)
#define NEXT2() (p->next += 2)
#define NEXTn(n) (p->next += (n))
#define GETNEXT() (*p->next++)
+#define WGETNEXT() wgetnext(p)
#define SETERROR(e) seterr(p, (e))
-#define REQUIRE(co, e) (void) ((co) || SETERROR(e))
+#define REQUIRE(co, e) ((co) || SETERROR(e))
#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
-#define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e))
+#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
-#define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd)
-#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
+#define EMIT(op, sopnd) doemit(p, (op), (sopnd))
+#define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos)
#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
#define HERE() (p->slen)
@@ -205,42 +222,62 @@
#define THERETHERE() (p->slen - 2)
#define DROP(n) (p->slen -= (n))
-#ifndef NDEBUG
-static int never = 0; /* for use in asserts; shuts lint up */
-#else
-#define never 0 /* some <assert.h>s have bugs too */
+/* Macro used by computejump()/computematchjump() */
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
#endif
-#define MEMLIMIT 0x8000000
-#define MEMSIZE(p) \
- ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
- (p)->ncsalloc * sizeof(cset) + \
- (p)->ssize * sizeof(sop))
-#define RECLIMIT 256
+#ifndef NLS
+static const struct {
+ const char *name;
+ int (*func)(int);
+} wctypes[] = {
+#define ADD(x) { .name = # x, .func = is ## x }
+ ADD(alnum),
+ ADD(alpha),
+ ADD(blank),
+ ADD(cntrl),
+ ADD(digit),
+ ADD(graph),
+ ADD(lower),
+ ADD(print),
+ ADD(punct),
+ ADD(space),
+ ADD(upper),
+ ADD(xdigit),
+#undef ADD
+};
-/*
- - regcomp - interface for parser and compilation
- = extern int regcomp(regex_t *, const char *, int);
- = #define REG_BASIC 0000
- = #define REG_EXTENDED 0001
- = #define REG_ICASE 0002
- = #define REG_NOSUB 0004
- = #define REG_NEWLINE 0010
- = #define REG_NOSPEC 0020
- = #define REG_PEND 0040
- = #define REG_DUMP 0200
- */
-int /* 0 success, otherwise REG_something */
-regcomp(
- regex_t *preg,
- const char *pattern,
- int cflags)
+wctype_t
+__regex_wctype(const char *str)
+{
+ for (size_t i = 0; i < __arraycount(wctypes); i++) {
+ if (strcmp(wctypes[i].name, str) == 0)
+ return (wctype_t)(i + 1);
+ }
+ return (wctype_t)0;
+}
+
+int
+__regex_iswctype(wint_t c, wctype_t ct)
+{
+ if (ct == 0)
+ return 0;
+ return (*wctypes[ct - 1].func)(c);
+}
+#endif
+
+static int /* 0 success, otherwise REG_something */
+regcomp_internal(regex_t * __restrict preg,
+ const char * __restrict pattern,
+ int cflags, int pflags)
{
struct parse pa;
struct re_guts *g;
struct parse *p = &pa;
int i;
size_t len;
+ size_t maxlen;
#ifdef REDEBUG
# define GOODFLAGS(f) (f)
#else
@@ -262,11 +299,27 @@
len = strlen(pattern);
/* do the mallocs early so failure handling is easy */
- g = malloc(sizeof(struct re_guts) + (NC - 1) * sizeof(cat_t));
+ g = malloc(sizeof(*g));
if (g == NULL)
return(REG_ESPACE);
- p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
- p->strip = calloc(p->ssize, sizeof(sop));
+ /*
+ * Limit the pattern space to avoid a 32-bit overflow on buffer
+ * extension. Also avoid any signed overflow in case of conversion
+ * so make the real limit based on a 31-bit overflow.
+ *
+ * Likely not applicable on 64-bit systems but handle the case
+ * generically (who are we to stop people from using ~715MB+
+ * patterns?).
+ */
+ maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
+ if (len >= maxlen) {
+ free(g);
+ return(REG_ESPACE);
+ }
+ p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */
+ assert(p->ssize >= len);
+
+ p->strip = calloc(p->ssize, sizeof(*p->strip));
p->slen = 0;
if (p->strip == NULL) {
free(g);
@@ -275,46 +328,74 @@
/* set things up */
p->g = g;
- p->next = pattern;
+ p->next = pattern; /* convenience; we do not modify it */
p->end = p->next + len;
p->error = 0;
p->ncsalloc = 0;
+ p->pflags = pflags;
for (i = 0; i < NPAREN; i++) {
p->pbegin[i] = 0;
p->pend[i] = 0;
}
- g->csetsize = NC;
+#ifdef REGEX_GNU_EXTENSIONS
+ if ((cflags & REG_GNU) == 0) {
+ p->gnuext = false;
+ p->allowbranch = (cflags & REG_EXTENDED) != 0;
+ } else
+ p->gnuext = p->allowbranch = true;
+#else
+ p->gnuext = false;
+ p->allowbranch = (cflags & REG_EXTENDED) != 0;
+#endif
+ if (cflags & REG_EXTENDED) {
+ p->bre = false;
+ p->parse_expr = p_ere_exp;
+ p->pre_parse = NULL;
+ p->post_parse = NULL;
+ } else {
+ p->bre = true;
+ p->parse_expr = p_simp_re;
+ p->pre_parse = p_bre_pre_parse;
+ p->post_parse = p_bre_post_parse;
+ }
g->sets = NULL;
- g->setbits = NULL;
g->ncsets = 0;
g->cflags = cflags;
g->iflags = 0;
g->nbol = 0;
g->neol = 0;
g->must = NULL;
+ g->moffset = -1;
+ g->charjump = NULL;
+ g->matchjump = NULL;
g->mlen = 0;
g->nsub = 0;
- g->ncategories = 1; /* category 0 is "everything else" */
- g->categories = &g->catspace[-(CHAR_MIN)];
- (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
g->backrefs = 0;
/* do it */
EMIT(OEND, 0);
g->firststate = THERE();
- if (cflags&REG_EXTENDED)
- p_ere(p, OUT, 0);
- else if (cflags&REG_NOSPEC)
+ if (cflags & REG_NOSPEC)
p_str(p);
else
- p_bre(p, OUT, OUT, 0);
+ p_re(p, OUT, OUT);
EMIT(OEND, 0);
g->laststate = THERE();
/* tidy up loose ends and fill things in */
- categorize(p, g);
stripsnug(p, g);
findmust(p, g);
+ /* only use Boyer-Moore algorithm if the pattern is bigger
+ * than three characters
+ */
+ if(g->mlen > 3) {
+ computejumps(p, g);
+ computematchjumps(p, g);
+ if(g->matchjump == NULL && g->charjump != NULL) {
+ free(g->charjump);
+ g->charjump = NULL;
+ }
+ }
g->nplus = pluscount(p, g);
g->magic = MAGIC2;
preg->re_nsub = g->nsub;
@@ -333,97 +414,72 @@
}
/*
- - p_ere - ERE parser top level, concatenation and alternation
- == static void p_ere(struct parse *p, int stop, size_t reclimit);
+ - regcomp - interface for parser and compilation
+ = extern int regcomp(regex_t *, const char *, int);
+ = #define REG_BASIC 0000
+ = #define REG_EXTENDED 0001
+ = #define REG_ICASE 0002
+ = #define REG_NOSUB 0004
+ = #define REG_NEWLINE 0010
+ = #define REG_NOSPEC 0020
+ = #define REG_PEND 0040
+ = #define REG_DUMP 0200
*/
-static void
-p_ere(
- struct parse *p,
- int stop, /* character this ERE should end at */
- size_t reclimit)
+int /* 0 success, otherwise REG_something */
+regcomp(regex_t * __restrict preg,
+ const char * __restrict pattern,
+ int cflags)
{
- char c;
- sopno prevback = 0; /* pacify gcc */
- sopno prevfwd = 0; /* pacify gcc */
- sopno conc;
- int first = 1; /* is this the first alternative? */
- _DIAGASSERT(p != NULL);
-
- if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
- p->error = REG_ESPACE;
- return;
- }
-
- for (;;) {
- /* do a bunch of concatenated expressions */
- conc = HERE();
- while (MORE() && (c = PEEK()) != '|' && c != stop)
- p_ere_exp(p, reclimit);
- REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
-
- if (!EAT('|'))
- break; /* NOTE BREAK OUT */
-
- if (first) {
- INSERT(OCH_, conc); /* offset is wrong */
- prevfwd = conc;
- prevback = conc;
- first = 0;
- }
- ASTERN(OOR1, prevback);
- prevback = THERE();
- AHEAD(prevfwd); /* fix previous offset */
- prevfwd = HERE();
- EMIT(OOR2, 0); /* offset is very wrong */
- }
-
- if (!first) { /* tail-end fixups */
- AHEAD(prevfwd);
- ASTERN(O_CH, prevback);
- }
-
- assert(!MORE() || SEE(stop));
+ return (regcomp_internal(preg, pattern, cflags, 0));
}
/*
- - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
- == static void p_ere_exp(struct parse *p, size_t reclimit);
+ - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
+ - return whether we should terminate or not
+ == static bool p_ere_exp(struct parse *p);
*/
-static void
-p_ere_exp(
- struct parse *p,
- size_t reclimit)
+static bool
+p_ere_exp(struct parse *p, struct branchc *bc)
{
char c;
+ wint_t wc;
sopno pos;
int count;
int count2;
+#ifdef REGEX_GNU_EXTENSIONS
+ size_t i;
+ int handled;
+#endif
sopno subno;
int wascaret = 0;
_DIAGASSERT(p != NULL);
+ (void)bc;
assert(MORE()); /* caller should have ensured this */
c = GETNEXT();
+#ifdef REGEX_GNU_EXTENSIONS
+ handled = 0;
+#endif
pos = HERE();
switch (c) {
case '(':
- REQUIRE(MORE(), REG_EPAREN);
+ (void)REQUIRE(MORE(), REG_EPAREN);
p->g->nsub++;
- subno = p->g->nsub;
+ subno = (sopno)p->g->nsub;
if (subno < NPAREN)
p->pbegin[subno] = HERE();
EMIT(OLPAREN, subno);
if (!SEE(')'))
- p_ere(p, ')', reclimit);
+ p_re(p, ')', IGN);
if (subno < NPAREN) {
p->pend[subno] = HERE();
assert(p->pend[subno] != 0);
}
EMIT(ORPAREN, subno);
- MUSTEAT(')', REG_EPAREN);
+ (void)MUSTEAT(')', REG_EPAREN);
break;
#ifndef POSIX_MISTAKE
case ')': /* happens only if no current unmatched ( */
@@ -454,6 +510,7 @@
case '*':
case '+':
case '?':
+ case '{':
SETERROR(REG_BADRPT);
break;
case '.':
@@ -466,30 +523,118 @@
p_bracket(p);
break;
case '\\':
- REQUIRE(MORE(), REG_EESCAPE);
- c = GETNEXT();
- ordinary(p, c);
+ (void)REQUIRE(MORE(), REG_EESCAPE);
+ wc = WGETNEXT();
+#ifdef REGEX_GNU_EXTENSIONS
+ if (p->gnuext) {
+ handled = 1;
+ switch (wc) {
+ case '`':
+ EMIT(OBOS, 0);
+ break;
+ case '\'':
+ EMIT(OEOS, 0);
+ break;
+ case 'B':
+ EMIT(ONWBND, 0);
+ break;
+ case 'b':
+ EMIT(OWBND, 0);
+ break;
+ case 'W':
+ case 'w':
+ case 'S':
+ case 's':
+ p_b_pseudoclass(p, wc);
+ break;
+ case 'a':
+ ordinary(p, '\a');
+ break;
+ case 'e':
+ ordinary(p, '\e');
+ break;
+ case 'f':
+ ordinary(p, '\f');
+ break;
+ case 'n':
+ ordinary(p, '\n');
+ break;
+ case 'r':
+ ordinary(p, '\r');
+ break;
+ case 't':
+ ordinary(p, '\t');
+ break;
+ case 'v':
+ ordinary(p, '\v');
+ break;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ i = wc - '0';
+ assert(i < NPAREN);
+ if (p->pend[i] != 0) {
+ assert(i <= p->g->nsub);
+ EMIT(OBACK_, i);
+ assert(p->pbegin[i] != 0);
+ assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
+ assert(OP(p->strip[p->pend[i]]) == ORPAREN);
+ (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
+ EMIT(O_BACK, i);
+ } else
+ SETERROR(REG_ESUBREG);
+ p->g->backrefs = 1;
+ break;
+ default:
+ handled = 0;
+ }
+ /* Don't proceed to the POSIX bits if we've already handled it */
+ if (handled)
+ break;
+ }
+#endif
+ switch (wc) {
+ case '<':
+ EMIT(OBOW, 0);
+ break;
+ case '>':
+ EMIT(OEOW, 0);
+ break;
+ default:
+ if (may_escape(p, wc))
+ ordinary(p, wc);
+ else
+ SETERROR(REG_EESCAPE);
+ break;
+ }
break;
- case '{': /* okay as ordinary except if digit follows */
- REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT);
- /* FALLTHROUGH */
default:
if (p->error != 0)
- return;
- ordinary(p, c);
+ return (false);
+ p->next--;
+ wc = WGETNEXT();
+ ordinary(p, wc);
break;
}
if (!MORE())
- return;
+ return (false);
c = PEEK();
/* we call { a repetition if followed by a digit */
- if (!( c == '*' || c == '+' || c == '?' ||
- (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ))
- return; /* no repetition, we're done */
+ if (!( c == '*' || c == '+' || c == '?' || c == '{'))
+ return (false); /* no repetition, we're done */
+ else if (c == '{')
+ (void)REQUIRE(MORE2() && \
+ (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
NEXT();
- REQUIRE(!wascaret, REG_BADRPT);
+ (void)REQUIRE(!wascaret, REG_BADRPT);
switch (c) {
case '*': /* implemented as +? */
/* this case does not require the (y|) trick, noKLUDGE */
@@ -514,30 +659,31 @@
case '{':
count = p_count(p);
if (EAT(',')) {
- if (isdigit((unsigned char)PEEK())) {
+ if (isdigit((uch)PEEK())) {
count2 = p_count(p);
- REQUIRE(count <= count2, REG_BADBR);
+ (void)REQUIRE(count <= count2, REG_BADBR);
} else /* single number with comma */
count2 = INFINITY;
} else /* just a single number */
count2 = count;
- repeat(p, pos, count, count2, 0);
+ repeat(p, pos, count, count2);
if (!EAT('}')) { /* error heuristics */
while (MORE() && PEEK() != '}')
NEXT();
- REQUIRE(MORE(), REG_EBRACE);
+ (void)REQUIRE(MORE(), REG_EBRACE);
SETERROR(REG_BADBR);
}
break;
}
if (!MORE())
- return;
+ return (false);
c = PEEK();
if (!( c == '*' || c == '+' || c == '?' ||
- (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) )
- return;
+ (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
+ return (false);
SETERROR(REG_BADRPT);
+ return (false);
}
/*
@@ -545,159 +691,350 @@
== static void p_str(struct parse *p);
*/
static void
-p_str(
- struct parse *p)
+p_str(struct parse *p)
{
-
- _DIAGASSERT(p != NULL);
-
- REQUIRE(MORE(), REG_EMPTY);
+ (void)REQUIRE(MORE(), REG_EMPTY);
while (MORE())
- ordinary(p, GETNEXT());
+ ordinary(p, WGETNEXT());
}
/*
- - p_bre - BRE parser top level, anchoring and concatenation
- == static void p_bre(struct parse *p, int end1, \
- == int end2, size_t reclimit);
- * Giving end1 as OUT essentially eliminates the end1/end2 check.
- *
- * This implementation is a bit of a kludge, in that a trailing $ is first
- * taken as an ordinary character and then revised to be an anchor. The
- * only undesirable side effect is that '$' gets included as a character
- * category in such cases. This is fairly harmless; not worth fixing.
- * The amount of lookahead needed to avoid this kludge is excessive.
+ * Eat consecutive branch delimiters for the kind of expression that we are
+ * parsing, return the number of delimiters that we ate.
+ */
+static int
+p_branch_eat_delim(struct parse *p, struct branchc *bc)
+{
+ int nskip;
+
+ (void)bc;
+ nskip = 0;
+ while (EATSPEC('|'))
+ ++nskip;
+ return (nskip);
+}
+
+/*
+ * Insert necessary branch book-keeping operations. This emits a
+ * bogus 'next' offset, since we still have more to parse
*/
static void
-p_bre(
- struct parse *p,
- int end1, /* first terminating character */
- int end2, /* second terminating character */
- size_t reclimit)
+p_branch_ins_offset(struct parse *p, struct branchc *bc)
{
- sopno start;
- int first = 1; /* first subexpression? */
- int wasdollar = 0;
- _DIAGASSERT(p != NULL);
-
- if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
- p->error = REG_ESPACE;
- return;
+ if (bc->nbranch == 0) {
+ INSERT(OCH_, bc->start); /* offset is wrong */
+ bc->fwd = bc->start;
+ bc->back = bc->start;
}
- start = HERE();
+ ASTERN(OOR1, bc->back);
+ bc->back = THERE();
+ AHEAD(bc->fwd); /* fix previous offset */
+ bc->fwd = HERE();
+ EMIT(OOR2, 0); /* offset is very wrong */
+ ++bc->nbranch;
+}
+/*
+ * Fix the offset of the tail branch, if we actually had any branches.
+ * This is to correct the bogus placeholder offset that we use.
+ */
+static void
+p_branch_fix_tail(struct parse *p, struct branchc *bc)
+{
+
+ /* Fix bogus offset at the tail if we actually have branches */
+ if (bc->nbranch > 0) {
+ AHEAD(bc->fwd);
+ ASTERN(O_CH, bc->back);
+ }
+}
+
+/*
+ * Signal to the parser that an empty branch has been encountered; this will,
+ * in the future, be used to allow for more permissive behavior with empty
+ * branches. The return value should indicate whether parsing may continue
+ * or not.
+ */
+static bool
+p_branch_empty(struct parse *p, struct branchc *bc)
+{
+
+ (void)bc;
+ SETERROR(REG_EMPTY);
+ return (false);
+}
+
+/*
+ * Take care of any branching requirements. This includes inserting the
+ * appropriate branching instructions as well as eating all of the branch
+ * delimiters until we either run out of pattern or need to parse more pattern.
+ */
+static bool
+p_branch_do(struct parse *p, struct branchc *bc)
+{
+ int ate = 0;
+
+ ate = p_branch_eat_delim(p, bc);
+ if (ate == 0)
+ return (false);
+ else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
+ /*
+ * Halt parsing only if we have an empty branch and p_branch_empty
+ * indicates that we must not continue. In the future, this will not
+ * necessarily be an error.
+ */
+ return (false);
+ p_branch_ins_offset(p, bc);
+
+ return (true);
+}
+
+static void
+p_bre_pre_parse(struct parse *p, struct branchc *bc)
+{
+
+ (void)bc;
+ /*
+ * Does not move cleanly into expression parser because of
+ * ordinary interpretation of * at the beginning position of
+ * an expression.
+ */
if (EAT('^')) {
EMIT(OBOL, 0);
p->g->iflags |= USEBOL;
p->g->nbol++;
}
- while (MORE() && !SEETWO(end1, end2)) {
- wasdollar = p_simp_re(p, first, reclimit);
- first = 0;
- }
- if (wasdollar) { /* oops, that was a trailing anchor */
+}
+
+static void
+p_bre_post_parse(struct parse *p, struct branchc *bc)
+{
+
+ /* Expression is terminating due to EOL token */
+ if (bc->terminate) {
DROP(1);
EMIT(OEOL, 0);
p->g->iflags |= USEEOL;
p->g->neol++;
}
+}
- REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
+/*
+ - p_re - Top level parser, concatenation and BRE anchoring
+ == static void p_re(struct parse *p, int end1, int end2);
+ * Giving end1 as OUT essentially eliminates the end1/end2 check.
+ *
+ * This implementation is a bit of a kludge, in that a trailing $ is first
+ * taken as an ordinary character and then revised to be an anchor.
+ * The amount of lookahead needed to avoid this kludge is excessive.
+ */
+static void
+p_re(struct parse *p,
+ int end1, /* first terminating character */
+ int end2) /* second terminating character; ignored for EREs */
+{
+ struct branchc bc;
+
+ bc.nbranch = 0;
+ if (end1 == OUT && end2 == OUT)
+ bc.outer = true;
+ else
+ bc.outer = false;
+#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
+ for (;;) {
+ bc.start = HERE();
+ bc.nchain = 0;
+ bc.terminate = false;
+ if (p->pre_parse != NULL)
+ p->pre_parse(p, &bc);
+ while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
+ bc.terminate = p->parse_expr(p, &bc);
+ ++bc.nchain;
+ }
+ if (p->post_parse != NULL)
+ p->post_parse(p, &bc);
+ (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
+#ifdef REGEX_GNU_EXTENSIONS
+ if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
+ break;
+#endif
+ if (!p->allowbranch)
+ break;
+ /*
+ * p_branch_do's return value indicates whether we should
+ * continue parsing or not. This is both for correctness and
+ * a slight optimization, because it will check if we've
+ * encountered an empty branch or the end of the string
+ * immediately following a branch delimiter.
+ */
+ if (!p_branch_do(p, &bc))
+ break;
+ }
+#undef SEE_END
+ if (p->allowbranch)
+ p_branch_fix_tail(p, &bc);
+ assert(!MORE() || SEE(end1));
}
/*
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
- == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit);
+ == static bool p_simp_re(struct parse *p, struct branchc *bc);
*/
-static int /* was the simple RE an unbackslashed $? */
-p_simp_re(
- struct parse *p,
- int starordinary, /* is a leading * an ordinary character? */
- size_t reclimit)
+static bool /* was the simple RE an unbackslashed $? */
+p_simp_re(struct parse *p, struct branchc *bc)
{
int c;
+ int cc; /* convenient/control character */
int count;
int count2;
- sopno pos, i;
+ sopno pos;
+ bool handled;
+ size_t i;
+ wint_t wc;
sopno subno;
# define BACKSL (1<<CHAR_BIT)
- _DIAGASSERT(p != NULL);
-
- pos = HERE(); /* repetion op, if any, covers from here */
+ pos = HERE(); /* repetition op, if any, covers from here */
+ handled = false;
assert(MORE()); /* caller should have ensured this */
c = GETNEXT();
if (c == '\\') {
- REQUIRE(MORE(), REG_EESCAPE);
- c = BACKSL | (unsigned char)GETNEXT();
- }
- switch (c) {
- case '.':
- if (p->g->cflags&REG_NEWLINE)
- nonnewline(p);
- else
- EMIT(OANY, 0);
- break;
- case '[':
- p_bracket(p);
- break;
- case BACKSL|'{':
- SETERROR(REG_BADRPT);
- break;
- case BACKSL|'(':
- p->g->nsub++;
- subno = p->g->nsub;
- if (subno < NPAREN)
- p->pbegin[subno] = HERE();
- EMIT(OLPAREN, subno);
- /* the MORE here is an error heuristic */
- if (MORE() && !SEETWO('\\', ')'))
- p_bre(p, '\\', ')', reclimit);
- if (subno < NPAREN) {
- p->pend[subno] = HERE();
- assert(p->pend[subno] != 0);
+ (void)REQUIRE(MORE(), REG_EESCAPE);
+ cc = GETNEXT();
+ c = BACKSL | cc;
+#ifdef REGEX_GNU_EXTENSIONS
+ if (p->gnuext) {
+ handled = true;
+ switch (c) {
+ case BACKSL|'`':
+ EMIT(OBOS, 0);
+ break;
+ case BACKSL|'\'':
+ EMIT(OEOS, 0);
+ break;
+ case BACKSL|'B':
+ EMIT(ONWBND, 0);
+ break;
+ case BACKSL|'b':
+ EMIT(OWBND, 0);
+ break;
+ case BACKSL|'W':
+ case BACKSL|'w':
+ case BACKSL|'S':
+ case BACKSL|'s':
+ p_b_pseudoclass(p, cc);
+ break;
+ case BACKSL|'a':
+ ordinary(p, '\a');
+ break;
+ case BACKSL|'e':
+ ordinary(p, '\e');
+ break;
+ case BACKSL|'f':
+ ordinary(p, '\f');
+ break;
+ case BACKSL|'n':
+ ordinary(p, '\n');
+ break;
+ case BACKSL|'r':
+ ordinary(p, '\r');
+ break;
+ case BACKSL|'t':
+ ordinary(p, '\t');
+ break;
+ case BACKSL|'v':
+ ordinary(p, '\v');
+ break;
+ default:
+ handled = false;
+ }
}
- EMIT(ORPAREN, subno);
- REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
- break;
- case BACKSL|')': /* should not get here -- must be user */
- case BACKSL|'}':
- SETERROR(REG_EPAREN);
- break;
- case BACKSL|'1':
- case BACKSL|'2':
- case BACKSL|'3':
- case BACKSL|'4':
- case BACKSL|'5':
- case BACKSL|'6':
- case BACKSL|'7':
- case BACKSL|'8':
- case BACKSL|'9':
- i = (c&~BACKSL) - '0';
- assert(i < NPAREN);
- if (p->pend[i] != 0) {
- assert(i <= p->g->nsub);
- EMIT(OBACK_, i);
- assert(p->pbegin[i] != 0);
- assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
- assert(OP(p->strip[p->pend[i]]) == ORPAREN);
- (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
- EMIT(O_BACK, i);
- } else
- SETERROR(REG_ESUBREG);
- p->g->backrefs = 1;
- break;
- case '*':
- REQUIRE(starordinary, REG_BADRPT);
- /* FALLTHROUGH */
- default:
- if (p->error != 0)
- return(0);
- ordinary(p, c &~ BACKSL);
- break;
+#endif
+ }
+ if (!handled) {
+ switch (c) {
+ case '.':
+ if (p->g->cflags&REG_NEWLINE)
+ nonnewline(p);
+ else
+ EMIT(OANY, 0);
+ break;
+ case '[':
+ p_bracket(p);
+ break;
+ case BACKSL|'<':
+ EMIT(OBOW, 0);
+ break;
+ case BACKSL|'>':
+ EMIT(OEOW, 0);
+ break;
+ case BACKSL|'{':
+ SETERROR(REG_BADRPT);
+ break;
+ case BACKSL|'(':
+ p->g->nsub++;
+ subno = (sopno)p->g->nsub;
+ if (subno < NPAREN)
+ p->pbegin[subno] = HERE();
+ EMIT(OLPAREN, subno);
+ /* the MORE here is an error heuristic */
+ if (MORE() && !SEETWO('\\', ')'))
+ p_re(p, '\\', ')');
+ if (subno < NPAREN) {
+ p->pend[subno] = HERE();
+ assert(p->pend[subno] != 0);
+ }
+ EMIT(ORPAREN, subno);
+ (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
+ break;
+ case BACKSL|')': /* should not get here -- must be user */
+ SETERROR(REG_EPAREN);
+ break;
+ case BACKSL|'1':
+ case BACKSL|'2':
+ case BACKSL|'3':
+ case BACKSL|'4':
+ case BACKSL|'5':
+ case BACKSL|'6':
+ case BACKSL|'7':
+ case BACKSL|'8':
+ case BACKSL|'9':
+ i = (c&~BACKSL) - '0';
+ assert(i < NPAREN);
+ if (p->pend[i] != 0) {
+ assert(i <= p->g->nsub);
+ EMIT(OBACK_, i);
+ assert(p->pbegin[i] != 0);
+ assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
+ assert(OP(p->strip[p->pend[i]]) == ORPAREN);
+ (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
+ EMIT(O_BACK, i);
+ } else
+ SETERROR(REG_ESUBREG);
+ p->g->backrefs = 1;
+ break;
+ case '*':
+ /*
+ * Ordinary if used as the first character beyond BOL anchor of
+ * a (sub-)expression, counts as a bad repetition operator if it
+ * appears otherwise.
+ */
+ (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
+ /* FALLTHROUGH */
+ default:
+ if (p->error != 0)
+ return (false); /* Definitely not $... */
+ p->next--;
+ wc = WGETNEXT();
+ if ((c & BACKSL) == 0 || may_escape(p, wc))
+ ordinary(p, wc);
+ else
+ SETERROR(REG_EESCAPE);
+ break;
+ }
}
if (EAT('*')) { /* implemented as +? */
@@ -706,27 +1043,35 @@
ASTERN(O_PLUS, pos);
INSERT(OQUEST_, pos);
ASTERN(O_QUEST, pos);
+#ifdef REGEX_GNU_EXTENSIONS
+ } else if (p->gnuext && EATTWO('\\', '?')) {
+ INSERT(OQUEST_, pos);
+ ASTERN(O_QUEST, pos);
+ } else if (p->gnuext && EATTWO('\\', '+')) {
+ INSERT(OPLUS_, pos);
+ ASTERN(O_PLUS, pos);
+#endif
} else if (EATTWO('\\', '{')) {
count = p_count(p);
if (EAT(',')) {
- if (MORE() && isdigit((unsigned char)PEEK())) {
+ if (MORE() && isdigit((uch)PEEK())) {
count2 = p_count(p);
- REQUIRE(count <= count2, REG_BADBR);
+ (void)REQUIRE(count <= count2, REG_BADBR);
} else /* single number with comma */
count2 = INFINITY;
} else /* just a single number */
count2 = count;
- repeat(p, pos, count, count2, 0);
+ repeat(p, pos, count, count2);
if (!EATTWO('\\', '}')) { /* error heuristics */
while (MORE() && !SEETWO('\\', '}'))
NEXT();
- REQUIRE(MORE(), REG_EBRACE);
+ (void)REQUIRE(MORE(), REG_EBRACE);
SETERROR(REG_BADBR);
}
- } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
- return(1);
+ } else if (c == '$') /* $ (but not \$) ends it */
+ return (true);
- return(0);
+ return (false);
}
/*
@@ -734,105 +1079,95 @@
== static int p_count(struct parse *p);
*/
static int /* the value */
-p_count(
- struct parse *p)
+p_count(struct parse *p)
{
int count = 0;
int ndigits = 0;
- _DIAGASSERT(p != NULL);
-
- while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) {
+ while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
count = count*10 + (GETNEXT() - '0');
ndigits++;
}
- REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
+ (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
return(count);
}
/*
- p_bracket - parse a bracketed character list
== static void p_bracket(struct parse *p);
- *
- * Note a significant property of this code: if the allocset() did SETERROR,
- * no set operations are done.
*/
static void
-p_bracket(
- struct parse *p)
+p_bracket(struct parse *p)
{
cset *cs;
- int invert = 0;
- _DIAGASSERT(p != NULL);
-
- cs = allocset(p);
- if (cs == NULL)
- return;
+ wint_t ch;
/* Dept of Truly Sickening Special-Case Kludges */
- if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]",
- (size_t)6) == 0) {
+ if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
EMIT(OBOW, 0);
NEXTn(6);
return;
}
- if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]",
- (size_t)6) == 0) {
+ if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
EMIT(OEOW, 0);
NEXTn(6);
return;
}
+ if ((cs = allocset(p)) == NULL)
+ return;
+
+ if (p->g->cflags&REG_ICASE)
+ cs->icase = 1;
if (EAT('^'))
- invert++; /* make note to invert set at end */
+ cs->invert = 1;
if (EAT(']'))
- CHadd(cs, ']');
+ CHadd(p, cs, ']');
else if (EAT('-'))
- CHadd(cs, '-');
+ CHadd(p, cs, '-');
while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
p_b_term(p, cs);
if (EAT('-'))
- CHadd(cs, '-');
- MUSTEAT(']', REG_EBRACK);
+ CHadd(p, cs, '-');
+ (void)MUSTEAT(']', REG_EBRACK);
if (p->error != 0) /* don't mess things up further */
return;
- if (p->g->cflags&REG_ICASE) {
- ssize_t i;
- int ci;
+ if (cs->invert && p->g->cflags&REG_NEWLINE)
+ cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
- for (i = p->g->csetsize - 1; i >= 0; i--)
- if (CHIN(cs, i) && isalpha(i)) {
- ci = othercase((int)i);
- if (ci != i)
- CHadd(cs, ci);
- }
- if (cs->multis != NULL)
- mccase(p, cs);
- }
- if (invert) {
- ssize_t i;
-
- for (i = p->g->csetsize - 1; i >= 0; i--)
- if (CHIN(cs, i))
- CHsub(cs, (int)i);
- else
- CHadd(cs, (int)i);
- if (p->g->cflags&REG_NEWLINE)
- CHsub(cs, '\n');
- if (cs->multis != NULL)
- mcinvert(p, cs);
- }
-
- assert(cs->multis == NULL); /* xxx */
-
- if (nch(p, cs) == 1) { /* optimize singleton sets */
- ordinary(p, firstch(p, cs));
+ if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
+ ordinary(p, ch);
freeset(p, cs);
} else
- EMIT(OANYOF, freezeset(p, cs));
+ EMIT(OANYOF, (size_t)(cs - p->g->sets));
+}
+
+static int
+p_range_cmp(wchar_t c1, wchar_t c2)
+{
+#ifdef REGEX_LIBC_COLLATE
+ return __wcollate_range_cmp(c1, c2);
+#elif defined(NLS)
+ /* Copied from libc/collate __wcollate_range_cmp */
+ wchar_t s1[2], s2[2];
+
+ s1[0] = c1;
+ s1[1] = L'\0';
+ s2[0] = c2;
+ s2[1] = L'\0';
+ return wcscoll(s1, s2);
+#else
+ char s1[2], s2[2];
+
+ s1[0] = (char)c1;
+ s1[1] = '\0';
+ s2[0] = (char)c2;
+ s2[1] = '\0';
+ return strcoll(s1, s2);
+#endif
}
/*
@@ -840,13 +1175,15 @@
== static void p_b_term(struct parse *p, cset *cs);
*/
static void
-p_b_term(
- struct parse *p,
- cset *cs)
+p_b_term(struct parse *p, cset *cs)
{
char c;
- char start, finish;
- int i;
+ wint_t start, finish;
+ wint_t i;
+#ifdef REGEX_LIBC_COLLATE
+ struct xlocale_collate *table =
+ (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
+#endif
_DIAGASSERT(p != NULL);
_DIAGASSERT(cs != NULL);
@@ -856,11 +1193,9 @@
case '[':
c = (MORE2()) ? PEEK2() : '\0';
break;
-
case '-':
SETERROR(REG_ERANGE);
return; /* NOTE RETURN */
-
default:
c = '\0';
break;
@@ -869,24 +1204,23 @@
switch (c) {
case ':': /* character class */
NEXT2();
- REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(MORE(), REG_EBRACK);
c = PEEK();
- REQUIRE(c != '-' && c != ']', REG_ECTYPE);
+ (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
p_b_cclass(p, cs);
- REQUIRE(MORE(), REG_EBRACK);
- REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
+ (void)REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
break;
case '=': /* equivalence class */
NEXT2();
- REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(MORE(), REG_EBRACK);
c = PEEK();
- REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
+ (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
p_b_eclass(p, cs);
- REQUIRE(MORE(), REG_EBRACK);
- REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
+ (void)REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
break;
default: /* symbol, ordinary character, or range */
-/* xxx revision needed for multichar stuff */
start = p_b_symbol(p);
if (SEE('-') && MORE2() && PEEK2() != ']') {
/* range */
@@ -897,51 +1231,103 @@
finish = p_b_symbol(p);
} else
finish = start;
-/* xxx what about signed chars here... */
- REQUIRE(start <= finish, REG_ERANGE);
- for (i = start; i <= finish; i++)
- CHadd(cs, i);
+ if (start == finish)
+ CHadd(p, cs, start);
+ else {
+#ifdef REGEX_LIBC_COLLATE
+ if (table->__collate_load_error || MB_CUR_MAX > 1) {
+#else
+ if (MB_CUR_MAX > 1) {
+#endif
+ (void)REQUIRE(start <= finish, REG_ERANGE);
+ CHaddrange(p, cs, start, finish);
+ } else {
+ (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
+ for (i = 0; i <= UCHAR_MAX; i++) {
+ if (p_range_cmp(start, i) <= 0 &&
+ p_range_cmp(i, finish) <= 0 )
+ CHadd(p, cs, i);
+ }
+ }
+ }
break;
}
}
+#ifdef REGEX_GNU_EXTENSIONS
+/*
+ - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
+ == static int p_b_pseudoclass(struct parse *p, char c)
+ */
+static int
+p_b_pseudoclass(struct parse *p, char c) {
+ cset *cs;
+
+ if ((cs = allocset(p)) == NULL)
+ return(0);
+
+ if (p->g->cflags&REG_ICASE)
+ cs->icase = 1;
+
+ switch (c) {
+ case 'W':
+ cs->invert = 1;
+ /* FALLTHROUGH */
+ case 'w':
+ p_b_cclass_named(p, cs, "alnum");
+ break;
+ case 'S':
+ cs->invert = 1;
+ /* FALLTHROUGH */
+ case 's':
+ p_b_cclass_named(p, cs, "space");
+ break;
+ default:
+ return(0);
+ }
+
+ EMIT(OANYOF, (size_t)(cs - p->g->sets));
+ return(1);
+}
+#endif
+
/*
- p_b_cclass - parse a character-class name and deal with it
== static void p_b_cclass(struct parse *p, cset *cs);
*/
static void
-p_b_cclass(
- struct parse *p,
- cset *cs)
+p_b_cclass(struct parse *p, cset *cs)
{
- const char *sp;
- const struct cclass *cp;
+ const char *sp = p->next;
size_t len;
- const char *u;
- char c;
+ char clname[16];
- _DIAGASSERT(p != NULL);
- _DIAGASSERT(cs != NULL);
-
- sp = p->next;
-
- while (MORE() && isalpha((unsigned char)PEEK()))
+ while (MORE() && isalpha((uch)PEEK()))
NEXT();
len = p->next - sp;
- for (cp = cclasses; cp->name != NULL; cp++)
- if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
- break;
- if (cp->name == NULL) {
- /* oops, didn't find it */
+ if (len >= sizeof(clname) - 1) {
SETERROR(REG_ECTYPE);
return;
}
+ memcpy(clname, sp, len);
+ clname[len] = '\0';
- u = cp->chars;
- while ((c = *u++) != '\0')
- CHadd(cs, c);
- for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
- MCadd(p, cs, u);
+ p_b_cclass_named(p, cs, clname);
+}
+
+/*
+ - p_b_cclass_named - deal with a named character class
+ == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
+ */
+static void
+p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
+ wctype_t wct;
+
+ if ((wct = wctype(clname)) == 0) {
+ SETERROR(REG_ECTYPE);
+ return;
+ }
+ CHaddtype(p, cs, wct);
}
/*
@@ -951,58 +1337,52 @@
* This implementation is incomplete. xxx
*/
static void
-p_b_eclass(
- struct parse *p,
- cset *cs)
+p_b_eclass(struct parse *p, cset *cs)
{
- char c;
+ wint_t c;
_DIAGASSERT(p != NULL);
_DIAGASSERT(cs != NULL);
c = p_b_coll_elem(p, '=');
- CHadd(cs, c);
+ CHadd(p, cs, c);
}
/*
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
- == static char p_b_symbol(struct parse *p);
+ == static wint_t p_b_symbol(struct parse *p);
*/
-static char /* value of symbol */
-p_b_symbol(
- struct parse *p)
+static wint_t /* value of symbol */
+p_b_symbol(struct parse *p)
{
- char value;
+ wint_t value;
_DIAGASSERT(p != NULL);
- REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(MORE(), REG_EBRACK);
if (!EATTWO('[', '.'))
- return(GETNEXT());
+ return(WGETNEXT());
/* collating symbol */
value = p_b_coll_elem(p, '.');
- REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
+ (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
return(value);
}
/*
- p_b_coll_elem - parse a collating-element name and look it up
- == static char p_b_coll_elem(struct parse *p, int endc);
+ == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
*/
-static char /* value of collating element */
-p_b_coll_elem(
- struct parse *p,
- int endc) /* name ended by endc,']' */
+static wint_t /* value of collating element */
+p_b_coll_elem(struct parse *p,
+ wint_t endc) /* name ended by endc,']' */
{
- const char *sp;
- const struct cname *cp;
+ const char *sp = p->next;
+ struct cname *cp;
size_t len;
_DIAGASSERT(p != NULL);
- sp = p->next;
-
while (MORE() && !SEETWO(endc, ']'))
NEXT();
if (!MORE()) {
@@ -1013,85 +1393,152 @@
for (cp = cnames; cp->name != NULL; cp++)
if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
return(cp->code); /* known name */
- if (len == 1)
- return(*sp); /* single character */
- SETERROR(REG_ECOLLATE); /* neither */
+#ifdef NLS
+ mbstate_t mbs;
+ wchar_t wc;
+ size_t clen;
+
+ memset(&mbs, 0, sizeof(mbs));
+ if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
+ return (wc); /* single character */
+ else if (clen == (size_t)-1 || clen == (size_t)-2)
+ SETERROR(REG_ILLSEQ);
+ else
+ SETERROR(REG_ECOLLATE); /* neither */
return(0);
+#else
+ if (len == 1)
+ return *sp; /* single character */
+ SETERROR(REG_ECOLLATE); /* neither */
+ return 0;
+#endif
+}
+
+/*
+ - may_escape - determine whether 'ch' is escape-able in the current context
+ == static int may_escape(struct parse *p, const wint_t ch)
+ */
+static bool
+may_escape(struct parse *p, const wint_t ch)
+{
+
+ if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
+ return (true);
+ if (isalpha(ch) || ch == '\'' || ch == '`')
+ return (false);
+ return (true);
+#ifdef NOTYET
+ /*
+ * Build a whitelist of characters that may be escaped to produce an
+ * ordinary in the current context. This assumes that these have not
+ * been otherwise interpreted as a special character. Escaping an
+ * ordinary character yields undefined results according to
+ * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
+ * advantage of this and use escaped ordinary characters to provide
+ * special meaning, e.g. \b, \B, \w, \W, \s, \S.
+ */
+ switch(ch) {
+ case '|':
+ case '+':
+ case '?':
+ /* The above characters may not be escaped in BREs */
+ if (!(p->g->cflags&REG_EXTENDED))
+ return (false);
+ /* Fallthrough */
+ case '(':
+ case ')':
+ case '{':
+ case '}':
+ case '.':
+ case '[':
+ case ']':
+ case '\\':
+ case '*':
+ case '^':
+ case '$':
+ return (true);
+ default:
+ return (false);
+ }
+#endif
}
/*
- othercase - return the case counterpart of an alphabetic
- == static int othercase(int ch);
+ == static wint_t othercase(wint_t ch);
*/
-static int /* if no counterpart, return ch */
-othercase(
- int ch)
+static wint_t /* if no counterpart, return ch */
+othercase(wint_t ch)
{
- assert(isalpha(ch));
- if (isupper(ch))
- return(tolower(ch));
- else if (islower(ch))
- return(toupper(ch));
+ assert(iswalpha(ch));
+ if (iswupper(ch))
+ return(towlower(ch));
+ else if (iswlower(ch))
+ return(towupper(ch));
else /* peculiar, but could happen */
return(ch);
}
/*
- bothcases - emit a dualcase version of a two-case character
- == static void bothcases(struct parse *p, int ch);
+ == static void bothcases(struct parse *p, wint_t ch);
*
* Boy, is this implementation ever a kludge...
*/
static void
-bothcases(
- struct parse *p,
- int ch)
+bothcases(struct parse *p, wint_t ch)
{
- const char *oldnext;
- const char *oldend;
- char bracket[3];
+ const char *oldnext = p->next;
+ const char *oldend = p->end;
+ char bracket[3 + MB_LEN_MAX];
+ size_t n;
_DIAGASSERT(p != NULL);
- oldnext = p->next;
- oldend = p->end;
-
assert(othercase(ch) != ch); /* p_bracket() would recurse */
p->next = bracket;
- p->end = bracket+2;
- bracket[0] = ch;
- bracket[1] = ']';
- bracket[2] = '\0';
+#ifdef NLS
+ mbstate_t mbs;
+ memset(&mbs, 0, sizeof(mbs));
+ n = wcrtomb(bracket, ch, &mbs);
+ assert(n != (size_t)-1);
+#else
+ n = 0;
+ bracket[n++] = ch;
+#endif
+ bracket[n] = ']';
+ bracket[n + 1] = '\0';
+ p->end = bracket+n+1;
p_bracket(p);
- assert(p->next == bracket+2);
+ assert(p->next == p->end);
p->next = oldnext;
p->end = oldend;
}
/*
- ordinary - emit an ordinary character
- == static void ordinary(struct parse *p, int ch);
+ == static void ordinary(struct parse *p, wint_t ch);
*/
static void
-ordinary(
- struct parse *p,
- int ch)
+ordinary(struct parse *p, wint_t ch)
{
- cat_t *cap;
- unsigned char uc = (unsigned char)ch;
+ cset *cs;
_DIAGASSERT(p != NULL);
- cap = p->g->categories;
- if ((p->g->cflags & REG_ICASE) && isalpha(uc) && othercase(uc) != uc)
- bothcases(p, uc);
+ if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
+ bothcases(p, ch);
+ else if ((wint_t)(ch & OPDMASK) == ch)
+ EMIT(OCHAR, (size_t)ch);
else {
- EMIT(OCHAR, (sopno)uc);
- if (cap[uc] == 0) {
- _DIAGASSERT(__type_fit(unsigned char,
- p->g->ncategories + 1));
- cap[uc] = (unsigned char)p->g->ncategories++;
- }
+ /*
+ * Kludge: character is too big to fit into an OCHAR operand.
+ * Emit a singleton set.
+ */
+ if ((cs = allocset(p)) == NULL)
+ return;
+ CHadd(p, cs, ch);
+ EMIT(OANYOF, (size_t)(cs - p->g->sets));
}
}
@@ -1102,18 +1549,14 @@
* Boy, is this implementation ever a kludge...
*/
static void
-nonnewline(
- struct parse *p)
+nonnewline(struct parse *p)
{
- const char *oldnext;
- const char *oldend;
+ const char *oldnext = p->next;
+ const char *oldend = p->end;
char bracket[4];
_DIAGASSERT(p != NULL);
- oldnext = p->next;
- oldend = p->end;
-
p->next = bracket;
p->end = bracket+3;
bracket[0] = '^';
@@ -1128,18 +1571,15 @@
/*
- repeat - generate code for a bounded repetition, recursively if needed
- == static void repeat(struct parse *p, sopno start, int from, int to,
- == size_t reclimit);
+ == static void repeat(struct parse *p, sopno start, int from, int to);
*/
static void
-repeat(
- struct parse *p,
- sopno start, /* operand from here to end of strip */
- int from, /* repeated from this number */
- int to, /* to this number of times (maybe INFINITY) */
- size_t reclimit)
+repeat(struct parse *p,
+ sopno start, /* operand from here to end of strip */
+ int from, /* repeated from this number */
+ int to) /* to this number of times (maybe INFINITY) */
{
- sopno finish;
+ sopno finish = HERE();
# define N 2
# define INF 3
# define REP(f, t) ((f)*8 + (t))
@@ -1148,13 +1588,9 @@
_DIAGASSERT(p != NULL);
- if (reclimit++ > RECLIMIT)
- p->error = REG_ESPACE;
- if (p->error)
+ if (p->error != 0) /* head off possible runaway recursion */
return;
- finish = HERE();
-
assert(from <= to);
switch (REP(MAP(from), MAP(to))) {
@@ -1166,7 +1602,7 @@
case REP(0, INF): /* as x{1,}? */
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
INSERT(OCH_, start); /* offset is wrong... */
- repeat(p, start+1, 1, to, reclimit);
+ repeat(p, start+1, 1, to);
ASTERN(OOR1, start);
AHEAD(start); /* ... fix it */
EMIT(OOR2, 0);
@@ -1186,7 +1622,7 @@
ASTERN(O_CH, THERETHERE());
copy = dupl(p, start+1, finish+1);
assert(copy == finish+4);
- repeat(p, copy, 1, to-1, reclimit);
+ repeat(p, copy, 1, to-1);
break;
case REP(1, INF): /* as x+ */
INSERT(OPLUS_, start);
@@ -1194,11 +1630,11 @@
break;
case REP(N, N): /* as xx{m-1,n-1} */
copy = dupl(p, start, finish);
- repeat(p, copy, from-1, to-1, reclimit);
+ repeat(p, copy, from-1, to-1);
break;
case REP(N, INF): /* as xx{n-1,INF} */
copy = dupl(p, start, finish);
- repeat(p, copy, from-1, to, reclimit);
+ repeat(p, copy, from-1, to);
break;
default: /* "can't happen" */
SETERROR(REG_ASSERT); /* just in case */
@@ -1207,13 +1643,39 @@
}
/*
+ - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
+ - character from the parse struct, signals a REG_ILLSEQ error if the
+ - character can't be converted. Returns the character read (0 on error).
+ */
+static wint_t
+wgetnext(struct parse *p)
+{
+#ifdef NLS
+ mbstate_t mbs;
+ wchar_t wc;
+ size_t n;
+
+ memset(&mbs, 0, sizeof(mbs));
+ n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
+ if (n == (size_t)-1 || n == (size_t)-2) {
+ SETERROR(REG_ILLSEQ);
+ return (0);
+ }
+ if (n == 0)
+ n = 1;
+ p->next += n;
+ return wc;
+#else
+ return *p->next++;
+#endif
+}
+
+/*
- seterr - set an error condition
== static int seterr(struct parse *p, int e);
*/
static int /* useless but makes type checking happy */
-seterr(
- struct parse *p,
- int e)
+seterr(struct parse *p, int e)
{
_DIAGASSERT(p != NULL);
@@ -1230,55 +1692,22 @@
== static cset *allocset(struct parse *p);
*/
static cset *
-allocset(
- struct parse *p)
+allocset(struct parse *p)
{
- size_t no;
- size_t nc;
- size_t nbytes;
- cset *cs;
- size_t css;
- size_t i;
- void *old_ptr;
+ cset *cs, *ncs;
_DIAGASSERT(p != NULL);
- no = p->g->ncsets++;
- css = (size_t)p->g->csetsize;
- if (no >= p->ncsalloc) { /* need another column of space */
- p->ncsalloc += CHAR_BIT;
- nc = p->ncsalloc;
- assert(nc % CHAR_BIT == 0);
- nbytes = nc / CHAR_BIT * css;
- if (MEMSIZE(p) > MEMLIMIT)
- goto oomem;
- if (reallocarr(&p->g->sets, nc, sizeof(cset)))
- goto oomem;
- old_ptr = p->g->setbits;
- if (reallocarr(&p->g->setbits, nc / CHAR_BIT, css)) {
- free(old_ptr);
- goto oomem;
- }
- if (old_ptr != p->g->setbits) {
- for (i = 0; i < no; i++)
- p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
- }
- (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
+ ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
+ if (ncs == NULL) {
+ SETERROR(REG_ESPACE);
+ return (NULL);
}
-
- cs = &p->g->sets[no];
- cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
- cs->mask = 1 << (unsigned int)((no) % CHAR_BIT);
- cs->hash = 0;
- cs->smultis = 0;
- cs->multis = NULL;
+ p->g->sets = ncs;
+ cs = &p->g->sets[p->g->ncsets++];
+ memset(cs, 0, sizeof(*cs));
return(cs);
-
-oomem:
- SETERROR(REG_ESPACE);
- /* caller's responsibility not to do set ops */
- return NULL;
}
/*
@@ -1286,353 +1715,128 @@
== static void freeset(struct parse *p, cset *cs);
*/
static void
-freeset(
- struct parse *p,
- cset *cs)
+freeset(struct parse *p, cset *cs)
{
- size_t i;
cset *top;
- size_t css;
_DIAGASSERT(p != NULL);
_DIAGASSERT(cs != NULL);
top = &p->g->sets[p->g->ncsets];
- css = (size_t)p->g->csetsize;
- for (i = 0; i < css; i++)
- CHsub(cs, (int)i);
+ free(cs->wides);
+ free(cs->ranges);
+ free(cs->types);
+ memset(cs, 0, sizeof(*cs));
if (cs == top-1) /* recover only the easy case */
p->g->ncsets--;
}
/*
- - freezeset - final processing on a set of characters
- == static int freezeset(struct parse *p, cset *cs);
- *
- * The main task here is merging identical sets. This is usually a waste
- * of time (although the hash code minimizes the overhead), but can win
- * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
- * is done using addition rather than xor -- all ASCII [aA] sets xor to
- * the same value!
+ - singleton - Determine whether a set contains only one character,
+ - returning it if so, otherwise returning OUT.
*/
-static sopno /* set number */
-freezeset(
- struct parse *p,
- cset *cs)
+static wint_t
+singleton(cset *cs)
{
- uch h;
- size_t i;
- cset *top;
- cset *cs2;
- size_t css;
+ wint_t i, s, n;
- _DIAGASSERT(p != NULL);
- _DIAGASSERT(cs != NULL);
-
- h = cs->hash;
- top = &p->g->sets[p->g->ncsets];
- css = (size_t)p->g->csetsize;
-
- /* look for an earlier one which is the same */
- for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
- if (cs2->hash == h && cs2 != cs) {
- /* maybe */
- for (i = 0; i < css; i++)
- if (!!CHIN(cs2, i) != !!CHIN(cs, i))
- break; /* no */
- if (i == css)
- break; /* yes */
- }
-
- if (cs2 < top) { /* found one */
- freeset(p, cs);
- cs = cs2;
- }
-
- return (sopno)(cs - p->g->sets);
-}
-
-/*
- - firstch - return first character in a set (which must have at least one)
- == static int firstch(struct parse *p, cset *cs);
- */
-static int /* character; there is no "none" value */
-firstch(
- struct parse *p,
- cset *cs)
-{
- size_t i;
- size_t css;
-
- _DIAGASSERT(p != NULL);
- _DIAGASSERT(cs != NULL);
-
- css = (size_t)p->g->csetsize;
-
- for (i = 0; i < css; i++)
- if (CHIN(cs, i))
- return((char)i);
- assert(never);
- return(0); /* arbitrary */
-}
-
-/*
- - nch - number of characters in a set
- == static int nch(struct parse *p, cset *cs);
- */
-static int
-nch(
- struct parse *p,
- cset *cs)
-{
- size_t i;
- size_t css;
- int n = 0;
-
- _DIAGASSERT(p != NULL);
- _DIAGASSERT(cs != NULL);
-
- css = (size_t)p->g->csetsize;
-
- for (i = 0; i < css; i++)
- if (CHIN(cs, i))
+ for (i = n = 0; i < NC; i++)
+ if (CHIN(cs, i)) {
n++;
- return(n);
+ s = i;
+ }
+ if (n == 1)
+ return (s);
+ if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
+ cs->icase == 0)
+ return (cs->wides[0]);
+ /* Don't bother handling the other cases. */
+ return (OUT);
}
/*
- - mcadd - add a collating element to a cset
- == static void mcadd(struct parse *p, cset *cs, \
- == char *cp);
+ - CHadd - add character to character set.
*/
static void
-mcadd(
- struct parse *p,
- cset *cs,
- const char *cp)
+CHadd(struct parse *p, cset *cs, wint_t ch)
{
- size_t oldend;
+ wint_t nch, *newwides;
_DIAGASSERT(p != NULL);
_DIAGASSERT(cs != NULL);
- _DIAGASSERT(cp != NULL);
- oldend = cs->smultis;
+ assert(ch >= 0);
+ if (ch < NC)
+ cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
+ else {
+ newwides = reallocarray(cs->wides, cs->nwides + 1,
+ sizeof(*cs->wides));
+ if (newwides == NULL) {
+ SETERROR(REG_ESPACE);
+ return;
+ }
+ cs->wides = newwides;
+ cs->wides[cs->nwides++] = ch;
+ }
+ if (cs->icase) {
+ if ((nch = towlower(ch)) < NC)
+ cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
+ if ((nch = towupper(ch)) < NC)
+ cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
+ }
+}
- cs->smultis += strlen(cp) + 1;
- if (cs->multis == NULL)
- cs->multis = malloc(cs->smultis);
- else
- cs->multis = realloc(cs->multis, cs->smultis);
- if (cs->multis == NULL) {
+/*
+ - CHaddrange - add all characters in the range [min,max] to a character set.
+ */
+static void
+CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
+{
+ crange *newranges;
+
+ _DIAGASSERT(p != NULL);
+ _DIAGASSERT(cs != NULL);
+
+ for (; min < NC && min <= max; min++)
+ CHadd(p, cs, min);
+ if (min >= max)
+ return;
+ newranges = reallocarray(cs->ranges, cs->nranges + 1,
+ sizeof(*cs->ranges));
+ if (newranges == NULL) {
SETERROR(REG_ESPACE);
return;
}
-
- (void) strcpy(cs->multis + oldend - 1, cp);
- cs->multis[cs->smultis - 1] = '\0';
+ cs->ranges = newranges;
+ cs->ranges[cs->nranges].min = min;
+ cs->ranges[cs->nranges].max = max;
+ cs->nranges++;
}
-#if 0
/*
- - mcsub - subtract a collating element from a cset
- == static void mcsub(cset *cs, char *cp);
+ - CHaddtype - add all characters of a certain type to a character set.
*/
static void
-mcsub(
- cset *cs,
- char *cp)
+CHaddtype(struct parse *p, cset *cs, wctype_t wct)
{
- char *fp;
- size_t len;
+ wint_t i;
+ wctype_t *newtypes;
+ _DIAGASSERT(p != NULL);
_DIAGASSERT(cs != NULL);
- _DIAGASSERT(cp != NULL);
- fp = mcfind(cs, cp);
- len = strlen(fp);
-
- assert(fp != NULL);
- (void) memmove(fp, fp + len + 1,
- cs->smultis - (fp + len + 1 - cs->multis));
- cs->smultis -= len;
-
- if (cs->smultis == 0) {
- free(cs->multis);
- cs->multis = NULL;
+ for (i = 0; i < NC; i++)
+ if (iswctype(i, wct))
+ CHadd(p, cs, i);
+ newtypes = reallocarray(cs->types, cs->ntypes + 1,
+ sizeof(*cs->types));
+ if (newtypes == NULL) {
+ SETERROR(REG_ESPACE);
return;
}
-
- cs->multis = realloc(cs->multis, cs->smultis);
- assert(cs->multis != NULL);
-}
-
-/*
- - mcin - is a collating element in a cset?
- == static int mcin(cset *cs, char *cp);
- */
-static int
-mcin(
- cset *cs,
- char *cp)
-{
-
- _DIAGASSERT(cs != NULL);
- _DIAGASSERT(cp != NULL);
-
- return(mcfind(cs, cp) != NULL);
-}
-
-/*
- - mcfind - find a collating element in a cset
- == static char *mcfind(cset *cs, char *cp);
- */
-static char *
-mcfind(
- cset *cs,
- char *cp)
-{
- char *p;
-
- _DIAGASSERT(cs != NULL);
- _DIAGASSERT(cp != NULL);
-
- if (cs->multis == NULL)
- return(NULL);
- for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
- if (strcmp(cp, p) == 0)
- return(p);
- return(NULL);
-}
-#endif
-
-/*
- - mcinvert - invert the list of collating elements in a cset
- == static void mcinvert(struct parse *p, cset *cs);
- *
- * This would have to know the set of possibilities. Implementation
- * is deferred.
- */
-/* ARGSUSED */
-static void
-mcinvert(
- struct parse *p,
- cset *cs)
-{
-
- _DIAGASSERT(p != NULL);
- _DIAGASSERT(cs != NULL);
-
- assert(cs->multis == NULL); /* xxx */
-}
-
-/*
- - mccase - add case counterparts of the list of collating elements in a cset
- == static void mccase(struct parse *p, cset *cs);
- *
- * This would have to know the set of possibilities. Implementation
- * is deferred.
- */
-/* ARGSUSED */
-static void
-mccase(
- struct parse *p,
- cset *cs)
-{
-
- _DIAGASSERT(p != NULL);
- _DIAGASSERT(cs != NULL);
-
- assert(cs->multis == NULL); /* xxx */
-}
-
-/*
- - isinsets - is this character in any sets?
- == static int isinsets(struct re_guts *g, int c);
- */
-static int /* predicate */
-isinsets(
- struct re_guts *g,
- int c)
-{
- uch *col;
- size_t i;
- size_t ncols;
- unsigned uc = (unsigned char)c;
-
- _DIAGASSERT(g != NULL);
-
- if (g->setbits == NULL)
- return 0;
-
- ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
-
- for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
- if (col[uc] != 0)
- return(1);
- return(0);
-}
-
-/*
- - samesets - are these two characters in exactly the same sets?
- == static int samesets(struct re_guts *g, int c1, int c2);
- */
-static int /* predicate */
-samesets(
- struct re_guts *g,
- int c1,
- int c2)
-{
- uch *col;
- size_t i;
- size_t ncols;
- unsigned uc1 = (unsigned char)c1;
- unsigned uc2 = (unsigned char)c2;
-
- _DIAGASSERT(g != NULL);
-
- ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
-
- for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
- if (col[uc1] != col[uc2])
- return(0);
- return(1);
-}
-
-/*
- - categorize - sort out character categories
- == static void categorize(struct parse *p, struct re_guts *g);
- */
-static void
-categorize(
- struct parse *p,
- struct re_guts *g)
-{
- cat_t *cats;
- int c;
- int c2;
- cat_t cat;
-
- _DIAGASSERT(p != NULL);
- _DIAGASSERT(g != NULL);
-
- cats = g->categories;
-
- /* avoid making error situations worse */
- if (p->error != 0)
- return;
-
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (cats[c] == 0 && isinsets(g, c)) {
- _DIAGASSERT(__type_fit(unsigned char,
- g->ncategories + 1));
- cat = g->ncategories++;
- cats[c] = cat;
- for (c2 = c+1; c2 <= CHAR_MAX; c2++)
- if (cats[c2] == 0 && samesets(g, c, c2))
- cats[c2] = cat;
- }
+ cs->types = newtypes;
+ cs->types[cs->ntypes++] = wct;
}
/*
@@ -1640,25 +1844,22 @@
== static sopno dupl(struct parse *p, sopno start, sopno finish);
*/
static sopno /* start of duplicate */
-dupl(
- struct parse *p,
- sopno start, /* from here */
- sopno finish) /* to this less one */
+dupl(struct parse *p,
+ sopno start, /* from here */
+ sopno finish) /* to this less one */
{
- sopno ret;
+ sopno ret = HERE();
sopno len = finish - start;
_DIAGASSERT(p != NULL);
- ret = HERE();
-
assert(finish >= start);
if (len == 0)
return(ret);
- if (!enlarge(p, p->ssize + len))/* this many unexpected additions */
- return ret;
- (void)memcpy(p->strip + p->slen, p->strip + start,
- (size_t)len * sizeof(sop));
+ if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
+ return(ret);
+ (void) memcpy(p->strip + p->slen,
+ p->strip + start, len * sizeof(*p->strip));
p->slen += len;
return(ret);
}
@@ -1672,17 +1873,14 @@
* some changes to the data structures. Maybe later.
*/
static void
-doemit(
- struct parse *p,
- sop op,
- sopno opnd)
+doemit(struct parse *p, sop op, size_t opnd)
{
- _DIAGASSERT(p != NULL);
-
/* avoid making error situations worse */
if (p->error != 0)
return;
+ _DIAGASSERT(p != NULL);
+
/* deal with oversize operands ("can't happen", more or less) */
assert(opnd < 1<<OPSHIFT);
@@ -1692,7 +1890,7 @@
return;
/* finally, it's all reduced to the easy case */
- p->strip[p->slen++] = (sop)SOP(op, opnd);
+ p->strip[p->slen++] = (sopno)SOP(op, opnd);
}
/*
@@ -1700,11 +1898,7 @@
== static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
*/
static void
-doinsert(
- struct parse *p,
- sop op,
- sopno opnd,
- sopno pos)
+doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
{
sopno sn;
sop s;
@@ -1732,7 +1926,8 @@
}
}
- memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop));
+ memmove(&p->strip[pos+1], &p->strip[pos],
+ (HERE()-pos-1)*sizeof(*p->strip));
p->strip[pos] = s;
}
@@ -1741,10 +1936,7 @@
== static void dofwd(struct parse *p, sopno pos, sop value);
*/
static void
-dofwd(
- struct parse *p,
- sopno pos,
- sopno value)
+dofwd(struct parse *p, sopno pos, sop value)
{
_DIAGASSERT(p != NULL);
@@ -1754,25 +1946,29 @@
return;
assert(value < 1<<OPSHIFT);
- p->strip[pos] = (sop)(OP(p->strip[pos]) | value);
+ p->strip[pos] = OP(p->strip[pos]) | value;
}
/*
- enlarge - enlarge the strip
- == static void enlarge(struct parse *p, sopno size);
+ == static int enlarge(struct parse *p, sopno size);
*/
static int
enlarge(struct parse *p, sopno size)
{
+ sop *sp;
+
_DIAGASSERT(p != NULL);
if (p->ssize >= size)
return 1;
- if (MEMSIZE(p) > MEMLIMIT || reallocarr(&p->strip, size, sizeof(sop))) {
+ sp = reallocarray(p->strip, size, sizeof(*p->strip));
+ if (sp == NULL) {
SETERROR(REG_ESPACE);
return 0;
}
+ p->strip = sp;
p->ssize = size;
return 1;
}
@@ -1782,18 +1978,18 @@
== static void stripsnug(struct parse *p, struct re_guts *g);
*/
static void
-stripsnug(
- struct parse *p,
- struct re_guts *g)
+stripsnug(struct parse *p, struct re_guts *g)
{
_DIAGASSERT(p != NULL);
_DIAGASSERT(g != NULL);
g->nstates = p->slen;
- g->strip = p->strip;
- reallocarr(&g->strip, p->slen, sizeof(sop));
- /* Ignore error as tries to free memory only. */
+ g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
+ if (g->strip == NULL) {
+ SETERROR(REG_ESPACE);
+ g->strip = p->strip;
+ }
}
/*
@@ -1807,9 +2003,7 @@
* Note that must and mlen got initialized during setup.
*/
static void
-findmust(
- struct parse *p,
- struct re_guts *g)
+findmust(struct parse *p, struct re_guts *g)
{
sop *scan;
sop *start = NULL;
@@ -1817,7 +2011,8 @@
sopno newlen;
sop s;
char *cp;
- sopno i;
+ int offset;
+ mbstate_t mbs;
_DIAGASSERT(p != NULL);
_DIAGASSERT(g != NULL);
@@ -1826,16 +2021,39 @@
if (p->error != 0)
return;
+#ifdef notyet
+ /*
+ * It's not generally safe to do a ``char'' substring search on
+ * multibyte character strings, but it's safe for at least
+ * UTF-8 (see RFC 3629).
+ */
+ if (MB_CUR_MAX > 1 &&
+ strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
+ return;
+#endif
+
/* find the longest OCHAR sequence in strip */
newlen = 0;
+ offset = 0;
+ g->moffset = 0;
scan = g->strip + 1;
do {
s = *scan++;
switch (OP(s)) {
case OCHAR: /* sequence member */
- if (newlen == 0) /* new sequence */
+ if (newlen == 0) { /* new sequence */
+ memset(&mbs, 0, sizeof(mbs));
newstart = scan - 1;
+ }
+#ifdef NLS
+ char buf[MB_LEN_MAX];
+ size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
+ if (clen == (size_t)-1)
+ goto toohard;
+ newlen += (sopno)clen;
+#else
newlen++;
+#endif
break;
case OPLUS_: /* things that don't break one */
case OLPAREN:
@@ -1843,60 +2061,346 @@
break;
case OQUEST_: /* things that must be skipped */
case OCH_:
+ offset = altoffset(scan, offset);
scan--;
do {
scan += OPND(s);
s = *scan;
/* assert() interferes w debug printouts */
- if (OP(s) != O_QUEST && OP(s) != O_CH &&
- OP(s) != OOR2) {
+ if (OP(s) != O_QUEST &&
+ OP(s) != O_CH && OP(s) != OOR2) {
g->iflags |= BAD;
return;
}
} while (OP(s) != O_QUEST && OP(s) != O_CH);
/* FALLTHROUGH */
- default: /* things that break a sequence */
- if (newlen > g->mlen) { /* ends one */
+ case OBOW: /* things that break a sequence */
+ case OEOW:
+ case OBOL:
+ case OEOL:
+ case OBOS:
+ case OEOS:
+ case OWBND:
+ case ONWBND:
+ case O_QUEST:
+ case O_CH:
+ case OEND:
+ if (newlen > (sopno)g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
}
newlen = 0;
break;
+ case OANY:
+ if (newlen > (sopno)g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
+ }
+ if (offset > -1)
+ offset++;
+ newlen = 0;
+ break;
+ case OANYOF: /* may or may not invalidate offset */
+ /* First, everything as OANY */
+ if (newlen > (sopno)g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
+ }
+ if (offset > -1)
+ offset++;
+ newlen = 0;
+ break;
+#ifdef NLS
+ toohard:/*FALLTHROUGH*/
+#endif
+ default:
+ /* Anything here makes it impossible or too hard
+ * to calculate the offset -- so we give up;
+ * save the last known good offset, in case the
+ * must sequence doesn't occur later.
+ */
+ if (newlen > (sopno)g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1)
+ g->moffset += offset;
+ else
+ g->moffset = offset;
+ }
+ offset = -1;
+ newlen = 0;
+ break;
}
} while (OP(s) != OEND);
- if (start == NULL)
- g->mlen = 0;
-
- if (g->mlen == 0) /* there isn't one */
+ if (g->mlen == 0) { /* there isn't one */
+ g->moffset = -1;
return;
+ }
/* turn it into a character string */
g->must = malloc((size_t)g->mlen + 1);
if (g->must == NULL) { /* argh; just forget it */
g->mlen = 0;
+ g->moffset = -1;
return;
}
cp = g->must;
scan = start;
- for (i = g->mlen; i > 0; i--) {
+ memset(&mbs, 0, sizeof(mbs));
+ while (cp < g->must + g->mlen) {
while (OP(s = *scan++) != OCHAR)
continue;
- assert(cp < g->must + g->mlen);
- *cp++ = (char)OPND(s);
+#ifdef NLS
+ size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
+ assert(clen != (size_t)-1);
+ cp += clen;
+#else
+ *cp++ = OPND(s);
+#endif
}
assert(cp == g->must + g->mlen);
*cp++ = '\0'; /* just on general principles */
}
/*
+ - altoffset - choose biggest offset among multiple choices
+ == static int altoffset(sop *scan, int offset);
+ *
+ * Compute, recursively if necessary, the largest offset among multiple
+ * re paths.
+ */
+static int
+altoffset(sop *scan, int offset)
+{
+ int largest;
+ int try;
+ sop s;
+
+ _DIAGASSERT(scan != NULL);
+
+ /* If we gave up already on offsets, return */
+ if (offset == -1)
+ return -1;
+
+ largest = 0;
+ try = 0;
+ s = *scan++;
+ while (OP(s) != O_QUEST && OP(s) != O_CH) {
+ switch (OP(s)) {
+ case OOR1:
+ if (try > largest)
+ largest = try;
+ try = 0;
+ break;
+ case OQUEST_:
+ case OCH_:
+ try = altoffset(scan, try);
+ if (try == -1)
+ return -1;
+ scan--;
+ do {
+ scan += OPND(s);
+ s = *scan;
+ if (OP(s) != O_QUEST &&
+ OP(s) != O_CH && OP(s) != OOR2)
+ return -1;
+ } while (OP(s) != O_QUEST && OP(s) != O_CH);
+ /* We must skip to the next position, or we'll
+ * leave altoffset() too early.
+ */
+ scan++;
+ break;
+ case OANYOF:
+ case OCHAR:
+ case OANY:
+ try++;
+ /*FALLTHROUGH*/
+ case OBOW:
+ case OEOW:
+ case OWBND:
+ case ONWBND:
+ case OLPAREN:
+ case ORPAREN:
+ case OOR2:
+ break;
+ default:
+ try = -1;
+ break;
+ }
+ if (try == -1)
+ return -1;
+ s = *scan++;
+ }
+
+ if (try > largest)
+ largest = try;
+
+ return largest+offset;
+}
+
+/*
+ - computejumps - compute char jumps for BM scan
+ == static void computejumps(struct parse *p, struct re_guts *g);
+ *
+ * This algorithm assumes g->must exists and has size greater than
+ * zero. It's based on the algorithm found in Computer Algorithms by
+ * Sara Baase.
+ *
+ * A char jump is the number of characters one needs to jump based on
+ * the value of the character from the text that was mismatched.
+ */
+static void
+computejumps(struct parse *p, struct re_guts *g)
+{
+ int ch;
+ size_t mindex;
+
+ _DIAGASSERT(p != NULL);
+ _DIAGASSERT(g != NULL);
+
+ /* Avoid making errors worse */
+ if (p->error != 0)
+ return;
+
+ g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
+ if (g->charjump == NULL) /* Not a fatal error */
+ return;
+ /* Adjust for signed chars, if necessary */
+ g->charjump = &g->charjump[-(CHAR_MIN)];
+
+ /* If the character does not exist in the pattern, the jump
+ * is equal to the number of characters in the pattern.
+ */
+ for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
+ g->charjump[ch] = g->mlen;
+
+ /* If the character does exist, compute the jump that would
+ * take us to the last character in the pattern equal to it
+ * (notice that we match right to left, so that last character
+ * is the first one that would be matched).
+ */
+ for (mindex = 0; mindex < g->mlen; mindex++)
+ g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
+}
+
+/*
+ - computematchjumps - compute match jumps for BM scan
+ == static void computematchjumps(struct parse *p, struct re_guts *g);
+ *
+ * This algorithm assumes g->must exists and has size greater than
+ * zero. It's based on the algorithm found in Computer Algorithms by
+ * Sara Baase.
+ *
+ * A match jump is the number of characters one needs to advance based
+ * on the already-matched suffix.
+ * Notice that all values here are minus (g->mlen-1), because of the way
+ * the search algorithm works.
+ */
+static void
+computematchjumps(struct parse *p, struct re_guts *g)
+{
+ size_t mindex; /* General "must" iterator */
+ size_t suffix; /* Keeps track of matching suffix */
+ size_t ssuffix; /* Keeps track of suffixes' suffix */
+ size_t* pmatches; /* pmatches[k] points to the next i
+ * such that i+1...mlen is a substring
+ * of k+1...k+mlen-i-1
+ */
+
+ _DIAGASSERT(p != NULL);
+ _DIAGASSERT(g != NULL);
+
+ /* Avoid making errors worse */
+ if (p->error != 0)
+ return;
+
+ pmatches = calloc(g->mlen, sizeof(*pmatches));
+ if (pmatches == NULL) {
+ g->matchjump = NULL;
+ return;
+ }
+
+ g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
+ if (g->matchjump == NULL) { /* Not a fatal error */
+ free(pmatches);
+ return;
+ }
+
+ /* Set maximum possible jump for each character in the pattern */
+ for (mindex = 0; mindex < g->mlen; mindex++)
+ g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
+
+ /* Compute pmatches[] */
+ for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
+ pmatches[mindex] = suffix;
+
+ /* If a mismatch is found, interrupting the substring,
+ * compute the matchjump for that position. If no
+ * mismatch is found, then a text substring mismatched
+ * against the suffix will also mismatch against the
+ * substring.
+ */
+ while (suffix < g->mlen
+ && g->must[mindex] != g->must[suffix]) {
+ g->matchjump[suffix] = MIN(g->matchjump[suffix],
+ g->mlen - mindex - 1);
+ suffix = pmatches[suffix];
+ }
+ }
+
+ /* Compute the matchjump up to the last substring found to jump
+ * to the beginning of the largest must pattern prefix matching
+ * its own suffix.
+ */
+ for (mindex = 0; mindex <= suffix; mindex++)
+ g->matchjump[mindex] = MIN(g->matchjump[mindex],
+ g->mlen + suffix - mindex);
+
+ ssuffix = pmatches[suffix];
+ while (suffix < g->mlen) {
+ while (suffix <= ssuffix && suffix < g->mlen) {
+ g->matchjump[suffix] = MIN(g->matchjump[suffix],
+ g->mlen + ssuffix - suffix);
+ suffix++;
+ }
+ if (suffix < g->mlen)
+ ssuffix = pmatches[ssuffix];
+ }
+
+ free(pmatches);
+}
+
+/*
- pluscount - count + nesting
== static sopno pluscount(struct parse *p, struct re_guts *g);
*/
static sopno /* nesting depth */
-pluscount(
- struct parse *p,
- struct re_guts *g)
+pluscount(struct parse *p, struct re_guts *g)
{
sop *scan;
sop s;
diff --git a/libc/upstream-netbsd/lib/libc/regex/regerror.c b/libc/upstream-netbsd/lib/libc/regex/regerror.c
index e00d7c0..cfd7704 100644
--- a/libc/upstream-netbsd/lib/libc/regex/regerror.c
+++ b/libc/upstream-netbsd/lib/libc/regex/regerror.c
@@ -1,6 +1,9 @@
-/* $NetBSD: regerror.c,v 1.23 2007/02/09 23:44:18 junyoung Exp $ */
+/* $NetBSD: regerror.c,v 1.26 2022/11/05 11:33:55 riastradh Exp $ */
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
@@ -34,76 +37,38 @@
* @(#)regerror.c 8.4 (Berkeley) 3/20/94
*/
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)regerror.c 8.4 (Berkeley) 3/20/94
- */
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)regerror.c 8.4 (Berkeley) 3/20/94";
-#else
-__RCSID("$NetBSD: regerror.c,v 1.23 2007/02/09 23:44:18 junyoung Exp $");
+__FBSDID("$FreeBSD: head/lib/libc/regex/regerror.c 326025 2017-11-20 19:49:47Z pfg $");
#endif
-#endif /* LIBC_SCCS and not lint */
+__RCSID("$NetBSD: regerror.c,v 1.26 2022/11/05 11:33:55 riastradh Exp $");
#include "namespace.h"
#include <sys/types.h>
-
-#include <assert.h>
-#include <ctype.h>
-#include <limits.h>
#include <stdio.h>
-#include <stdlib.h>
#include <string.h>
+#include <limits.h>
+#include <stdlib.h>
#include <regex.h>
+#include "utils.h"
+
#ifdef __weak_alias
__weak_alias(regerror,_regerror)
#endif
-#include "utils.h"
-
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === regerror.c === */
-static const char *regatoi(const regex_t *preg, char *localbuf, size_t buflen);
+static const char *regatoi(const regex_t *preg, char *localbufm, size_t buflen);
#ifdef __cplusplus
}
@@ -126,6 +91,8 @@
= #define REG_EMPTY 14
= #define REG_ASSERT 15
= #define REG_INVARG 16
+ = #define REG_ENOSYS 17
+ = #define REG_ILLSEQ 18
= #define REG_ATOI 255 // convert name to number (!)
= #define REG_ITOA 0400 // convert number to name (!)
*/
@@ -134,36 +101,36 @@
const char *name;
const char *explain;
} rerrs[] = {
- { REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match" },
- { REG_BADPAT, "REG_BADPAT", "invalid regular expression" },
- { REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" },
- { REG_ECTYPE, "REG_ECTYPE", "invalid character class" },
- { REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)" },
- { REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" },
- { REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced" },
- { REG_EPAREN, "REG_EPAREN", "parentheses not balanced" },
- { REG_EBRACE, "REG_EBRACE", "braces not balanced" },
- { REG_BADBR, "REG_BADBR", "invalid repetition count(s)" },
- { REG_ERANGE, "REG_ERANGE", "invalid character range" },
- { REG_ESPACE, "REG_ESPACE", "out of memory" },
- { REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid" },
- { REG_EMPTY, "REG_EMPTY", "empty (sub)expression" },
- { REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" },
- { REG_INVARG, "REG_INVARG", "invalid argument to regex routine" },
- { 0, "", "*** unknown regexp error code ***" }
+ {REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match"},
+ {REG_BADPAT, "REG_BADPAT", "invalid regular expression"},
+ {REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element"},
+ {REG_ECTYPE, "REG_ECTYPE", "invalid character class"},
+ {REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)"},
+ {REG_ESUBREG, "REG_ESUBREG", "invalid backreference number"},
+ {REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced"},
+ {REG_EPAREN, "REG_EPAREN", "parentheses not balanced"},
+ {REG_EBRACE, "REG_EBRACE", "braces not balanced"},
+ {REG_BADBR, "REG_BADBR", "invalid repetition count(s)"},
+ {REG_ERANGE, "REG_ERANGE", "invalid character range"},
+ {REG_ESPACE, "REG_ESPACE", "out of memory"},
+ {REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid"},
+ {REG_EMPTY, "REG_EMPTY", "empty (sub)expression"},
+ {REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"},
+ {REG_INVARG, "REG_INVARG", "invalid argument to regex routine"},
+ {REG_ILLSEQ, "REG_ILLSEQ", "illegal byte sequence"},
+ {0, "", "*** unknown regexp error code ***"}
};
/*
- * regerror - the interface to error numbers
- * extern size_t regerror(int, const regex_t *, char *, size_t);
+ - regerror - the interface to error numbers
+ = extern size_t regerror(int, const regex_t *, char *, size_t);
*/
/* ARGSUSED */
size_t
-regerror(
- int errcode,
- const regex_t *preg,
- char *errbuf,
- size_t errbuf_size)
+regerror(int errcode,
+ const regex_t * __restrict preg,
+ char * __restrict errbuf,
+ size_t errbuf_size)
{
const struct rerr *r;
size_t len;
@@ -172,21 +139,20 @@
char convbuf[50];
_DIAGASSERT(errcode != REG_ATOI || preg != NULL);
- _DIAGASSERT(errbuf != NULL);
+ _DIAGASSERT(errbuf_size == 0 || errbuf != NULL);
- if (errcode == REG_ATOI)
+ if (errcode == REG_ATOI) {
s = regatoi(preg, convbuf, sizeof convbuf);
- else {
+ } else {
for (r = rerrs; r->code != 0; r++)
if (r->code == target)
break;
-
- if (errcode & REG_ITOA) {
- if (r->code != 0) {
- (void)strlcpy(convbuf, r->name, sizeof convbuf);
- } else
- (void)snprintf(convbuf, sizeof convbuf,
- "REG_0x%x", target);
+
+ if (errcode&REG_ITOA) {
+ if (r->code != 0)
+ (void) strlcpy(convbuf, r->name, sizeof(convbuf));
+ else
+ snprintf(convbuf, sizeof(convbuf), "REG_0x%x", target);
s = convbuf;
} else
s = r->explain;
@@ -194,21 +160,17 @@
len = strlen(s) + 1;
if (errbuf_size > 0)
- (void)strlcpy(errbuf, s, errbuf_size);
+ (void) strlcpy(errbuf, s, errbuf_size);
return(len);
}
/*
- * regatoi - internal routine to implement REG_ATOI
- * static const char *regatoi(const regex_t *preg, char *localbuf,
- * size_t buflen);
+ - regatoi - internal routine to implement REG_ATOI
+ == static char *regatoi(const regex_t *preg, char *localbuf);
*/
static const char *
-regatoi(
- const regex_t *preg,
- char *localbuf,
- size_t buflen)
+regatoi(const regex_t *preg, char *localbuf, size_t buflen)
{
const struct rerr *r;
@@ -218,6 +180,6 @@
if (r->code == 0)
return "0";
- (void)snprintf(localbuf, buflen, "%d", r->code);
+ snprintf(localbuf, buflen, "%d", r->code);
return localbuf;
}
diff --git a/libc/upstream-netbsd/lib/libc/regex/regex2.h b/libc/upstream-netbsd/lib/libc/regex/regex2.h
index 7c877ee..fbfff0d 100644
--- a/libc/upstream-netbsd/lib/libc/regex/regex2.h
+++ b/libc/upstream-netbsd/lib/libc/regex/regex2.h
@@ -1,6 +1,9 @@
-/* $NetBSD: regex2.h,v 1.13 2011/10/09 18:23:00 christos Exp $ */
+/* $NetBSD: regex2.h,v 1.15 2021/02/24 18:13:21 christos Exp $ */
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
@@ -32,43 +35,7 @@
* SUCH DAMAGE.
*
* @(#)regex2.h 8.4 (Berkeley) 3/20/94
- */
-
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)regex2.h 8.4 (Berkeley) 3/20/94
+ * $FreeBSD: head/lib/libc/regex/regex2.h 368359 2020-12-05 03:18:48Z kevans $
*/
/*
@@ -109,68 +76,100 @@
* In state representations, an operator's bit is on to signify a state
* immediately *preceding* "execution" of that operator.
*/
-typedef u_int32_t sop; /* strip operator */
-typedef size_t sopno;
-#define OPRMASK ((u_int32_t)0xf8000000UL)
-#define OPDMASK ((u_int32_t)0x07ffffffUL)
-#define OPSHIFT ((unsigned)27)
+typedef uint32_t sop; /* strip operator */
+typedef uint32_t sopno;
+#define OPRMASK 0xf8000000U
+#define OPDMASK 0x07ffffffU
+#define OPSHIFT (27U)
#define OP(n) ((n)&OPRMASK)
-#define OPND(n) ((int)((n)&OPDMASK))
+#define OPND(n) ((n)&OPDMASK)
#define SOP(op, opnd) ((op)|(opnd))
-
-#define OPC(n) (((u_int32_t)(n))<<OPSHIFT)
-/* operators meaning operand */
-/* (back, fwd are offsets) */
-#define OEND OPC(1) /* endmarker - */
-#define OCHAR OPC(2) /* character unsigned char */
-#define OBOL OPC(3) /* left anchor - */
-#define OEOL OPC(4) /* right anchor - */
-#define OANY OPC(5) /* . - */
-#define OANYOF OPC(6) /* [...] set number */
-#define OBACK_ OPC(7) /* begin \d paren number */
-#define O_BACK OPC(8) /* end \d paren number */
-#define OPLUS_ OPC(9) /* + prefix fwd to suffix */
-#define O_PLUS OPC(10) /* + suffix back to prefix */
-#define OQUEST_ OPC(11) /* ? prefix fwd to suffix */
-#define O_QUEST OPC(12) /* ? suffix back to prefix */
-#define OLPAREN OPC(13) /* ( fwd to ) */
-#define ORPAREN OPC(14) /* ) back to ( */
-#define OCH_ OPC(15) /* begin choice fwd to OOR2 */
-#define OOR1 OPC(16) /* | pt. 1 back to OOR1 or OCH_ */
-#define OOR2 OPC(17) /* | pt. 2 fwd to OOR2 or O_CH */
-#define O_CH OPC(18) /* end choice back to OOR1 */
-#define OBOW OPC(19) /* begin word - */
-#define OEOW OPC(20) /* end word - */
+/* operators meaning operand */
+/* (back, fwd are offsets) */
+#define OEND (1U<<OPSHIFT) /* endmarker - */
+#define OCHAR (2U<<OPSHIFT) /* character wide character */
+#define OBOL (3U<<OPSHIFT) /* left anchor - */
+#define OEOL (4U<<OPSHIFT) /* right anchor - */
+#define OANY (5U<<OPSHIFT) /* . - */
+#define OANYOF (6U<<OPSHIFT) /* [...] set number */
+#define OBACK_ (7U<<OPSHIFT) /* begin \d paren number */
+#define O_BACK (8U<<OPSHIFT) /* end \d paren number */
+#define OPLUS_ (9U<<OPSHIFT) /* + prefix fwd to suffix */
+#define O_PLUS (10U<<OPSHIFT) /* + suffix back to prefix */
+#define OQUEST_ (11U<<OPSHIFT) /* ? prefix fwd to suffix */
+#define O_QUEST (12U<<OPSHIFT) /* ? suffix back to prefix */
+#define OLPAREN (13U<<OPSHIFT) /* ( fwd to ) */
+#define ORPAREN (14U<<OPSHIFT) /* ) back to ( */
+#define OCH_ (15U<<OPSHIFT) /* begin choice fwd to OOR2 */
+#define OOR1 (16U<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */
+#define OOR2 (17U<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */
+#define O_CH (18U<<OPSHIFT) /* end choice back to OOR1 */
+#define OBOW (19U<<OPSHIFT) /* begin word - */
+#define OEOW (20U<<OPSHIFT) /* end word - */
+#define OBOS (21U<<OPSHIFT) /* begin subj. - */
+#define OEOS (22U<<OPSHIFT) /* end subj. - */
+#define OWBND (23U<<OPSHIFT) /* word bound - */
+#define ONWBND (24U<<OPSHIFT) /* not bound - */
/*
- * Structure for [] character-set representation. Character sets are
- * done as bit vectors, grouped 8 to a byte vector for compactness.
- * The individual set therefore has both a pointer to the byte vector
- * and a mask to pick out the relevant bit of each byte. A hash code
- * simplifies testing whether two sets could be identical.
- *
- * This will get trickier for multicharacter collating elements. As
- * preliminary hooks for dealing with such things, we also carry along
- * a string of multi-character elements, and decide the size of the
- * vectors at run time.
+ * Structures for [] character-set representation.
*/
typedef struct {
- uch *ptr; /* -> uch [csetsize] */
- uch mask; /* bit within array */
- uch hash; /* hash code */
- size_t smultis;
- char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */
+ wint_t min;
+ wint_t max;
+} crange;
+typedef struct {
+ unsigned char bmp[NC_MAX / 8];
+ wctype_t *types;
+ unsigned int ntypes;
+ wint_t *wides;
+ unsigned int nwides;
+ crange *ranges;
+ unsigned int nranges;
+ int invert;
+ int icase;
} cset;
-/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
-#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
-#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
-#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
-#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */
-#define MCsub(p, cs, cp) mcsub(p, cs, cp)
-#define MCin(p, cs, cp) mcin(p, cs, cp)
-/* stuff for character categories */
-typedef unsigned char cat_t;
+static int
+CHIN1(cset *cs, wint_t ch)
+{
+ unsigned int i;
+
+ assert(ch >= 0);
+ if (ch < NC)
+ return (((cs->bmp[(unsigned)ch >> 3] & (1 << (ch & 7))) != 0) ^
+ cs->invert);
+ for (i = 0; i < cs->nwides; i++) {
+ if (cs->icase) {
+ if (ch == towlower(cs->wides[i]) ||
+ ch == towupper(cs->wides[i]))
+ return (!cs->invert);
+ } else if (ch == cs->wides[i])
+ return (!cs->invert);
+ }
+ for (i = 0; i < cs->nranges; i++)
+ if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max)
+ return (!cs->invert);
+ for (i = 0; i < cs->ntypes; i++)
+ if (iswctype(ch, cs->types[i]))
+ return (!cs->invert);
+ return (cs->invert);
+}
+
+static __inline int
+CHIN(cset *cs, wint_t ch)
+{
+
+ assert(ch >= 0);
+ if (ch < NC)
+ return (((cs->bmp[(unsigned)ch >> 3] & (1 << (ch & 7))) != 0) ^
+ cs->invert);
+ else if (cs->icase)
+ return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) ||
+ CHIN1(cs, towupper(ch)));
+ else
+ return (CHIN1(cs, ch));
+}
/*
* main compiled-expression structure
@@ -179,10 +178,8 @@
int magic;
# define MAGIC2 ((('R'^0200)<<8)|'E')
sop *strip; /* malloced area for strip */
- size_t csetsize; /* number of bits in a cset vector */
size_t ncsets; /* number of csets in use */
cset *sets; /* -> cset [ncsets] */
- uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */
int cflags; /* copy of regcomp() cflags argument */
sopno nstates; /* = number of sops */
sopno firststate; /* the initial OEND (normally 0) */
@@ -193,17 +190,17 @@
# define BAD 04 /* something wrong */
size_t nbol; /* number of ^ used */
size_t neol; /* number of $ used */
- size_t ncategories; /* how many character categories */
- cat_t *categories; /* ->catspace[-CHAR_MIN] */
char *must; /* match must contain this string */
+ int moffset; /* latest point at which must may be located */
+ size_t *charjump; /* Boyer-Moore char jump table */
+ size_t *matchjump; /* Boyer-Moore match jump table */
size_t mlen; /* length of must */
size_t nsub; /* copy of re_nsub */
int backrefs; /* does it use back references? */
sopno nplus; /* how deep does it nest +s? */
- /* catspace must be last */
- cat_t catspace[1]; /* actually [NC] */
};
/* misc utilities */
-#define OUT (CHAR_MAX+1) /* a non-character value */
-#define ISWORD(c) (isalnum((unsigned char)c) || (c) == '_')
+#define OUT (CHAR_MIN - 1) /* a non-character value */
+#define IGN (CHAR_MIN - 2)
+#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')
diff --git a/libc/upstream-netbsd/lib/libc/regex/regexec.c b/libc/upstream-netbsd/lib/libc/regex/regexec.c
index f16e0b6..213a90b 100644
--- a/libc/upstream-netbsd/lib/libc/regex/regexec.c
+++ b/libc/upstream-netbsd/lib/libc/regex/regexec.c
@@ -1,6 +1,9 @@
-/* $NetBSD: regexec.c,v 1.22 2012/03/13 21:13:43 christos Exp $ */
+/* $NetBSD: regexec.c,v 1.26 2021/02/26 19:24:47 christos Exp $ */
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
@@ -34,91 +37,96 @@
* @(#)regexec.c 8.3 (Berkeley) 3/20/94
*/
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)regexec.c 8.3 (Berkeley) 3/20/94
- */
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)regexec.c 8.3 (Berkeley) 3/20/94";
-#else
-__RCSID("$NetBSD: regexec.c,v 1.22 2012/03/13 21:13:43 christos Exp $");
+__FBSDID("$FreeBSD: head/lib/libc/regex/regexec.c 326025 2017-11-20 19:49:47Z pfg $");
#endif
-#endif /* LIBC_SCCS and not lint */
+__RCSID("$NetBSD: regexec.c,v 1.26 2021/02/26 19:24:47 christos Exp $");
/*
* the outer shell of regexec()
*
- * This file includes engine.c *twice*, after muchos fiddling with the
+ * This file includes engine.c three times, after muchos fiddling with the
* macros that code uses. This lets the same code operate on two different
- * representations for state sets.
+ * representations for state sets and characters.
*/
-#include "namespace.h"
-#include <sys/types.h>
-#include <assert.h>
-#include <ctype.h>
-#include <limits.h>
+#ifndef LIBHACK
+#include "namespace.h"
+#endif
+#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <limits.h>
+#include <ctype.h>
#include <regex.h>
-#ifdef __weak_alias
+#if defined(__weak_alias) && !defined(LIBHACK)
__weak_alias(regexec,_regexec)
#endif
#include "utils.h"
#include "regex2.h"
+static __inline size_t
+xmbrtowc(wint_t *wi, const char *s, size_t n, mbstate_t *mbs, wint_t dummy)
+{
+#ifdef NLS
+ size_t nr;
+ wchar_t wc;
+
+ nr = mbrtowc(&wc, s, n, mbs);
+ if (wi != NULL)
+ *wi = wc;
+ if (nr == 0)
+ return (1);
+ else if (nr == (size_t)-1 || nr == (size_t)-2) {
+ memset(mbs, 0, sizeof(*mbs));
+ if (wi != NULL)
+ *wi = dummy;
+ return (1);
+ } else
+ return (nr);
+#else
+ if (wi)
+ *wi = *s;
+ return 1;
+#endif
+}
+
+static __inline size_t
+xmbrtowc_dummy(wint_t *wi,
+ const char *s,
+ size_t n __unused,
+ mbstate_t *mbs __unused,
+ wint_t dummy __unused)
+{
+
+ if (wi != NULL)
+ *wi = (unsigned char)*s;
+ return (1);
+}
+
/* macros for manipulating states, small version */
-#define states unsigned long
-#define states1 unsigned long /* for later use in regexec() decision */
+#define states long
+#define states1 states /* for later use in regexec() decision */
#define CLEAR(v) ((v) = 0)
#define SET0(v, n) ((v) &= ~((unsigned long)1 << (n)))
#define SET1(v, n) ((v) |= (unsigned long)1 << (n))
#define ISSET(v, n) (((v) & ((unsigned long)1 << (n))) != 0)
#define ASSIGN(d, s) ((d) = (s))
#define EQ(a, b) ((a) == (b))
-#define STATEVARS int dummy /* dummy version */
+#define STATEVARS long dummy /* dummy version */
#define STATESETUP(m, n) /* nothing */
#define STATETEARDOWN(m) /* nothing */
#define SETUP(v) ((v) = 0)
-#define onestate unsigned long
+#define onestate long
#define INIT(o, n) ((o) = (unsigned long)1 << (n))
#define INC(o) ((o) <<= 1)
#define ISSTATEIN(v, o) (((v) & (o)) != 0)
@@ -127,6 +135,9 @@
#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n))
#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n))
#define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0)
+/* no multibyte support */
+#define XMBRTOWC xmbrtowc_dummy
+#define ZAPSTATE(mbs) ((void)(mbs))
/* function names */
#define SNAMES /* engine.c looks after details */
@@ -152,26 +163,25 @@
#undef BACK
#undef ISSETBACK
#undef SNAMES
+#undef XMBRTOWC
+#undef ZAPSTATE
/* macros for manipulating states, large version */
#define states char *
-#define CLEAR(v) memset(v, 0, (size_t)m->g->nstates)
+#define CLEAR(v) memset(v, 0, m->g->nstates)
#define SET0(v, n) ((v)[n] = 0)
#define SET1(v, n) ((v)[n] = 1)
#define ISSET(v, n) ((v)[n])
-#define ASSIGN(d, s) memcpy(d, s, (size_t)m->g->nstates)
-#define EQ(a, b) (memcmp(a, b, (size_t)m->g->nstates) == 0)
-#define STATEVARS int vn; char *space
-#define STATESETUP(m, nv) \
- if (((m)->space = malloc((size_t)((nv)*(m)->g->nstates))) == NULL) \
- return(REG_ESPACE); \
- else \
- (m)->vn = 0
-
-#define STATETEARDOWN(m) { free((m)->space); m->space = NULL; }
-#define SETUP(v) ((v) = &m->space[(size_t)(m->vn++ * m->g->nstates)])
-#define onestate int
-#define INIT(o, n) ((o) = (int)(n))
+#define ASSIGN(d, s) memcpy(d, s, m->g->nstates)
+#define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0)
+#define STATEVARS long vn; char *space
+#define STATESETUP(m, nv) { (m)->space = malloc((nv)*(m)->g->nstates); \
+ if ((m)->space == NULL) return(REG_ESPACE); \
+ (m)->vn = 0; }
+#define STATETEARDOWN(m) { free((m)->space); }
+#define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates])
+#define onestate long
+#define INIT(o, n) ((o) = (n))
#define INC(o) ((o)++)
#define ISSTATEIN(v, o) ((v)[o])
/* some abbreviations; note that some of these know variable names! */
@@ -179,11 +189,24 @@
#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here])
#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here])
#define ISSETBACK(v, n) ((v)[here - (n)])
+/* no multibyte support */
+#define XMBRTOWC xmbrtowc_dummy
+#define ZAPSTATE(mbs) ((void)(mbs))
/* function names */
#define LNAMES /* flag */
#include "engine.c"
+/* multibyte character & large states version */
+#undef LNAMES
+#undef XMBRTOWC
+#undef ZAPSTATE
+#define XMBRTOWC xmbrtowc
+#define ZAPSTATE(mbs) memset((mbs), 0, sizeof(*(mbs)))
+#define MNAMES
+
+#include "engine.c"
+
/*
- regexec - interface for matching
= extern int regexec(const regex_t *, const char *, size_t, \
@@ -200,21 +223,18 @@
* have been prototyped.
*/
int /* 0 success, REG_NOMATCH failure */
-regexec(
- const regex_t *preg,
- const char *string,
- size_t nmatch,
- regmatch_t pmatch[],
- int eflags)
+regexec(const regex_t * __restrict preg,
+ const char * __restrict string,
+ size_t nmatch,
+ regmatch_t pmatch[__restrict],
+ int eflags)
{
struct re_guts *g = preg->re_g;
- char *s;
#ifdef REDEBUG
# define GOODFLAGS(f) (f)
#else
# define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND))
#endif
-
_DIAGASSERT(preg != NULL);
_DIAGASSERT(string != NULL);
@@ -225,10 +245,10 @@
return(REG_BADPAT);
eflags = GOODFLAGS(eflags);
- s = __UNCONST(string);
-
- if (g->nstates <= (sopno)(CHAR_BIT*sizeof(states1)) && !(eflags&REG_LARGE))
- return(smatcher(g, s, nmatch, pmatch, eflags));
+ if (MB_CUR_MAX > 1)
+ return(mmatcher(g, string, nmatch, pmatch, eflags));
+ else if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
+ return(smatcher(g, string, nmatch, pmatch, eflags));
else
- return(lmatcher(g, s, nmatch, pmatch, eflags));
+ return(lmatcher(g, string, nmatch, pmatch, eflags));
}
diff --git a/libc/upstream-netbsd/lib/libc/regex/regfree.c b/libc/upstream-netbsd/lib/libc/regex/regfree.c
index ce011ea..7e388b1 100644
--- a/libc/upstream-netbsd/lib/libc/regex/regfree.c
+++ b/libc/upstream-netbsd/lib/libc/regex/regfree.c
@@ -1,6 +1,9 @@
-/* $NetBSD: regfree.c,v 1.15 2007/02/09 23:44:18 junyoung Exp $ */
+/* $NetBSD: regfree.c,v 1.19 2021/02/26 19:24:47 christos Exp $ */
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
@@ -34,58 +37,22 @@
* @(#)regfree.c 8.3 (Berkeley) 3/20/94
*/
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)regfree.c 8.3 (Berkeley) 3/20/94
- */
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
#include <sys/cdefs.h>
-#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)regfree.c 8.3 (Berkeley) 3/20/94";
-#else
-__RCSID("$NetBSD: regfree.c,v 1.15 2007/02/09 23:44:18 junyoung Exp $");
+__FBSDID("$FreeBSD: head/lib/libc/regex/regfree.c 326025 2017-11-20 19:49:47Z pfg $");
#endif
-#endif /* LIBC_SCCS and not lint */
+__RCSID("$NetBSD: regfree.c,v 1.19 2021/02/26 19:24:47 christos Exp $");
#include "namespace.h"
#include <sys/types.h>
-
-#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
+#include <limits.h>
#include <regex.h>
#ifdef __weak_alias
@@ -100,10 +67,10 @@
= extern void regfree(regex_t *);
*/
void
-regfree(
- regex_t *preg)
+regfree(regex_t *preg)
{
struct re_guts *g;
+ unsigned int i;
_DIAGASSERT(preg != NULL);
@@ -119,11 +86,19 @@
if (g->strip != NULL)
free(g->strip);
- if (g->sets != NULL)
+ if (g->sets != NULL) {
+ for (i = 0; i < g->ncsets; i++) {
+ free(g->sets[i].ranges);
+ free(g->sets[i].wides);
+ free(g->sets[i].types);
+ }
free(g->sets);
- if (g->setbits != NULL)
- free(g->setbits);
+ }
if (g->must != NULL)
free(g->must);
+ if (g->charjump != NULL)
+ free(&g->charjump[CHAR_MIN]);
+ if (g->matchjump != NULL)
+ free(g->matchjump);
free(g);
}
diff --git a/libc/upstream-netbsd/lib/libc/regex/utils.h b/libc/upstream-netbsd/lib/libc/regex/utils.h
index 762caee..972f555 100644
--- a/libc/upstream-netbsd/lib/libc/regex/utils.h
+++ b/libc/upstream-netbsd/lib/libc/regex/utils.h
@@ -1,6 +1,9 @@
-/* $NetBSD: utils.h,v 1.6 2003/08/07 16:43:21 agc Exp $ */
+/* $NetBSD: utils.h,v 1.9 2021/04/22 19:20:24 christos Exp $ */
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
@@ -32,49 +35,38 @@
* SUCH DAMAGE.
*
* @(#)utils.h 8.3 (Berkeley) 3/20/94
+ * $FreeBSD: head/lib/libc/regex/utils.h 341838 2018-12-12 04:23:00Z yuripv $
*/
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)utils.h 8.3 (Berkeley) 3/20/94
- */
+#ifdef NLS
+#include <wchar.h>
+#include <wctype.h>
+#else
+#include <ctype.h>
+#define wint_t regex_wint_t
+#define mbstate_t regex_mbstate_t
+#define wctype_t regex_wctype_t
+typedef short wint_t;
+typedef char mbstate_t;
+typedef short wctype_t;
+#define iswupper(a) isupper(a)
+#define iswlower(a) islower(a)
+#define iswalpha(a) isalpha(a)
+#define iswalnum(a) isalnum(a)
+#define towupper(a) toupper(a)
+#define towlower(a) tolower(a)
+extern wctype_t __regex_wctype(const char *);
+extern int __regex_iswctype(wint_t, wctype_t);
+#define wctype(s) __regex_wctype(s)
+#define iswctype(c, t) __regex_iswctype((c), (t))
+#endif
/* utility definitions */
#define DUPMAX _POSIX2_RE_DUP_MAX /* xxx is this right? */
#define INFINITY (DUPMAX + 1)
-#define NC (CHAR_MAX - CHAR_MIN + 1)
+
+#define NC_MAX (CHAR_MAX - CHAR_MIN + 1)
+#define NC ((MB_CUR_MAX) == 1 ? (NC_MAX) : (128))
typedef unsigned char uch;
/* switch off assertions (if not already off) if no REDEBUG */
diff --git a/libc/upstream-netbsd/lib/libc/stdlib/bsearch.c b/libc/upstream-netbsd/lib/libc/stdlib/bsearch.c
index 2b0e0d8..e48fe85 100644
--- a/libc/upstream-netbsd/lib/libc/stdlib/bsearch.c
+++ b/libc/upstream-netbsd/lib/libc/stdlib/bsearch.c
@@ -1,4 +1,4 @@
-/* $NetBSD: bsearch.c,v 1.15 2012/03/04 20:01:45 christos Exp $ */
+/* $NetBSD: bsearch.c,v 1.16 2022/05/31 08:43:14 andvar Exp $ */
/*
* Copyright (c) 1990, 1993
@@ -34,7 +34,7 @@
#if 0
static char sccsid[] = "@(#)bsearch.c 8.1 (Berkeley) 6/4/93";
#else
-__RCSID("$NetBSD: bsearch.c,v 1.15 2012/03/04 20:01:45 christos Exp $");
+__RCSID("$NetBSD: bsearch.c,v 1.16 2022/05/31 08:43:14 andvar Exp $");
#endif
#endif /* LIBC_SCCS and not lint */
@@ -50,7 +50,7 @@
* is odd, moving left simply involves halving lim: e.g., when lim
* is 5 we look at item 2, so we change lim to 2 so that we will
* look at items 0 & 1. If lim is even, the same applies. If lim
- * is odd, moving right again involes halving lim, this time moving
+ * is odd, moving right again involves halving lim, this time moving
* the base up one item past p: e.g., when lim is 5 we change base
* to item 3 and make lim 2 so that we will look at items 3 and 4.
* If lim is even, however, we have to shrink it by one before