diff --git a/regexp/jimregexp.c b/regexp/jimregexp.c index e3fe4fc05..7fd6d473e 100644 --- a/regexp/jimregexp.c +++ b/regexp/jimregexp.c @@ -1,1909 +1,1923 @@ /* * vi:se ts=8: * * regcomp and regexec -- regsub and regerror are elsewhere * * Copyright (c) 1986 by University of Toronto. * Written by Henry Spencer. Not derived from licensed software. * * Permission is granted to anyone to use this software for any * purpose on any computer system, and to redistribute it freely, * subject to the following restrictions: * * 1. The author is not responsible for the consequences of use of * this software, no matter how awful, even if they arise * from defects in it. * * 2. The origin of this software must not be misrepresented, either * by explicit claim or by omission. * * 3. Altered versions must be plainly marked as such, and must not * be misrepresented as being the original software. *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | *** to assist in implementing egrep. *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching *** as in BSD grep and ex. *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h *** was moved into regexp.h, and the include of regexp.h now uses "'s *** to avoid conflicting with the system regexp.h. Const, bless its *** soul, was removed so it can compile everywhere. The declaration *** of strchr() was in conflict on AIX, so it was removed (as it is *** happily defined in string.h). *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald *** seiwald@perforce.com, on 20 January 2000, to use function prototypes. *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald *** seiwald@perforce.com, on 05 November 2002, to const string literals. * * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett * on 16 October 2010, to remove static state and add better Tcl ARE compatibility. * This includes counted repetitions, UTF-8 support, character classes, * shorthand character classes, increased number of parentheses to 100, * backslash escape sequences. It also removes \n as an alternative to |. * *** THIS IS AN ALTERED VERSION. It was altered to offer POSIX-like *** regular expression routines of regcomp/regexec/regerror/regfree, *** with UTF-8 support, by NIIBE Yutaka on *** 2020-02-14. * * Beware that some of this code is subtly aware of the way operator * precedence is structured in regular expressions. Serious changes in * regular-expression syntax might require a total rethink. */ #if defined(JIM_REGEXP) #include #include #include #include #include "jimregexp.h" #include "utf8.h" #define UCHAR(c) ((unsigned char)(c)) /* An arbitrary limit, but this seems enough. Must be less than 1000. */ #define REG_MAX_PAREN 100 /* * Structure for regexp "program". This is essentially a linear encoding * of a nondeterministic finite-state machine (aka syntax charts or * "railroad normal form" in parsing technology). Each node is an opcode * plus a "next" pointer, possibly plus an operand. "Next" pointers of * all nodes except BRANCH implement concatenation; a "next" pointer with * a BRANCH on both ends of it is connecting two alternatives. (Here we * have one of the subtle syntax dependencies: an individual BRANCH (as * opposed to a collection of them) is never concatenated with anything * because of operator precedence.) The operand of some types of node is * a literal string; for others, it is a node leading into a sub-FSM. In * particular, the operand of a BRANCH node is the first node of the branch. * (NB this is *not* a tree structure: the tail of the branch connects * to the thing following the set of BRANCHes.) The opcodes are: */ /* definition number opnd? meaning */ #define END 0 /* no End of program. */ #define BOL 1 /* no Match "" at beginning of line. */ #define EOL 2 /* no Match "" at end of line. */ #define ANY 3 /* no Match any one character. */ #define ANYOF 4 /* str Match any character in this string. */ #define ANYBUT 5 /* str Match any character not in this string. */ #define BRANCH 6 /* node Match this alternative, or the next... */ #define BACK 7 /* no Match "", "next" ptr points backward. */ #define EXACTLY 8 /* str Match this string. */ #define NOTHING 9 /* no Match empty string. */ #define REP 10 /* max,min Match this (simple) thing [min,max] times. */ #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */ #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */ #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */ #define BOLX 14 /* no Match "" at beginning of input. */ #define EOLX 15 /* no Match "" at end of input. */ #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */ #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */ #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */ #define OPEN 1001 /* no Mark this point in input as start of #n. */ /* OPEN+1 is number 1, etc. */ /* must not be any other opts between OPEN and CLOSE */ #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */ #define CLOSE 2001 /* no Analogous to OPEN. */ #define CLOSE_END (CLOSE+REG_MAX_PAREN) /* * The first word of the regexp internal "program" is actually this magic * number; the start node begins in the second word. */ #define REG_MAGIC 0xFADED00D /* * Opcode notes: * * BRANCH The set of branches constituting a single choice are hooked * together with their "next" pointers, since precedence prevents * anything being concatenated to any individual branch. The * "next" pointer of the last BRANCH in a choice points to the * thing following the whole choice. This is also where the * final "next" pointer of each individual branch points; each * branch starts with the operand node of a BRANCH node. * * BACK Normal "next" pointers all implicitly point forward; BACK * exists to make loop structures possible. * * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented * as either simple repeats (REP) or complex repeats (REPX). * These opcodes include a "min" and "max" count after the opcode. * This is followed by a fourth "current count" word that is * only used by REPX, as it implements a recursive match. * REPMIN and REPXMIN are identical except they implement minimal repeats. * * OPEN,CLOSE ...are numbered at compile time. */ /* * A node is one word of opcode followed by one word of "next" pointer. * The "next" pointer value is a positive offset from the opcode of the node * containing it. * An operand, if any, simply follows the node. (Note that much of the * code generation knows about this implicit relationship.) */ #define OP(preg, p) (preg->program[p]) #define NEXT(preg, p) (preg->program[p + 1]) #define OPERAND(p) ((p) + 2) /* * See regmagic.h for one further detail of program structure. */ /* * Utility definitions. */ #define FAIL(R,M) { (R)->err = (M); return (M); } #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{') #define META "^$.[()|?{+*" /* * Flags to be passed up and down. */ #define HASWIDTH 1 /* Known never to match null string. */ #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */ #define SPSTART 4 /* Starts with * or +. */ #define WORST 0 /* Worst case. */ #define MAX_REP_COUNT 1000000 /* * Forward declarations for regcomp()'s friends. */ static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ); static int regpiece(regex_t *preg, int *flagp ); static int regbranch(regex_t *preg, int *flagp ); static int regatom(regex_t *preg, int *flagp ); static int regnode(regex_t *preg, int op ); static int regnext(regex_t *preg, int p ); static void regc(regex_t *preg, int b ); static int reginsert(regex_t *preg, int op, int size, int opnd ); static void regtail(regex_t *preg, int p, int val); static void regoptail(regex_t *preg, int p, int val ); static int regopsize(regex_t *preg, int p ); static int reg_range_find(const int *string, int c); static const char *str_find(const char *string, int c, int nocase); static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase); /*#define DEBUG*/ #ifdef DEBUG static int regnarrate = 0; static void regdump(regex_t *preg); static const char *regprop( int op ); #endif /** * Returns the length of the null-terminated integer sequence. */ static int str_int_len(const int *seq) { int n = 0; while (*seq++) { n++; } return n; } /* - regcomp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a * place to put the code. So we cheat: we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the * thing really will compile successfully, and we never have to move the * code and thus invalidate pointers into it. (Note that it has to be in * one piece because free() must be able to free it all.) * * Beware that the optimization-preparation code in here knows about some * of the structure of the compiled regexp. */ int regcomp(regex_t *preg, const char *exp, int cflags) { int scan; int longest; unsigned len; int flags; #ifdef DEBUG fprintf(stderr, "Compiling: '%s'\n", exp); #endif memset(preg, 0, sizeof(*preg)); if (exp == NULL) FAIL(preg, REG_ERR_NULL_ARGUMENT); /* First pass: determine size, legality. */ preg->cflags = cflags; preg->regparse = exp; /* Allocate space. */ preg->proglen = (strlen(exp) + 1) * 5; preg->program = malloc(preg->proglen * sizeof(int)); if (preg->program == NULL) FAIL(preg, REG_ERR_NOMEM); /* Note that since we store a magic value as the first item in the program, * program offsets will never be 0 */ regc(preg, REG_MAGIC); if (reg(preg, 0, &flags) == 0) { return preg->err; } /* Small enough for pointer-storage convention? */ if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */ FAIL(preg,REG_ERR_TOO_BIG); /* Dig out information for optimizations. */ preg->regstart = 0; /* Worst-case defaults. */ preg->reganch = 0; preg->regmust = 0; preg->regmlen = 0; scan = 1; /* First BRANCH. */ if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */ scan = OPERAND(scan); /* Starting-point info. */ if (OP(preg, scan) == EXACTLY) { preg->regstart = preg->program[OPERAND(scan)]; } else if (OP(preg, scan) == BOL) preg->reganch++; /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * regmust. Resolve ties in favor of later strings, since * the regstart check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. */ if (flags&SPSTART) { longest = 0; len = 0; for (; scan != 0; scan = regnext(preg, scan)) { if (OP(preg, scan) == EXACTLY) { int plen = str_int_len(preg->program + OPERAND(scan)); if (plen >= len) { longest = OPERAND(scan); len = plen; } } } preg->regmust = longest; preg->regmlen = len; } } #ifdef DEBUG regdump(preg); #endif return 0; } /* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ) { int ret; int br; int ender; int parno = 0; int flags; *flagp = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { if (preg->regparse[0] == '?' && preg->regparse[1] == ':') { /* non-capturing paren */ preg->regparse += 2; parno = -1; } else { parno = ++preg->re_nsub; } ret = regnode(preg, OPEN+parno); } else ret = 0; /* Pick up the branches, linking them together. */ br = regbranch(preg, &flags); if (br == 0) return 0; if (ret != 0) regtail(preg, ret, br); /* OPEN -> first. */ else ret = br; if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; while (*preg->regparse == '|') { preg->regparse++; br = regbranch(preg, &flags); if (br == 0) return 0; regtail(preg, ret, br); /* BRANCH -> BRANCH. */ if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; } /* Make a closing node, and hook it on the end. */ ender = regnode(preg, (paren) ? CLOSE+parno : END); regtail(preg, ret, ender); /* Hook the tails of the branches to the closing node. */ for (br = ret; br != 0; br = regnext(preg, br)) regoptail(preg, br, ender); /* Check for proper termination. */ if (paren && *preg->regparse++ != ')') { preg->err = REG_ERR_UNMATCHED_PAREN; return 0; } else if (!paren && *preg->regparse != '\0') { if (*preg->regparse == ')') { preg->err = REG_ERR_UNMATCHED_PAREN; return 0; } else { preg->err = REG_ERR_JUNK_ON_END; return 0; } } return(ret); } /* - regbranch - one alternative of an | operator * * Implements the concatenation operator. */ static int regbranch(regex_t *preg, int *flagp ) { int ret; int chain; int latest; int flags; *flagp = WORST; /* Tentatively. */ ret = regnode(preg, BRANCH); chain = 0; while (*preg->regparse != '\0' && *preg->regparse != ')' && *preg->regparse != '|') { latest = regpiece(preg, &flags); if (latest == 0) return 0; *flagp |= flags&HASWIDTH; if (chain == 0) {/* First piece. */ *flagp |= flags&SPSTART; } else { regtail(preg, chain, latest); } chain = latest; } if (chain == 0) /* Loop ran zero times. */ (void) regnode(preg, NOTHING); return(ret); } /* - regpiece - something followed by possible [*+?] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ static int regpiece(regex_t *preg, int *flagp) { int ret; char op; int next; int flags; int min; int max; ret = regatom(preg, &flags); if (ret == 0) return 0; op = *preg->regparse; if (!ISMULT(op)) { *flagp = flags; return(ret); } if (!(flags&HASWIDTH) && op != '?') { preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY; return 0; } /* Handle braces (counted repetition) by expansion */ if (op == '{') { char *end; min = strtoul(preg->regparse + 1, &end, 10); if (end == preg->regparse + 1) { preg->err = REG_ERR_BAD_COUNT; return 0; } if (*end == '}') { max = min; } else if (*end == '\0') { preg->err = REG_ERR_UNMATCHED_BRACES; return 0; } else { preg->regparse = end; max = strtoul(preg->regparse + 1, &end, 10); if (*end != '}') { preg->err = REG_ERR_UNMATCHED_BRACES; return 0; } } if (end == preg->regparse + 1) { max = MAX_REP_COUNT; } else if (max < min || max >= 100) { preg->err = REG_ERR_BAD_COUNT; return 0; } if (min >= 100) { preg->err = REG_ERR_BAD_COUNT; return 0; } preg->regparse = strchr(preg->regparse, '}'); } else { min = (op == '+'); max = (op == '?' ? 1 : MAX_REP_COUNT); } if (preg->regparse[1] == '?') { preg->regparse++; next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret); } else { next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret); } preg->program[ret + 2] = max; preg->program[ret + 3] = min; preg->program[ret + 4] = 0; *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART); if (!(flags & SIMPLE)) { int back = regnode(preg, BACK); regtail(preg, back, ret); regtail(preg, next, back); } preg->regparse++; if (ISMULT(*preg->regparse)) { preg->err = REG_ERR_NESTED_COUNT; return 0; } return ret; } /** * Add all characters in the inclusive range between lower and upper. * * Handles a swapped range (upper < lower). */ static void reg_addrange(regex_t *preg, int lower, int upper) { if (lower > upper) { reg_addrange(preg, upper, lower); } /* Add a range as length, start */ regc(preg, upper - lower + 1); regc(preg, lower); } /** * Add a null-terminated literal string as a set of ranges. */ static void reg_addrange_str(regex_t *preg, const char *str) { while (*str) { reg_addrange(preg, *str, *str); str++; } } /** * Extracts the next unicode char from utf8. * * If 'upper' is set, converts the char to uppercase. */ static int reg_utf8_tounicode_case(const char *s, int *uc, int upper) { int l = utf8_tounicode(s, uc); if (upper) { *uc = utf8_upper(*uc); } return l; } /** * Converts a hex digit to decimal. * * Returns -1 for an invalid hex digit. */ static int hexdigitval(int c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; if (c >= 'A' && c <= 'F') return c - 'A' + 10; return -1; } /** * Parses up to 'n' hex digits at 's' and stores the result in *uc. * * Returns the number of hex digits parsed. * If there are no hex digits, returns 0 and stores nothing. */ static int parse_hex(const char *s, int n, int *uc) { int val = 0; int k; for (k = 0; k < n; k++) { int c = hexdigitval(*s++); if (c == -1) { break; } val = (val << 4) | c; } if (k) { *uc = val; } return k; } /** * Call for chars after a backlash to decode the escape sequence. * * Stores the result in *ch. * * Returns the number of bytes consumed. */ static int reg_decode_escape(const char *s, int *ch) { int n; const char *s0 = s; *ch = *s++; switch (*ch) { case 'b': *ch = '\b'; break; case 'e': *ch = 27; break; case 'f': *ch = '\f'; break; case 'n': *ch = '\n'; break; case 'r': *ch = '\r'; break; case 't': *ch = '\t'; break; case 'v': *ch = '\v'; break; case 'u': if (*s == '{') { /* Expect \u{NNNN} */ n = parse_hex(s + 1, 6, ch); if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) { s += n + 2; } else { /* Invalid, so just treat as an escaped 'u' */ *ch = 'u'; } } else if ((n = parse_hex(s, 4, ch)) > 0) { s += n; } break; case 'U': if ((n = parse_hex(s, 8, ch)) > 0) { s += n; } break; case 'x': if ((n = parse_hex(s, 2, ch)) > 0) { s += n; } break; case '\0': s--; *ch = '\\'; break; } return s - s0; } /* - regatom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. */ static int regatom(regex_t *preg, int *flagp) { int ret; int flags; int nocase = (preg->cflags & REG_ICASE); int ch; int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase); *flagp = WORST; /* Tentatively. */ preg->regparse += n; switch (ch) { /* FIXME: these chars only have meaning at beg/end of pat? */ case '^': ret = regnode(preg, BOL); break; case '$': ret = regnode(preg, EOL); break; case '.': ret = regnode(preg, ANY); *flagp |= HASWIDTH|SIMPLE; break; case '[': { const char *pattern = preg->regparse; if (*pattern == '^') { /* Complement of range. */ ret = regnode(preg, ANYBUT); pattern++; } else ret = regnode(preg, ANYOF); /* Special case. If the first char is ']' or '-', it is part of the set */ if (*pattern == ']' || *pattern == '-') { reg_addrange(preg, *pattern, *pattern); pattern++; } - while (*pattern && *pattern != ']') { + while (*pattern != ']') { /* Is this a range? a-z */ int start; int end; enum { CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER, CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT, CC_NUM }; int cc; + if (!*pattern) { + preg->err = REG_ERR_UNMATCHED_BRACKET; + return 0; + } + pattern += reg_utf8_tounicode_case(pattern, &start, nocase); if (start == '\\') { /* First check for class shorthand escapes */ switch (*pattern) { case 's': pattern++; cc = CC_SPACE; goto cc_switch; case 'd': pattern++; cc = CC_DIGIT; goto cc_switch; case 'w': pattern++; reg_addrange(preg, '_', '_'); cc = CC_ALNUM; goto cc_switch; } pattern += reg_decode_escape(pattern, &start); if (start == 0) { preg->err = REG_ERR_NULL_CHAR; return 0; } + if (start == '\\' && *pattern == 0) { + preg->err = REG_ERR_INVALID_ESCAPE; + return 0; + } } if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') { /* skip '-' */ pattern += utf8_tounicode(pattern, &end); pattern += reg_utf8_tounicode_case(pattern, &end, nocase); if (end == '\\') { pattern += reg_decode_escape(pattern, &end); if (end == 0) { preg->err = REG_ERR_NULL_CHAR; return 0; } + if (start == '\\' && *pattern == 0) { + preg->err = REG_ERR_INVALID_ESCAPE; + return 0; + } } reg_addrange(preg, start, end); continue; } if (start == '[' && pattern[0] == ':') { static const char *character_class[] = { ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:", ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:", }; for (cc = 0; cc < CC_NUM; cc++) { n = strlen(character_class[cc]); if (strncmp(pattern, character_class[cc], n) == 0) { /* Found a character class */ pattern += n + 1; break; } } if (cc != CC_NUM) { cc_switch: switch (cc) { case CC_ALNUM: reg_addrange(preg, '0', '9'); /* Fall through */ case CC_ALPHA: if ((preg->cflags & REG_ICASE) == 0) { reg_addrange(preg, 'a', 'z'); } reg_addrange(preg, 'A', 'Z'); break; case CC_SPACE: reg_addrange_str(preg, " \t\r\n\f\v"); break; case CC_BLANK: reg_addrange_str(preg, " \t"); break; case CC_UPPER: reg_addrange(preg, 'A', 'Z'); break; case CC_LOWER: reg_addrange(preg, 'a', 'z'); break; case CC_XDIGIT: reg_addrange(preg, 'a', 'f'); reg_addrange(preg, 'A', 'F'); /* Fall through */ case CC_DIGIT: reg_addrange(preg, '0', '9'); break; case CC_CNTRL: reg_addrange(preg, 0, 31); reg_addrange(preg, 127, 127); break; case CC_PRINT: reg_addrange(preg, ' ', '~'); break; case CC_GRAPH: reg_addrange(preg, '!', '~'); break; case CC_PUNCT: reg_addrange(preg, '!', '/'); reg_addrange(preg, ':', '@'); reg_addrange(preg, '[', '`'); reg_addrange(preg, '{', '~'); break; } continue; } } /* Not a range, so just add the char */ reg_addrange(preg, start, start); } regc(preg, '\0'); if (*pattern) { pattern++; } preg->regparse = pattern; *flagp |= HASWIDTH|SIMPLE; } break; case '(': ret = reg(preg, 1, &flags); if (ret == 0) return 0; *flagp |= flags&(HASWIDTH|SPSTART); break; case '\0': case '|': case ')': preg->err = REG_ERR_INTERNAL; return 0; /* Supposed to be caught earlier. */ case '?': case '+': case '*': case '{': preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING; return 0; case '\\': ch = *preg->regparse++; switch (ch) { case '\0': - preg->err = REG_ERR_TRAILING_BACKSLASH; + preg->err = REG_ERR_INVALID_ESCAPE; return 0; case 'A': ret = regnode(preg, BOLX); break; case 'Z': ret = regnode(preg, EOLX); break; case '<': case 'm': ret = regnode(preg, WORDA); break; case '>': case 'M': ret = regnode(preg, WORDZ); break; case 'd': case 'D': ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT); reg_addrange(preg, '0', '9'); regc(preg, '\0'); *flagp |= HASWIDTH|SIMPLE; break; case 'w': case 'W': ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT); if ((preg->cflags & REG_ICASE) == 0) { reg_addrange(preg, 'a', 'z'); } reg_addrange(preg, 'A', 'Z'); reg_addrange(preg, '0', '9'); reg_addrange(preg, '_', '_'); regc(preg, '\0'); *flagp |= HASWIDTH|SIMPLE; break; case 's': case 'S': ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT); reg_addrange_str(preg," \t\r\n\f\v"); regc(preg, '\0'); *flagp |= HASWIDTH|SIMPLE; break; /* FIXME: Someday handle \1, \2, ... */ default: /* Handle general quoted chars in exact-match routine */ /* Back up to include the backslash */ preg->regparse--; goto de_fault; } break; de_fault: default: { /* * Encode a string of characters to be matched exactly. */ int added = 0; /* Back up to pick up the first char of interest */ preg->regparse -= n; ret = regnode(preg, EXACTLY); /* Note that a META operator such as ? or * consumes the * preceding char. * Thus we must be careful to look ahead by 2 and add the * last char as it's own EXACTLY if necessary */ /* Until end of string or a META char is reached */ while (*preg->regparse && strchr(META, *preg->regparse) == NULL) { n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE)); if (ch == '\\' && preg->regparse[n]) { /* Non-trailing backslash. * Is this a special escape, or a regular escape? */ if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) { /* A special escape. All done with EXACTLY */ break; } /* Decode it. Note that we add the length for the escape * sequence to the length for the backlash so we can skip * the entire sequence, or not as required. */ n += reg_decode_escape(preg->regparse + n, &ch); if (ch == 0) { preg->err = REG_ERR_NULL_CHAR; return 0; } } /* Now we have one char 'ch' of length 'n'. * Check to see if the following char is a MULT */ if (ISMULT(preg->regparse[n])) { /* Yes. But do we already have some EXACTLY chars? */ if (added) { /* Yes, so return what we have and pick up the current char next time around */ break; } /* No, so add this single char and finish */ regc(preg, ch); added++; preg->regparse += n; break; } /* No, so just add this char normally */ regc(preg, ch); added++; preg->regparse += n; } regc(preg, '\0'); *flagp |= HASWIDTH; if (added == 1) *flagp |= SIMPLE; break; } break; } return(ret); } static void reg_grow(regex_t *preg, int n) { if (preg->p + n >= preg->proglen) { preg->proglen = (preg->p + n) * 2; preg->program = realloc(preg->program, preg->proglen * sizeof(int)); } } /* - regnode - emit a node */ /* Location. */ static int regnode(regex_t *preg, int op) { reg_grow(preg, 2); /* The OP followed by a next pointer */ preg->program[preg->p++] = op; preg->program[preg->p++] = 0; /* Return the start of the node */ return preg->p - 2; } /* - regc - emit (if appropriate) a byte of code */ static void regc(regex_t *preg, int b ) { reg_grow(preg, 1); preg->program[preg->p++] = b; } /* - reginsert - insert an operator in front of already-emitted operand * * Means relocating the operand. * Returns the new location of the original operand. */ static int reginsert(regex_t *preg, int op, int size, int opnd ) { reg_grow(preg, size); /* Move everything from opnd up */ memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd)); /* Zero out the new space */ memset(preg->program + opnd, 0, sizeof(int) * size); preg->program[opnd] = op; preg->p += size; return opnd + size; } /* - regtail - set the next-pointer at the end of a node chain */ static void regtail(regex_t *preg, int p, int val) { int scan; int temp; int offset; /* Find last node. */ scan = p; for (;;) { temp = regnext(preg, scan); if (temp == 0) break; scan = temp; } if (OP(preg, scan) == BACK) offset = scan - val; else offset = val - scan; preg->program[scan + 1] = offset; } /* - regoptail - regtail on operand of first argument; nop if operandless */ static void regoptail(regex_t *preg, int p, int val ) { /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p != 0 && OP(preg, p) == BRANCH) { regtail(preg, OPERAND(p), val); } } /* * regexec and friends */ /* * Forwards. */ static int regtry(regex_t *preg, const char *string ); static int regmatch(regex_t *preg, int prog); static int regrepeat(regex_t *preg, int p, int max); /* - regexec - match a regexp against a string */ int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags) { const char *s; int scan; /* Be paranoid... */ if (preg == NULL || preg->program == NULL || string == NULL) { return REG_ERR_NULL_ARGUMENT; } /* Check validity of program. */ if (*preg->program != REG_MAGIC) { return REG_ERR_CORRUPTED; } #ifdef DEBUG fprintf(stderr, "regexec: %s\n", string); regdump(preg); #endif preg->eflags = eflags; preg->pmatch = pmatch; preg->nmatch = nmatch; preg->start = string; /* All offsets are computed from here */ /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */ for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) { int op = OP(preg, scan); if (op == END) break; if (op == REPX || op == REPXMIN) preg->program[scan + 4] = 0; } /* If there is a "must appear" string, look for it. */ if (preg->regmust != 0) { s = string; while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) { if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) { break; } s++; } if (s == NULL) /* Not present. */ return REG_NOMATCH; } /* Mark beginning of line for ^ . */ preg->regbol = string; /* Simplest case: anchored match need be tried only once (maybe per line). */ if (preg->reganch) { if (eflags & REG_NOTBOL) { /* This is an anchored search, but not an BOL, so possibly skip to the next line */ goto nextline; } while (1) { if (regtry(preg, string)) { return REG_NOERROR; } if (*string) { nextline: if (preg->cflags & REG_NEWLINE) { /* Try the next anchor? */ string = strchr(string, '\n'); if (string) { preg->regbol = ++string; continue; } } } return REG_NOMATCH; } } /* Messy cases: unanchored match. */ s = string; if (preg->regstart != '\0') { /* We know what char it must start with. */ while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) { if (regtry(preg, s)) return REG_NOERROR; s++; } } else /* We don't -- general case. */ while (1) { if (regtry(preg, s)) return REG_NOERROR; if (*s == '\0') { break; } else { int c; s += utf8_tounicode(s, &c); } } /* Failure. */ return REG_NOMATCH; } /* - regtry - try match at specific point */ /* 0 failure, 1 success */ static int regtry( regex_t *preg, const char *string ) { int i; preg->reginput = string; for (i = 0; i < preg->nmatch; i++) { if (preg->pmatch) { preg->pmatch[i].rm_so = -1; preg->pmatch[i].rm_eo = -1; } } if (regmatch(preg, 1)) { if (preg->pmatch) { preg->pmatch[0].rm_so = string - preg->start; preg->pmatch[0].rm_eo = preg->reginput - preg->start; } return(1); } else return(0); } /** * Returns bytes matched if 'pattern' is a prefix of 'string'. * * If 'nocase' is non-zero, does a case-insensitive match. * * Returns -1 on not found. */ static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase) { const char *s = string; while (proglen && *s) { int ch; int n = reg_utf8_tounicode_case(s, &ch, nocase); if (ch != *prog) { return -1; } prog++; s += n; proglen--; } if (proglen == 0) { return s - string; } return -1; } /** * Searchs for 'c' in the range 'range'. * * Returns 1 if found, or 0 if not. */ static int reg_range_find(const int *range, int c) { while (*range) { /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/ if (c >= range[1] && c <= (range[0] + range[1] - 1)) { return 1; } range += 2; } return 0; } /** * Search for the character 'c' in the utf-8 string 'string'. * * If 'nocase' is set, the 'string' is assumed to be uppercase * and 'c' is converted to uppercase before matching. * * Returns the byte position in the string where the 'c' was found, or * NULL if not found. */ static const char *str_find(const char *string, int c, int nocase) { if (nocase) { /* The "string" should already be converted to uppercase */ c = utf8_upper(c); } while (*string) { int ch; int n = reg_utf8_tounicode_case(string, &ch, nocase); if (c == ch) { return string; } string += n; } return NULL; } /** * Returns true if 'ch' is an end-of-line char. * * In REG_NEWLINE mode, \n is considered EOL in * addition to \0 */ static int reg_iseol(regex_t *preg, int ch) { if (preg->cflags & REG_NEWLINE) { return ch == '\0' || ch == '\n'; } else { return ch == '\0'; } } static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin) { int nextch = '\0'; const char *save; int no; int c; int max = preg->program[scan + 2]; int min = preg->program[scan + 3]; int next = regnext(preg, scan); /* * Lookahead to avoid useless match attempts * when we know what character comes next. */ if (OP(preg, next) == EXACTLY) { nextch = preg->program[OPERAND(next)]; } save = preg->reginput; no = regrepeat(preg, scan + 5, max); if (no < min) { return 0; } if (matchmin) { /* from min up to no */ max = no; no = min; } /* else from no down to min */ while (1) { if (matchmin) { if (no > max) { break; } } else { if (no < min) { break; } } preg->reginput = save + utf8_index(save, no); reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE)); /* If it could work, try it. */ if (reg_iseol(preg, nextch) || c == nextch) { if (regmatch(preg, next)) { return(1); } } if (matchmin) { /* Couldn't or didn't, add one more */ no++; } else { /* Couldn't or didn't -- back up. */ no--; } } return(0); } static int regmatchrepeat(regex_t *preg, int scan, int matchmin) { int *scanpt = preg->program + scan; int max = scanpt[2]; int min = scanpt[3]; /* Have we reached min? */ if (scanpt[4] < min) { /* No, so get another one */ scanpt[4]++; if (regmatch(preg, scan + 5)) { return 1; } scanpt[4]--; return 0; } if (scanpt[4] > max) { return 0; } if (matchmin) { /* minimal, so try other branch first */ if (regmatch(preg, regnext(preg, scan))) { return 1; } /* No, so try one more */ scanpt[4]++; if (regmatch(preg, scan + 5)) { return 1; } scanpt[4]--; return 0; } /* maximal, so try this branch again */ if (scanpt[4] < max) { scanpt[4]++; if (regmatch(preg, scan + 5)) { return 1; } scanpt[4]--; } /* At this point we are at max with no match. Try the other branch */ return regmatch(preg, regnext(preg, scan)); } /* - regmatch - main matching routine * * Conceptually the strategy is simple: check to see whether the current * node matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. */ /* 0 failure, 1 success */ static int regmatch(regex_t *preg, int prog) { int scan; /* Current node. */ int next; /* Next node. */ const char *save; scan = prog; #ifdef DEBUG if (scan != 0 && regnarrate) fprintf(stderr, "%s(\n", regprop(scan)); #endif while (scan != 0) { int n; int c; #ifdef DEBUG if (regnarrate) { fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */ } #endif next = regnext(preg, scan); n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE)); switch (OP(preg, scan)) { case BOLX: if ((preg->eflags & REG_NOTBOL)) { return(0); } /* Fall through */ case BOL: if (preg->reginput != preg->regbol) { return(0); } break; case EOLX: if (c != 0) { /* For EOLX, only match real end of line, not newline */ return 0; } break; case EOL: if (!reg_iseol(preg, c)) { return(0); } break; case WORDA: /* Must be looking at a letter, digit, or _ */ if ((!isalnum(UCHAR(c))) && c != '_') return(0); /* Prev must be BOL or nonword */ if (preg->reginput > preg->regbol && (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) return(0); break; case WORDZ: /* Can't match at BOL */ if (preg->reginput > preg->regbol) { /* Current must be EOL or nonword */ if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') { c = preg->reginput[-1]; /* Previous must be word */ if (isalnum(UCHAR(c)) || c == '_') { break; } } } /* No */ return(0); case ANY: if (reg_iseol(preg, c)) return 0; preg->reginput += n; break; case EXACTLY: { int opnd; int len; int slen; opnd = OPERAND(scan); len = str_int_len(preg->program + opnd); slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE); if (slen < 0) { return(0); } preg->reginput += slen; } break; case ANYOF: if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) { return(0); } preg->reginput += n; break; case ANYBUT: if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) { return(0); } preg->reginput += n; break; case NOTHING: break; case BACK: break; case BRANCH: if (OP(preg, next) != BRANCH) /* No choice. */ next = OPERAND(scan); /* Avoid recursion. */ else { do { save = preg->reginput; if (regmatch(preg, OPERAND(scan))) { return(1); } preg->reginput = save; scan = regnext(preg, scan); } while (scan != 0 && OP(preg, scan) == BRANCH); return(0); /* NOTREACHED */ } break; case REP: case REPMIN: return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN); case REPX: case REPXMIN: return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN); case END: return 1; /* Success! */ case OPENNC: case CLOSENC: return regmatch(preg, next); default: if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) { save = preg->reginput; if (regmatch(preg, next)) { if (OP(preg, scan) < CLOSE) { int no = OP(preg, scan) - OPEN; if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_so == -1) { preg->pmatch[no].rm_so = save - preg->start; } } else { int no = OP(preg, scan) - CLOSE; if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_eo == -1) { preg->pmatch[no].rm_eo = save - preg->start; } } return(1); } /* Restore input position after failure */ preg->reginput = save; return(0); } return REG_ERR_INTERNAL; } scan = next; } /* * We get here only if there's trouble -- normally "case END" is * the terminating point. */ return REG_ERR_INTERNAL; } /* - regrepeat - repeatedly match something simple, report how many */ static int regrepeat(regex_t *preg, int p, int max) { int count = 0; const char *scan; int opnd; int ch; int n; scan = preg->reginput; opnd = OPERAND(p); switch (OP(preg, p)) { case ANY: while (!reg_iseol(preg, *scan) && count < max) { count++; scan += utf8_charlen(*scan); } break; case EXACTLY: while (count < max) { n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); if (preg->program[opnd] != ch) { break; } count++; scan += n; } break; case ANYOF: while (count < max) { n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) { break; } count++; scan += n; } break; case ANYBUT: while (count < max) { n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) { break; } count++; scan += n; } break; default: /* Oh dear. Called inappropriately. */ preg->err = REG_ERR_INTERNAL; count = 0; /* Best compromise. */ break; } preg->reginput = scan; return(count); } /* - regnext - dig the "next" pointer out of a node */ static int regnext(regex_t *preg, int p ) { int offset; offset = NEXT(preg, p); if (offset == 0) return 0; if (OP(preg, p) == BACK) return(p-offset); else return(p+offset); } /* - regopsize - returns the size of opcode + operands at 'p' in words */ static int regopsize(regex_t *preg, int p ) { /* Almost all opcodes are 2 words, but some are more */ switch (OP(preg, p)) { case REP: case REPMIN: case REPX: case REPXMIN: return 5; case ANYOF: case ANYBUT: case EXACTLY: { int s = p + 2; while (preg->program[s++]) { } return s - p; } } return 2; } #if defined(DEBUG) && !defined(JIM_BOOTSTRAP) /* - regdump - dump a regexp onto stdout in vaguely comprehensible form */ static void regdump(regex_t *preg) { int s; int op = EXACTLY; /* Arbitrary non-END op. */ int next; char buf[MAX_UTF8_LEN + 1]; int i; for (i = 1; i < preg->p; i++) { printf("%02x ", (unsigned char)preg->program[i]); if (i % 16 == 0) { printf("\n"); } } printf("\n"); s = 1; while (op != END && s < preg->p) { /* While that wasn't END last time... */ op = OP(preg, s); printf("%3d: %s", s, regprop(op)); /* Where, what. */ next = regnext(preg, s); if (next == 0) /* Next ptr. */ printf("(0)"); else printf("(%d)", next); s += 2; if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) { int max = preg->program[s]; int min = preg->program[s + 1]; if (max == 65535) { printf("{%d,*}", min); } else { printf("{%d,%d}", min, max); } printf(" %d", preg->program[s + 2]); s += 3; } else if (op == ANYOF || op == ANYBUT) { /* set of ranges */ while (preg->program[s]) { int len = preg->program[s++]; int first = preg->program[s++]; buf[utf8_getchars(buf, first)] = 0; printf("%s", buf); if (len > 1) { buf[utf8_getchars(buf, first + len - 1)] = 0; printf("-%s", buf); } } s++; } else if (op == EXACTLY) { /* Literal string, where present. */ while (preg->program[s]) { buf[utf8_getchars(buf, preg->program[s])] = 0; printf("%s", buf); s++; } s++; } putchar('\n'); } if (op == END) { /* Header fields of interest. */ if (preg->regstart) { buf[utf8_getchars(buf, preg->regstart)] = 0; printf("start '%s' ", buf); } if (preg->reganch) printf("anchored "); if (preg->regmust != 0) { int i; printf("must have:"); for (i = 0; i < preg->regmlen; i++) { putchar(preg->program[preg->regmust + i]); } putchar('\n'); } } printf("\n"); } /* - regprop - printable representation of opcode */ static const char *regprop( int op ) { static char buf[50]; switch (op) { case BOL: return "BOL"; case EOL: return "EOL"; case BOLX: return "BOLX"; case EOLX: return "EOLX"; case ANY: return "ANY"; case ANYOF: return "ANYOF"; case ANYBUT: return "ANYBUT"; case BRANCH: return "BRANCH"; case EXACTLY: return "EXACTLY"; case NOTHING: return "NOTHING"; case BACK: return "BACK"; case END: return "END"; case REP: return "REP"; case REPMIN: return "REPMIN"; case REPX: return "REPX"; case REPXMIN: return "REPXMIN"; case WORDA: return "WORDA"; case WORDZ: return "WORDZ"; case OPENNC: return "OPEN"; case CLOSENC: return "CLOSE"; default: if (op >= OPEN && op < CLOSE) { snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN); } else if (op >= CLOSE && op < CLOSE_END) { snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE); } else { snprintf(buf, sizeof(buf), "?%d?\n", op); } return(buf); } } #endif /* JIM_BOOTSTRAP */ size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) { static const char *error_strings[] = { "success", "no match", "bad pattern", "null argument", "unknown error", "too big", "out of memory", "too many ()", "parentheses () not balanced", "braces {} not balanced", "invalid repetition count(s)", "extra characters", "*+ of empty atom", "nested count", "internal error", "count follows nothing", - "trailing backslash", + "invalid escape \\ sequence", "corrupted program", "contains null char", + "brackets [] not balanced", }; const char *err; (void)preg; if (errcode < 0 || errcode >= REG_ERR_NUM) { err = "Bad error code"; } else { err = error_strings[errcode]; } return snprintf(errbuf, errbuf_size, "%s", err); } void regfree(regex_t *preg) { free(preg->program); } #endif diff --git a/regexp/jimregexp.h b/regexp/jimregexp.h index 581b7104c..ab734797b 100644 --- a/regexp/jimregexp.h +++ b/regexp/jimregexp.h @@ -1,109 +1,110 @@ #ifndef JIMREGEXP_H #define JIMREGEXP_H /** regexp(3)-compatible regular expression implementation for Jim. * * See jimregexp.c for details */ #ifdef __cplusplus extern "C" { #endif #include typedef struct { int rm_so; int rm_eo; } regmatch_t; /* * The "internal use only" fields in regexp.h are present to pass info from * compile to execute that permits the execute phase to run lots faster on * simple cases. They are: * * regstart char that must begin a match; '\0' if none obvious * reganch is the match anchored (at beginning-of-line only)? * regmust string (pointer into program) that match must include, or NULL * regmlen length of regmust string * * Regstart and reganch permit very fast decisions on suitable starting points * for a match, cutting down the work a lot. Regmust permits fast rejection * of lines that cannot possibly match. The regmust tests are costly enough * that regcomp() supplies a regmust only if the r.e. contains something * potentially expensive (at present, the only such thing detected is * or + * at the start of the r.e., which can involve a lot of backup). Regmlen is * supplied because the test in regexec() needs it and regcomp() is computing * it anyway. */ struct regexp { /* -- public -- */ int re_nsub; /* number of parenthesized subexpressions */ /* -- private -- */ int cflags; /* Flags used when compiling */ int err; /* Any error which occurred during compile */ int regstart; /* Internal use only. */ int reganch; /* Internal use only. */ int regmust; /* Internal use only. */ int regmlen; /* Internal use only. */ int *program; /* Allocated */ /* working state - compile */ const char *regparse; /* Input-scan pointer. */ int p; /* Current output pos in program */ int proglen; /* Allocated program size */ /* working state - exec */ int eflags; /* Flags used when executing */ const char *start; /* Initial string pointer. */ const char *reginput; /* Current input pointer. */ const char *regbol; /* Beginning of input, for ^ check. */ /* Input to regexec() */ regmatch_t *pmatch; /* submatches will be stored here */ int nmatch; /* size of pmatch[] */ }; typedef struct regexp regex_t; #define REG_EXTENDED 0 #define REG_NEWLINE 1 #define REG_ICASE 2 #define REG_NOTBOL 16 enum { REG_NOERROR, /* Success. */ REG_NOMATCH, /* Didn't find a match (for regexec). */ REG_BADPAT, /* >= REG_BADPAT is an error */ REG_ERR_NULL_ARGUMENT, REG_ERR_UNKNOWN, REG_ERR_TOO_BIG, REG_ERR_NOMEM, REG_ERR_TOO_MANY_PAREN, REG_ERR_UNMATCHED_PAREN, REG_ERR_UNMATCHED_BRACES, REG_ERR_BAD_COUNT, REG_ERR_JUNK_ON_END, REG_ERR_OPERAND_COULD_BE_EMPTY, REG_ERR_NESTED_COUNT, REG_ERR_INTERNAL, REG_ERR_COUNT_FOLLOWS_NOTHING, - REG_ERR_TRAILING_BACKSLASH, + REG_ERR_INVALID_ESCAPE, REG_ERR_CORRUPTED, REG_ERR_NULL_CHAR, + REG_ERR_UNMATCHED_BRACKET, REG_ERR_NUM }; int regcomp(regex_t *preg, const char *regex, int cflags); int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size); void regfree(regex_t *preg); #ifdef __cplusplus } #endif #endif