diff --git a/regexp/LICENSE b/regexp/LICENSE new file mode 100644 index 000000000..22568e5d2 --- /dev/null +++ b/regexp/LICENSE @@ -0,0 +1,45 @@ +Unless explicitly stated, all files within Jim repository are released +under following license: + +/* Jim - A small embeddable Tcl interpreter + * + * Copyright 2005 Salvatore Sanfilippo + * Copyright 2005 Clemens Hintze + * Copyright 2005 patthoyts - Pat Thoyts + * Copyright 2008 oharboe - Øyvind Harboe - oyvind.harboe@zylin.com + * Copyright 2008 Andrew Lunn + * Copyright 2008 Duane Ellis + * Copyright 2008 Uwe Klein + * Copyright 2008 Steve Bennett + * Copyright 2009 Nico Coesel + * Copyright 2009 Zachary T Welch zw@superlucidity.net + * Copyright 2009 David Brownell + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation + * are those of the authors and should not be interpreted as representing + * official policies, either expressed or implied, of the Jim Tcl Project. + */ diff --git a/regexp/jimregexp.c b/regexp/jimregexp.c new file mode 100644 index 000000000..3f2711b95 --- /dev/null +++ b/regexp/jimregexp.c @@ -0,0 +1,1899 @@ +/* + * vi:se ts=8: + * + * regcomp and regexec -- regsub and regerror are elsewhere + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, + *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | + *** to assist in implementing egrep. + *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, + *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching + *** as in BSD grep and ex. + *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, + *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. + *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, + *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. + *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald + *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h + *** was moved into regexp.h, and the include of regexp.h now uses "'s + *** to avoid conflicting with the system regexp.h. Const, bless its + *** soul, was removed so it can compile everywhere. The declaration + *** of strchr() was in conflict on AIX, so it was removed (as it is + *** happily defined in string.h). + *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald + *** seiwald@perforce.com, on 20 January 2000, to use function prototypes. + *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald + *** seiwald@perforce.com, on 05 November 2002, to const string literals. + * + * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett + * on 16 October 2010, to remove static state and add better Tcl ARE compatibility. + * This includes counted repetitions, UTF-8 support, character classes, + * shorthand character classes, increased number of parentheses to 100, + * backslash escape sequences. It also removes \n as an alternative to |. + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ + +#include "jimautoconf.h" + +#if defined(JIM_REGEXP) +#include +#include +#include +#include + +#include "jim.h" +#include "jimregexp.h" +#include "utf8.h" + +/* An arbitrary limit, but this seems enough. Must be less than 1000. */ +#define REG_MAX_PAREN 100 + +/* + * Structure for regexp "program". This is essentially a linear encoding + * of a nondeterministic finite-state machine (aka syntax charts or + * "railroad normal form" in parsing technology). Each node is an opcode + * plus a "next" pointer, possibly plus an operand. "Next" pointers of + * all nodes except BRANCH implement concatenation; a "next" pointer with + * a BRANCH on both ends of it is connecting two alternatives. (Here we + * have one of the subtle syntax dependencies: an individual BRANCH (as + * opposed to a collection of them) is never concatenated with anything + * because of operator precedence.) The operand of some types of node is + * a literal string; for others, it is a node leading into a sub-FSM. In + * particular, the operand of a BRANCH node is the first node of the branch. + * (NB this is *not* a tree structure: the tail of the branch connects + * to the thing following the set of BRANCHes.) The opcodes are: + */ + +/* definition number opnd? meaning */ +#define END 0 /* no End of program. */ +#define BOL 1 /* no Match "" at beginning of line. */ +#define EOL 2 /* no Match "" at end of line. */ +#define ANY 3 /* no Match any one character. */ +#define ANYOF 4 /* str Match any character in this string. */ +#define ANYBUT 5 /* str Match any character not in this string. */ +#define BRANCH 6 /* node Match this alternative, or the next... */ +#define BACK 7 /* no Match "", "next" ptr points backward. */ +#define EXACTLY 8 /* str Match this string. */ +#define NOTHING 9 /* no Match empty string. */ +#define REP 10 /* max,min Match this (simple) thing [min,max] times. */ +#define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */ +#define REPX 12 /* max,min Match this (complex) thing [min,max] times. */ +#define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */ +#define BOLX 14 /* no Match "" at beginning of input. */ +#define EOLX 15 /* no Match "" at end of input. */ +#define WORDA 16 /* no Match "" at wordchar, where prev is nonword */ +#define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */ + +#define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */ +#define OPEN 1001 /* no Mark this point in input as start of #n. */ + /* OPEN+1 is number 1, etc. */ + +/* must not be any other opts between OPEN and CLOSE */ + +#define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */ +#define CLOSE 2001 /* no Analogous to OPEN. */ +#define CLOSE_END (CLOSE+REG_MAX_PAREN) + +/* + * The first word of the regexp internal "program" is actually this magic + * number; the start node begins in the second word. + */ +#define REG_MAGIC 0xFADED00D + +/* + * Opcode notes: + * + * BRANCH The set of branches constituting a single choice are hooked + * together with their "next" pointers, since precedence prevents + * anything being concatenated to any individual branch. The + * "next" pointer of the last BRANCH in a choice points to the + * thing following the whole choice. This is also where the + * final "next" pointer of each individual branch points; each + * branch starts with the operand node of a BRANCH node. + * + * BACK Normal "next" pointers all implicitly point forward; BACK + * exists to make loop structures possible. + * + * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented + * as either simple repeats (REP) or complex repeats (REPX). + * These opcodes include a "min" and "max" count after the opcode. + * This is followed by a fourth "current count" word that is + * only used by REPX, as it implements a recursive match. + * REPMIN and REPXMIN are identical except they implement minimal repeats. + * + * OPEN,CLOSE ...are numbered at compile time. + */ + +/* + * A node is one word of opcode followed by one word of "next" pointer. + * The "next" pointer value is a positive offset from the opcode of the node + * containing it. + * An operand, if any, simply follows the node. (Note that much of the + * code generation knows about this implicit relationship.) + */ +#define OP(preg, p) (preg->program[p]) +#define NEXT(preg, p) (preg->program[p + 1]) +#define OPERAND(p) ((p) + 2) + +/* + * See regmagic.h for one further detail of program structure. + */ + + +/* + * Utility definitions. + */ + +#define FAIL(R,M) { (R)->err = (M); return (M); } +#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{') +#define META "^$.[()|?{+*" + +/* + * Flags to be passed up and down. + */ +#define HASWIDTH 1 /* Known never to match null string. */ +#define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 4 /* Starts with * or +. */ +#define WORST 0 /* Worst case. */ + +#define MAX_REP_COUNT 1000000 + +/* + * Forward declarations for regcomp()'s friends. + */ +static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ); +static int regpiece(regex_t *preg, int *flagp ); +static int regbranch(regex_t *preg, int *flagp ); +static int regatom(regex_t *preg, int *flagp ); +static int regnode(regex_t *preg, int op ); +static int regnext(regex_t *preg, int p ); +static void regc(regex_t *preg, int b ); +static int reginsert(regex_t *preg, int op, int size, int opnd ); +static void regtail(regex_t *preg, int p, int val); +static void regoptail(regex_t *preg, int p, int val ); +static int regopsize(regex_t *preg, int p ); + +static int reg_range_find(const int *string, int c); +static const char *str_find(const char *string, int c, int nocase); +static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase); + +/*#define DEBUG*/ +#ifdef DEBUG +static int regnarrate = 0; +static void regdump(regex_t *preg); +static const char *regprop( int op ); +#endif + + +/** + * Returns the length of the null-terminated integer sequence. + */ +static int str_int_len(const int *seq) +{ + int n = 0; + while (*seq++) { + n++; + } + return n; +} + +/* + - regcomp - compile a regular expression into internal code + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. + */ +int regcomp(regex_t *preg, const char *exp, int cflags) +{ + int scan; + int longest; + unsigned len; + int flags; + +#ifdef DEBUG + fprintf(stderr, "Compiling: '%s'\n", exp); +#endif + memset(preg, 0, sizeof(*preg)); + + if (exp == NULL) + FAIL(preg, REG_ERR_NULL_ARGUMENT); + + /* First pass: determine size, legality. */ + preg->cflags = cflags; + preg->regparse = exp; + + /* Allocate space. */ + preg->proglen = (strlen(exp) + 1) * 5; + preg->program = malloc(preg->proglen * sizeof(int)); + if (preg->program == NULL) + FAIL(preg, REG_ERR_NOMEM); + + /* Note that since we store a magic value as the first item in the program, + * program offsets will never be 0 + */ + regc(preg, REG_MAGIC); + if (reg(preg, 0, &flags) == 0) { + return preg->err; + } + + /* Small enough for pointer-storage convention? */ + if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */ + FAIL(preg,REG_ERR_TOO_BIG); + + /* Dig out information for optimizations. */ + preg->regstart = 0; /* Worst-case defaults. */ + preg->reganch = 0; + preg->regmust = 0; + preg->regmlen = 0; + scan = 1; /* First BRANCH. */ + if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */ + scan = OPERAND(scan); + + /* Starting-point info. */ + if (OP(preg, scan) == EXACTLY) { + preg->regstart = preg->program[OPERAND(scan)]; + } + else if (OP(preg, scan) == BOL) + preg->reganch++; + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. + * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + */ + if (flags&SPSTART) { + longest = 0; + len = 0; + for (; scan != 0; scan = regnext(preg, scan)) { + if (OP(preg, scan) == EXACTLY) { + int plen = str_int_len(preg->program + OPERAND(scan)); + if (plen >= len) { + longest = OPERAND(scan); + len = plen; + } + } + } + preg->regmust = longest; + preg->regmlen = len; + } + } + +#ifdef DEBUG + regdump(preg); +#endif + + return 0; +} + +/* + - reg - regular expression, i.e. main body or parenthesized thing + * + * Caller must absorb opening parenthesis. + * + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. + */ +static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ) +{ + int ret; + int br; + int ender; + int parno = 0; + int flags; + + *flagp = HASWIDTH; /* Tentatively. */ + + /* Make an OPEN node, if parenthesized. */ + if (paren) { + if (preg->regparse[0] == '?' && preg->regparse[1] == ':') { + /* non-capturing paren */ + preg->regparse += 2; + parno = -1; + } + else { + parno = ++preg->re_nsub; + } + ret = regnode(preg, OPEN+parno); + } else + ret = 0; + + /* Pick up the branches, linking them together. */ + br = regbranch(preg, &flags); + if (br == 0) + return 0; + if (ret != 0) + regtail(preg, ret, br); /* OPEN -> first. */ + else + ret = br; + if (!(flags&HASWIDTH)) + *flagp &= ~HASWIDTH; + *flagp |= flags&SPSTART; + while (*preg->regparse == '|') { + preg->regparse++; + br = regbranch(preg, &flags); + if (br == 0) + return 0; + regtail(preg, ret, br); /* BRANCH -> BRANCH. */ + if (!(flags&HASWIDTH)) + *flagp &= ~HASWIDTH; + *flagp |= flags&SPSTART; + } + + /* Make a closing node, and hook it on the end. */ + ender = regnode(preg, (paren) ? CLOSE+parno : END); + regtail(preg, ret, ender); + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br != 0; br = regnext(preg, br)) + regoptail(preg, br, ender); + + /* Check for proper termination. */ + if (paren && *preg->regparse++ != ')') { + preg->err = REG_ERR_UNMATCHED_PAREN; + return 0; + } else if (!paren && *preg->regparse != '\0') { + if (*preg->regparse == ')') { + preg->err = REG_ERR_UNMATCHED_PAREN; + return 0; + } else { + preg->err = REG_ERR_JUNK_ON_END; + return 0; + } + } + + return(ret); +} + +/* + - regbranch - one alternative of an | operator + * + * Implements the concatenation operator. + */ +static int regbranch(regex_t *preg, int *flagp ) +{ + int ret; + int chain; + int latest; + int flags; + + *flagp = WORST; /* Tentatively. */ + + ret = regnode(preg, BRANCH); + chain = 0; + while (*preg->regparse != '\0' && *preg->regparse != ')' && + *preg->regparse != '|') { + latest = regpiece(preg, &flags); + if (latest == 0) + return 0; + *flagp |= flags&HASWIDTH; + if (chain == 0) {/* First piece. */ + *flagp |= flags&SPSTART; + } + else { + regtail(preg, chain, latest); + } + chain = latest; + } + if (chain == 0) /* Loop ran zero times. */ + (void) regnode(preg, NOTHING); + + return(ret); +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. + */ +static int regpiece(regex_t *preg, int *flagp) +{ + int ret; + char op; + int next; + int flags; + int min; + int max; + + ret = regatom(preg, &flags); + if (ret == 0) + return 0; + + op = *preg->regparse; + if (!ISMULT(op)) { + *flagp = flags; + return(ret); + } + + if (!(flags&HASWIDTH) && op != '?') { + preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY; + return 0; + } + + /* Handle braces (counted repetition) by expansion */ + if (op == '{') { + char *end; + + min = strtoul(preg->regparse + 1, &end, 10); + if (end == preg->regparse + 1) { + preg->err = REG_ERR_BAD_COUNT; + return 0; + } + if (*end == '}') { + max = min; + } + else if (*end == '\0') { + preg->err = REG_ERR_UNMATCHED_BRACES; + return 0; + } + else { + preg->regparse = end; + max = strtoul(preg->regparse + 1, &end, 10); + if (*end != '}') { + preg->err = REG_ERR_UNMATCHED_BRACES; + return 0; + } + } + if (end == preg->regparse + 1) { + max = MAX_REP_COUNT; + } + else if (max < min || max >= 100) { + preg->err = REG_ERR_BAD_COUNT; + return 0; + } + if (min >= 100) { + preg->err = REG_ERR_BAD_COUNT; + return 0; + } + + preg->regparse = strchr(preg->regparse, '}'); + } + else { + min = (op == '+'); + max = (op == '?' ? 1 : MAX_REP_COUNT); + } + + if (preg->regparse[1] == '?') { + preg->regparse++; + next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret); + } + else { + next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret); + } + preg->program[ret + 2] = max; + preg->program[ret + 3] = min; + preg->program[ret + 4] = 0; + + *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART); + + if (!(flags & SIMPLE)) { + int back = regnode(preg, BACK); + regtail(preg, back, ret); + regtail(preg, next, back); + } + + preg->regparse++; + if (ISMULT(*preg->regparse)) { + preg->err = REG_ERR_NESTED_COUNT; + return 0; + } + + return ret; +} + +/** + * Add all characters in the inclusive range between lower and upper. + * + * Handles a swapped range (upper < lower). + */ +static void reg_addrange(regex_t *preg, int lower, int upper) +{ + if (lower > upper) { + reg_addrange(preg, upper, lower); + } + /* Add a range as length, start */ + regc(preg, upper - lower + 1); + regc(preg, lower); +} + +/** + * Add a null-terminated literal string as a set of ranges. + */ +static void reg_addrange_str(regex_t *preg, const char *str) +{ + while (*str) { + reg_addrange(preg, *str, *str); + str++; + } +} + +/** + * Extracts the next unicode char from utf8. + * + * If 'upper' is set, converts the char to uppercase. + */ +static int reg_utf8_tounicode_case(const char *s, int *uc, int upper) +{ + int l = utf8_tounicode(s, uc); + if (upper) { + *uc = utf8_upper(*uc); + } + return l; +} + +/** + * Converts a hex digit to decimal. + * + * Returns -1 for an invalid hex digit. + */ +static int hexdigitval(int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + return -1; +} + +/** + * Parses up to 'n' hex digits at 's' and stores the result in *uc. + * + * Returns the number of hex digits parsed. + * If there are no hex digits, returns 0 and stores nothing. + */ +static int parse_hex(const char *s, int n, int *uc) +{ + int val = 0; + int k; + + for (k = 0; k < n; k++) { + int c = hexdigitval(*s++); + if (c == -1) { + break; + } + val = (val << 4) | c; + } + if (k) { + *uc = val; + } + return k; +} + +/** + * Call for chars after a backlash to decode the escape sequence. + * + * Stores the result in *ch. + * + * Returns the number of bytes consumed. + */ +static int reg_decode_escape(const char *s, int *ch) +{ + int n; + const char *s0 = s; + + *ch = *s++; + + switch (*ch) { + case 'b': *ch = '\b'; break; + case 'e': *ch = 27; break; + case 'f': *ch = '\f'; break; + case 'n': *ch = '\n'; break; + case 'r': *ch = '\r'; break; + case 't': *ch = '\t'; break; + case 'v': *ch = '\v'; break; + case 'u': + if (*s == '{') { + /* Expect \u{NNNN} */ + n = parse_hex(s + 1, 6, ch); + if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) { + s += n + 2; + } + else { + /* Invalid, so just treat as an escaped 'u' */ + *ch = 'u'; + } + } + else if ((n = parse_hex(s, 4, ch)) > 0) { + s += n; + } + break; + case 'U': + if ((n = parse_hex(s, 8, ch)) > 0) { + s += n; + } + break; + case 'x': + if ((n = parse_hex(s, 2, ch)) > 0) { + s += n; + } + break; + case '\0': + s--; + *ch = '\\'; + break; + } + return s - s0; +} + +/* + - regatom - the lowest level + * + * Optimization: gobbles an entire sequence of ordinary characters so that + * it can turn them into a single node, which is smaller to store and + * faster to run. Backslashed characters are exceptions, each becoming a + * separate node; the code is simpler that way and it's not worth fixing. + */ +static int regatom(regex_t *preg, int *flagp) +{ + int ret; + int flags; + int nocase = (preg->cflags & REG_ICASE); + + int ch; + int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase); + + *flagp = WORST; /* Tentatively. */ + + preg->regparse += n; + switch (ch) { + /* FIXME: these chars only have meaning at beg/end of pat? */ + case '^': + ret = regnode(preg, BOL); + break; + case '$': + ret = regnode(preg, EOL); + break; + case '.': + ret = regnode(preg, ANY); + *flagp |= HASWIDTH|SIMPLE; + break; + case '[': { + const char *pattern = preg->regparse; + + if (*pattern == '^') { /* Complement of range. */ + ret = regnode(preg, ANYBUT); + pattern++; + } else + ret = regnode(preg, ANYOF); + + /* Special case. If the first char is ']' or '-', it is part of the set */ + if (*pattern == ']' || *pattern == '-') { + reg_addrange(preg, *pattern, *pattern); + pattern++; + } + + while (*pattern && *pattern != ']') { + /* Is this a range? a-z */ + int start; + int end; + + enum { + CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER, + CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT, + CC_NUM + }; + int cc; + + pattern += reg_utf8_tounicode_case(pattern, &start, nocase); + if (start == '\\') { + /* First check for class shorthand escapes */ + switch (*pattern) { + case 's': + pattern++; + cc = CC_SPACE; + goto cc_switch; + case 'd': + pattern++; + cc = CC_DIGIT; + goto cc_switch; + case 'w': + pattern++; + reg_addrange(preg, '_', '_'); + cc = CC_ALNUM; + goto cc_switch; + } + pattern += reg_decode_escape(pattern, &start); + if (start == 0) { + preg->err = REG_ERR_NULL_CHAR; + return 0; + } + } + if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') { + /* skip '-' */ + pattern += utf8_tounicode(pattern, &end); + pattern += reg_utf8_tounicode_case(pattern, &end, nocase); + if (end == '\\') { + pattern += reg_decode_escape(pattern, &end); + if (end == 0) { + preg->err = REG_ERR_NULL_CHAR; + return 0; + } + } + + reg_addrange(preg, start, end); + continue; + } + if (start == '[' && pattern[0] == ':') { + static const char *character_class[] = { + ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:", + ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:", + }; + + for (cc = 0; cc < CC_NUM; cc++) { + n = strlen(character_class[cc]); + if (strncmp(pattern, character_class[cc], n) == 0) { + /* Found a character class */ + pattern += n + 1; + break; + } + } + if (cc != CC_NUM) { +cc_switch: + switch (cc) { + case CC_ALNUM: + reg_addrange(preg, '0', '9'); + /* Fall through */ + case CC_ALPHA: + if ((preg->cflags & REG_ICASE) == 0) { + reg_addrange(preg, 'a', 'z'); + } + reg_addrange(preg, 'A', 'Z'); + break; + case CC_SPACE: + reg_addrange_str(preg, " \t\r\n\f\v"); + break; + case CC_BLANK: + reg_addrange_str(preg, " \t"); + break; + case CC_UPPER: + reg_addrange(preg, 'A', 'Z'); + break; + case CC_LOWER: + reg_addrange(preg, 'a', 'z'); + break; + case CC_XDIGIT: + reg_addrange(preg, 'a', 'f'); + reg_addrange(preg, 'A', 'F'); + /* Fall through */ + case CC_DIGIT: + reg_addrange(preg, '0', '9'); + break; + case CC_CNTRL: + reg_addrange(preg, 0, 31); + reg_addrange(preg, 127, 127); + break; + case CC_PRINT: + reg_addrange(preg, ' ', '~'); + break; + case CC_GRAPH: + reg_addrange(preg, '!', '~'); + break; + case CC_PUNCT: + reg_addrange(preg, '!', '/'); + reg_addrange(preg, ':', '@'); + reg_addrange(preg, '[', '`'); + reg_addrange(preg, '{', '~'); + break; + } + continue; + } + } + /* Not a range, so just add the char */ + reg_addrange(preg, start, start); + } + regc(preg, '\0'); + + if (*pattern) { + pattern++; + } + preg->regparse = pattern; + + *flagp |= HASWIDTH|SIMPLE; + } + break; + case '(': + ret = reg(preg, 1, &flags); + if (ret == 0) + return 0; + *flagp |= flags&(HASWIDTH|SPSTART); + break; + case '\0': + case '|': + case ')': + preg->err = REG_ERR_INTERNAL; + return 0; /* Supposed to be caught earlier. */ + case '?': + case '+': + case '*': + case '{': + preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING; + return 0; + case '\\': + ch = *preg->regparse++; + switch (ch) { + case '\0': + preg->err = REG_ERR_TRAILING_BACKSLASH; + return 0; + case 'A': + ret = regnode(preg, BOLX); + break; + case 'Z': + ret = regnode(preg, EOLX); + break; + case '<': + case 'm': + ret = regnode(preg, WORDA); + break; + case '>': + case 'M': + ret = regnode(preg, WORDZ); + break; + case 'd': + case 'D': + ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT); + reg_addrange(preg, '0', '9'); + regc(preg, '\0'); + *flagp |= HASWIDTH|SIMPLE; + break; + case 'w': + case 'W': + ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT); + if ((preg->cflags & REG_ICASE) == 0) { + reg_addrange(preg, 'a', 'z'); + } + reg_addrange(preg, 'A', 'Z'); + reg_addrange(preg, '0', '9'); + reg_addrange(preg, '_', '_'); + regc(preg, '\0'); + *flagp |= HASWIDTH|SIMPLE; + break; + case 's': + case 'S': + ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT); + reg_addrange_str(preg," \t\r\n\f\v"); + regc(preg, '\0'); + *flagp |= HASWIDTH|SIMPLE; + break; + /* FIXME: Someday handle \1, \2, ... */ + default: + /* Handle general quoted chars in exact-match routine */ + /* Back up to include the backslash */ + preg->regparse--; + goto de_fault; + } + break; + de_fault: + default: { + /* + * Encode a string of characters to be matched exactly. + */ + int added = 0; + + /* Back up to pick up the first char of interest */ + preg->regparse -= n; + + ret = regnode(preg, EXACTLY); + + /* Note that a META operator such as ? or * consumes the + * preceding char. + * Thus we must be careful to look ahead by 2 and add the + * last char as it's own EXACTLY if necessary + */ + + /* Until end of string or a META char is reached */ + while (*preg->regparse && strchr(META, *preg->regparse) == NULL) { + n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE)); + if (ch == '\\' && preg->regparse[n]) { + /* Non-trailing backslash. + * Is this a special escape, or a regular escape? + */ + if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) { + /* A special escape. All done with EXACTLY */ + break; + } + /* Decode it. Note that we add the length for the escape + * sequence to the length for the backlash so we can skip + * the entire sequence, or not as required. + */ + n += reg_decode_escape(preg->regparse + n, &ch); + if (ch == 0) { + preg->err = REG_ERR_NULL_CHAR; + return 0; + } + } + + /* Now we have one char 'ch' of length 'n'. + * Check to see if the following char is a MULT + */ + + if (ISMULT(preg->regparse[n])) { + /* Yes. But do we already have some EXACTLY chars? */ + if (added) { + /* Yes, so return what we have and pick up the current char next time around */ + break; + } + /* No, so add this single char and finish */ + regc(preg, ch); + added++; + preg->regparse += n; + break; + } + + /* No, so just add this char normally */ + regc(preg, ch); + added++; + preg->regparse += n; + } + regc(preg, '\0'); + + *flagp |= HASWIDTH; + if (added == 1) + *flagp |= SIMPLE; + break; + } + break; + } + + return(ret); +} + +static void reg_grow(regex_t *preg, int n) +{ + if (preg->p + n >= preg->proglen) { + preg->proglen = (preg->p + n) * 2; + preg->program = realloc(preg->program, preg->proglen * sizeof(int)); + } +} + +/* + - regnode - emit a node + */ +/* Location. */ +static int regnode(regex_t *preg, int op) +{ + reg_grow(preg, 2); + + /* The OP followed by a next pointer */ + preg->program[preg->p++] = op; + preg->program[preg->p++] = 0; + + /* Return the start of the node */ + return preg->p - 2; +} + +/* + - regc - emit (if appropriate) a byte of code + */ +static void regc(regex_t *preg, int b ) +{ + reg_grow(preg, 1); + preg->program[preg->p++] = b; +} + +/* + - reginsert - insert an operator in front of already-emitted operand + * + * Means relocating the operand. + * Returns the new location of the original operand. + */ +static int reginsert(regex_t *preg, int op, int size, int opnd ) +{ + reg_grow(preg, size); + + /* Move everything from opnd up */ + memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd)); + /* Zero out the new space */ + memset(preg->program + opnd, 0, sizeof(int) * size); + + preg->program[opnd] = op; + + preg->p += size; + + return opnd + size; +} + +/* + - regtail - set the next-pointer at the end of a node chain + */ +static void regtail(regex_t *preg, int p, int val) +{ + int scan; + int temp; + int offset; + + /* Find last node. */ + scan = p; + for (;;) { + temp = regnext(preg, scan); + if (temp == 0) + break; + scan = temp; + } + + if (OP(preg, scan) == BACK) + offset = scan - val; + else + offset = val - scan; + + preg->program[scan + 1] = offset; +} + +/* + - regoptail - regtail on operand of first argument; nop if operandless + */ + +static void regoptail(regex_t *preg, int p, int val ) +{ + /* "Operandless" and "op != BRANCH" are synonymous in practice. */ + if (p != 0 && OP(preg, p) == BRANCH) { + regtail(preg, OPERAND(p), val); + } +} + +/* + * regexec and friends + */ + +/* + * Forwards. + */ +static int regtry(regex_t *preg, const char *string ); +static int regmatch(regex_t *preg, int prog); +static int regrepeat(regex_t *preg, int p, int max); + +/* + - regexec - match a regexp against a string + */ +int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags) +{ + const char *s; + int scan; + + /* Be paranoid... */ + if (preg == NULL || preg->program == NULL || string == NULL) { + return REG_ERR_NULL_ARGUMENT; + } + + /* Check validity of program. */ + if (*preg->program != REG_MAGIC) { + return REG_ERR_CORRUPTED; + } + +#ifdef DEBUG + fprintf(stderr, "regexec: %s\n", string); + regdump(preg); +#endif + + preg->eflags = eflags; + preg->pmatch = pmatch; + preg->nmatch = nmatch; + preg->start = string; /* All offsets are computed from here */ + + /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */ + for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) { + int op = OP(preg, scan); + if (op == END) + break; + if (op == REPX || op == REPXMIN) + preg->program[scan + 4] = 0; + } + + /* If there is a "must appear" string, look for it. */ + if (preg->regmust != 0) { + s = string; + while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) { + if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) { + break; + } + s++; + } + if (s == NULL) /* Not present. */ + return REG_NOMATCH; + } + + /* Mark beginning of line for ^ . */ + preg->regbol = string; + + /* Simplest case: anchored match need be tried only once (maybe per line). */ + if (preg->reganch) { + if (eflags & REG_NOTBOL) { + /* This is an anchored search, but not an BOL, so possibly skip to the next line */ + goto nextline; + } + while (1) { + if (regtry(preg, string)) { + return REG_NOERROR; + } + if (*string) { +nextline: + if (preg->cflags & REG_NEWLINE) { + /* Try the next anchor? */ + string = strchr(string, '\n'); + if (string) { + preg->regbol = ++string; + continue; + } + } + } + return REG_NOMATCH; + } + } + + /* Messy cases: unanchored match. */ + s = string; + if (preg->regstart != '\0') { + /* We know what char it must start with. */ + while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) { + if (regtry(preg, s)) + return REG_NOERROR; + s++; + } + } + else + /* We don't -- general case. */ + while (1) { + if (regtry(preg, s)) + return REG_NOERROR; + if (*s == '\0') { + break; + } + else { + int c; + s += utf8_tounicode(s, &c); + } + } + + /* Failure. */ + return REG_NOMATCH; +} + +/* + - regtry - try match at specific point + */ + /* 0 failure, 1 success */ +static int regtry( regex_t *preg, const char *string ) +{ + int i; + + preg->reginput = string; + + for (i = 0; i < preg->nmatch; i++) { + preg->pmatch[i].rm_so = -1; + preg->pmatch[i].rm_eo = -1; + } + if (regmatch(preg, 1)) { + preg->pmatch[0].rm_so = string - preg->start; + preg->pmatch[0].rm_eo = preg->reginput - preg->start; + return(1); + } else + return(0); +} + +/** + * Returns bytes matched if 'pattern' is a prefix of 'string'. + * + * If 'nocase' is non-zero, does a case-insensitive match. + * + * Returns -1 on not found. + */ +static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase) +{ + const char *s = string; + while (proglen && *s) { + int ch; + int n = reg_utf8_tounicode_case(s, &ch, nocase); + if (ch != *prog) { + return -1; + } + prog++; + s += n; + proglen--; + } + if (proglen == 0) { + return s - string; + } + return -1; +} + +/** + * Searchs for 'c' in the range 'range'. + * + * Returns 1 if found, or 0 if not. + */ +static int reg_range_find(const int *range, int c) +{ + while (*range) { + /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/ + if (c >= range[1] && c <= (range[0] + range[1] - 1)) { + return 1; + } + range += 2; + } + return 0; +} + +/** + * Search for the character 'c' in the utf-8 string 'string'. + * + * If 'nocase' is set, the 'string' is assumed to be uppercase + * and 'c' is converted to uppercase before matching. + * + * Returns the byte position in the string where the 'c' was found, or + * NULL if not found. + */ +static const char *str_find(const char *string, int c, int nocase) +{ + if (nocase) { + /* The "string" should already be converted to uppercase */ + c = utf8_upper(c); + } + while (*string) { + int ch; + int n = reg_utf8_tounicode_case(string, &ch, nocase); + if (c == ch) { + return string; + } + string += n; + } + return NULL; +} + +/** + * Returns true if 'ch' is an end-of-line char. + * + * In REG_NEWLINE mode, \n is considered EOL in + * addition to \0 + */ +static int reg_iseol(regex_t *preg, int ch) +{ + if (preg->cflags & REG_NEWLINE) { + return ch == '\0' || ch == '\n'; + } + else { + return ch == '\0'; + } +} + +static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin) +{ + int nextch = '\0'; + const char *save; + int no; + int c; + + int max = preg->program[scan + 2]; + int min = preg->program[scan + 3]; + int next = regnext(preg, scan); + + /* + * Lookahead to avoid useless match attempts + * when we know what character comes next. + */ + if (OP(preg, next) == EXACTLY) { + nextch = preg->program[OPERAND(next)]; + } + save = preg->reginput; + no = regrepeat(preg, scan + 5, max); + if (no < min) { + return 0; + } + if (matchmin) { + /* from min up to no */ + max = no; + no = min; + } + /* else from no down to min */ + while (1) { + if (matchmin) { + if (no > max) { + break; + } + } + else { + if (no < min) { + break; + } + } + preg->reginput = save + utf8_index(save, no); + reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE)); + /* If it could work, try it. */ + if (reg_iseol(preg, nextch) || c == nextch) { + if (regmatch(preg, next)) { + return(1); + } + } + if (matchmin) { + /* Couldn't or didn't, add one more */ + no++; + } + else { + /* Couldn't or didn't -- back up. */ + no--; + } + } + return(0); +} + +static int regmatchrepeat(regex_t *preg, int scan, int matchmin) +{ + int *scanpt = preg->program + scan; + + int max = scanpt[2]; + int min = scanpt[3]; + + /* Have we reached min? */ + if (scanpt[4] < min) { + /* No, so get another one */ + scanpt[4]++; + if (regmatch(preg, scan + 5)) { + return 1; + } + scanpt[4]--; + return 0; + } + if (scanpt[4] > max) { + return 0; + } + + if (matchmin) { + /* minimal, so try other branch first */ + if (regmatch(preg, regnext(preg, scan))) { + return 1; + } + /* No, so try one more */ + scanpt[4]++; + if (regmatch(preg, scan + 5)) { + return 1; + } + scanpt[4]--; + return 0; + } + /* maximal, so try this branch again */ + if (scanpt[4] < max) { + scanpt[4]++; + if (regmatch(preg, scan + 5)) { + return 1; + } + scanpt[4]--; + } + /* At this point we are at max with no match. Try the other branch */ + return regmatch(preg, regnext(preg, scan)); +} + +/* + - regmatch - main matching routine + * + * Conceptually the strategy is simple: check to see whether the current + * node matches, call self recursively to see whether the rest matches, + * and then act accordingly. In practice we make some effort to avoid + * recursion, in particular by going through "ordinary" nodes (that don't + * need to know whether the rest of the match failed) by a loop instead of + * by recursion. + */ +/* 0 failure, 1 success */ +static int regmatch(regex_t *preg, int prog) +{ + int scan; /* Current node. */ + int next; /* Next node. */ + const char *save; + + scan = prog; + +#ifdef DEBUG + if (scan != 0 && regnarrate) + fprintf(stderr, "%s(\n", regprop(scan)); +#endif + while (scan != 0) { + int n; + int c; +#ifdef DEBUG + if (regnarrate) { + fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */ + } +#endif + next = regnext(preg, scan); + n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE)); + + switch (OP(preg, scan)) { + case BOLX: + if ((preg->eflags & REG_NOTBOL)) { + return(0); + } + /* Fall through */ + case BOL: + if (preg->reginput != preg->regbol) { + return(0); + } + break; + case EOLX: + if (c != 0) { + /* For EOLX, only match real end of line, not newline */ + return 0; + } + break; + case EOL: + if (!reg_iseol(preg, c)) { + return(0); + } + break; + case WORDA: + /* Must be looking at a letter, digit, or _ */ + if ((!isalnum(UCHAR(c))) && c != '_') + return(0); + /* Prev must be BOL or nonword */ + if (preg->reginput > preg->regbol && + (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) + return(0); + break; + case WORDZ: + /* Can't match at BOL */ + if (preg->reginput > preg->regbol) { + /* Current must be EOL or nonword */ + if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') { + c = preg->reginput[-1]; + /* Previous must be word */ + if (isalnum(UCHAR(c)) || c == '_') { + break; + } + } + } + /* No */ + return(0); + + case ANY: + if (reg_iseol(preg, c)) + return 0; + preg->reginput += n; + break; + case EXACTLY: { + int opnd; + int len; + int slen; + + opnd = OPERAND(scan); + len = str_int_len(preg->program + opnd); + + slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE); + if (slen < 0) { + return(0); + } + preg->reginput += slen; + } + break; + case ANYOF: + if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) { + return(0); + } + preg->reginput += n; + break; + case ANYBUT: + if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) { + return(0); + } + preg->reginput += n; + break; + case NOTHING: + break; + case BACK: + break; + case BRANCH: + if (OP(preg, next) != BRANCH) /* No choice. */ + next = OPERAND(scan); /* Avoid recursion. */ + else { + do { + save = preg->reginput; + if (regmatch(preg, OPERAND(scan))) { + return(1); + } + preg->reginput = save; + scan = regnext(preg, scan); + } while (scan != 0 && OP(preg, scan) == BRANCH); + return(0); + /* NOTREACHED */ + } + break; + case REP: + case REPMIN: + return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN); + + case REPX: + case REPXMIN: + return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN); + + case END: + return 1; /* Success! */ + + case OPENNC: + case CLOSENC: + return regmatch(preg, next); + + default: + if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) { + save = preg->reginput; + if (regmatch(preg, next)) { + if (OP(preg, scan) < CLOSE) { + int no = OP(preg, scan) - OPEN; + if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) { + preg->pmatch[no].rm_so = save - preg->start; + } + } + else { + int no = OP(preg, scan) - CLOSE; + if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) { + preg->pmatch[no].rm_eo = save - preg->start; + } + } + return(1); + } + /* Restore input position after failure */ + preg->reginput = save; + return(0); + } + return REG_ERR_INTERNAL; + } + + scan = next; + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + return REG_ERR_INTERNAL; +} + +/* + - regrepeat - repeatedly match something simple, report how many + */ +static int regrepeat(regex_t *preg, int p, int max) +{ + int count = 0; + const char *scan; + int opnd; + int ch; + int n; + + scan = preg->reginput; + opnd = OPERAND(p); + switch (OP(preg, p)) { + case ANY: + while (!reg_iseol(preg, *scan) && count < max) { + count++; + scan += utf8_charlen(*scan); + } + break; + case EXACTLY: + while (count < max) { + n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); + if (preg->program[opnd] != ch) { + break; + } + count++; + scan += n; + } + break; + case ANYOF: + while (count < max) { + n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); + if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) { + break; + } + count++; + scan += n; + } + break; + case ANYBUT: + while (count < max) { + n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); + if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) { + break; + } + count++; + scan += n; + } + break; + default: /* Oh dear. Called inappropriately. */ + preg->err = REG_ERR_INTERNAL; + count = 0; /* Best compromise. */ + break; + } + preg->reginput = scan; + + return(count); +} + +/* + - regnext - dig the "next" pointer out of a node + */ +static int regnext(regex_t *preg, int p ) +{ + int offset; + + offset = NEXT(preg, p); + + if (offset == 0) + return 0; + + if (OP(preg, p) == BACK) + return(p-offset); + else + return(p+offset); +} + +/* + - regopsize - returns the size of opcode + operands at 'p' in words + */ +static int regopsize(regex_t *preg, int p ) +{ + /* Almost all opcodes are 2 words, but some are more */ + switch (OP(preg, p)) { + case REP: + case REPMIN: + case REPX: + case REPXMIN: + return 5; + + case ANYOF: + case ANYBUT: + case EXACTLY: { + int s = p + 2; + while (preg->program[s++]) { + } + return s - p; + } + } + return 2; +} + +#if defined(DEBUG) && !defined(JIM_BOOTSTRAP) + +/* + - regdump - dump a regexp onto stdout in vaguely comprehensible form + */ +static void regdump(regex_t *preg) +{ + int s; + int op = EXACTLY; /* Arbitrary non-END op. */ + int next; + char buf[MAX_UTF8_LEN + 1]; + + int i; + for (i = 1; i < preg->p; i++) { + printf("%02x ", (unsigned char)preg->program[i]); + if (i % 16 == 0) { + printf("\n"); + } + } + printf("\n"); + + s = 1; + while (op != END && s < preg->p) { /* While that wasn't END last time... */ + op = OP(preg, s); + printf("%3d: %s", s, regprop(op)); /* Where, what. */ + next = regnext(preg, s); + if (next == 0) /* Next ptr. */ + printf("(0)"); + else + printf("(%d)", next); + s += 2; + if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) { + int max = preg->program[s]; + int min = preg->program[s + 1]; + if (max == 65535) { + printf("{%d,*}", min); + } + else { + printf("{%d,%d}", min, max); + } + printf(" %d", preg->program[s + 2]); + s += 3; + } + else if (op == ANYOF || op == ANYBUT) { + /* set of ranges */ + + while (preg->program[s]) { + int len = preg->program[s++]; + int first = preg->program[s++]; + buf[utf8_getchars(buf, first)] = 0; + printf("%s", buf); + if (len > 1) { + buf[utf8_getchars(buf, first + len - 1)] = 0; + printf("-%s", buf); + } + } + s++; + } + else if (op == EXACTLY) { + /* Literal string, where present. */ + + while (preg->program[s]) { + buf[utf8_getchars(buf, preg->program[s])] = 0; + printf("%s", buf); + s++; + } + s++; + } + putchar('\n'); + } + + if (op == END) { + /* Header fields of interest. */ + if (preg->regstart) { + buf[utf8_getchars(buf, preg->regstart)] = 0; + printf("start '%s' ", buf); + } + if (preg->reganch) + printf("anchored "); + if (preg->regmust != 0) { + int i; + printf("must have:"); + for (i = 0; i < preg->regmlen; i++) { + putchar(preg->program[preg->regmust + i]); + } + putchar('\n'); + } + } + printf("\n"); +} + +/* + - regprop - printable representation of opcode + */ +static const char *regprop( int op ) +{ + static char buf[50]; + + switch (op) { + case BOL: + return "BOL"; + case EOL: + return "EOL"; + case BOLX: + return "BOLX"; + case EOLX: + return "EOLX"; + case ANY: + return "ANY"; + case ANYOF: + return "ANYOF"; + case ANYBUT: + return "ANYBUT"; + case BRANCH: + return "BRANCH"; + case EXACTLY: + return "EXACTLY"; + case NOTHING: + return "NOTHING"; + case BACK: + return "BACK"; + case END: + return "END"; + case REP: + return "REP"; + case REPMIN: + return "REPMIN"; + case REPX: + return "REPX"; + case REPXMIN: + return "REPXMIN"; + case WORDA: + return "WORDA"; + case WORDZ: + return "WORDZ"; + case OPENNC: + return "OPEN"; + case CLOSENC: + return "CLOSE"; + default: + if (op >= OPEN && op < CLOSE) { + snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN); + } + else if (op >= CLOSE && op < CLOSE_END) { + snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE); + } + else { + snprintf(buf, sizeof(buf), "?%d?\n", op); + } + return(buf); + } +} +#endif /* JIM_BOOTSTRAP */ + +size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) +{ + static const char *error_strings[] = { + "success", + "no match", + "bad pattern", + "null argument", + "unknown error", + "too big", + "out of memory", + "too many ()", + "parentheses () not balanced", + "braces {} not balanced", + "invalid repetition count(s)", + "extra characters", + "*+ of empty atom", + "nested count", + "internal error", + "count follows nothing", + "trailing backslash", + "corrupted program", + "contains null char", + }; + const char *err; + + if (errcode < 0 || errcode >= REG_ERR_NUM) { + err = "Bad error code"; + } + else { + err = error_strings[errcode]; + } + + return snprintf(errbuf, errbuf_size, "%s", err); +} + +void regfree(regex_t *preg) +{ + free(preg->program); +} + +#endif diff --git a/regexp/jimregexp.h b/regexp/jimregexp.h new file mode 100644 index 000000000..b7598d4b3 --- /dev/null +++ b/regexp/jimregexp.h @@ -0,0 +1,109 @@ +#ifndef JIMREGEXP_H +#define JIMREGEXP_H + +/** regexp(3)-compatible regular expression implementation for Jim. + * + * See jimregexp.c for details + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct { + int rm_so; + int rm_eo; +} regmatch_t; + +/* + * The "internal use only" fields in regexp.h are present to pass info from + * compile to execute that permits the execute phase to run lots faster on + * simple cases. They are: + * + * regstart char that must begin a match; '\0' if none obvious + * reganch is the match anchored (at beginning-of-line only)? + * regmust string (pointer into program) that match must include, or NULL + * regmlen length of regmust string + * + * Regstart and reganch permit very fast decisions on suitable starting points + * for a match, cutting down the work a lot. Regmust permits fast rejection + * of lines that cannot possibly match. The regmust tests are costly enough + * that regcomp() supplies a regmust only if the r.e. contains something + * potentially expensive (at present, the only such thing detected is * or + + * at the start of the r.e., which can involve a lot of backup). Regmlen is + * supplied because the test in regexec() needs it and regcomp() is computing + * it anyway. + */ + +typedef struct regexp { + /* -- public -- */ + int re_nsub; /* number of parenthesized subexpressions */ + + /* -- private -- */ + int cflags; /* Flags used when compiling */ + int err; /* Any error which occurred during compile */ + int regstart; /* Internal use only. */ + int reganch; /* Internal use only. */ + int regmust; /* Internal use only. */ + int regmlen; /* Internal use only. */ + int *program; /* Allocated */ + + /* working state - compile */ + const char *regparse; /* Input-scan pointer. */ + int p; /* Current output pos in program */ + int proglen; /* Allocated program size */ + + /* working state - exec */ + int eflags; /* Flags used when executing */ + const char *start; /* Initial string pointer. */ + const char *reginput; /* Current input pointer. */ + const char *regbol; /* Beginning of input, for ^ check. */ + + /* Input to regexec() */ + regmatch_t *pmatch; /* submatches will be stored here */ + int nmatch; /* size of pmatch[] */ +} regexp; + +typedef regexp regex_t; + +#define REG_EXTENDED 0 +#define REG_NEWLINE 1 +#define REG_ICASE 2 + +#define REG_NOTBOL 16 + +enum { + REG_NOERROR, /* Success. */ + REG_NOMATCH, /* Didn't find a match (for regexec). */ + REG_BADPAT, /* >= REG_BADPAT is an error */ + REG_ERR_NULL_ARGUMENT, + REG_ERR_UNKNOWN, + REG_ERR_TOO_BIG, + REG_ERR_NOMEM, + REG_ERR_TOO_MANY_PAREN, + REG_ERR_UNMATCHED_PAREN, + REG_ERR_UNMATCHED_BRACES, + REG_ERR_BAD_COUNT, + REG_ERR_JUNK_ON_END, + REG_ERR_OPERAND_COULD_BE_EMPTY, + REG_ERR_NESTED_COUNT, + REG_ERR_INTERNAL, + REG_ERR_COUNT_FOLLOWS_NOTHING, + REG_ERR_TRAILING_BACKSLASH, + REG_ERR_CORRUPTED, + REG_ERR_NULL_CHAR, + REG_ERR_NUM +}; + +int regcomp(regex_t *preg, const char *regex, int cflags); +int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); +size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size); +void regfree(regex_t *preg); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regexp/utf8.c b/regexp/utf8.c new file mode 100644 index 000000000..405c20d22 --- /dev/null +++ b/regexp/utf8.c @@ -0,0 +1,262 @@ +/** + * UTF-8 utility functions + * + * (c) 2010-2016 Steve Bennett + * + * See LICENCE for licence details. + */ + +#include +#include +#include +#include +#include +#include "utf8.h" + +/* This one is always implemented */ +int utf8_fromunicode(char *p, unsigned uc) +{ + if (uc <= 0x7f) { + *p = uc; + return 1; + } + else if (uc <= 0x7ff) { + *p++ = 0xc0 | ((uc & 0x7c0) >> 6); + *p = 0x80 | (uc & 0x3f); + return 2; + } + else if (uc <= 0xffff) { + *p++ = 0xe0 | ((uc & 0xf000) >> 12); + *p++ = 0x80 | ((uc & 0xfc0) >> 6); + *p = 0x80 | (uc & 0x3f); + return 3; + } + /* Note: We silently truncate to 21 bits here: 0x1fffff */ + else { + *p++ = 0xf0 | ((uc & 0x1c0000) >> 18); + *p++ = 0x80 | ((uc & 0x3f000) >> 12); + *p++ = 0x80 | ((uc & 0xfc0) >> 6); + *p = 0x80 | (uc & 0x3f); + return 4; + } +} + +#if defined(USE_UTF8) && !defined(JIM_BOOTSTRAP) +int utf8_charlen(int c) +{ + if ((c & 0x80) == 0) { + return 1; + } + if ((c & 0xe0) == 0xc0) { + return 2; + } + if ((c & 0xf0) == 0xe0) { + return 3; + } + if ((c & 0xf8) == 0xf0) { + return 4; + } + /* Invalid sequence, so treat it as a single byte */ + return 1; +} + +int utf8_strlen(const char *str, int bytelen) +{ + int charlen = 0; + if (bytelen < 0) { + bytelen = strlen(str); + } + while (bytelen > 0) { + int c; + int l = utf8_tounicode(str, &c); + charlen++; + str += l; + bytelen -= l; + } + return charlen; +} + +int utf8_strwidth(const char *str, int charlen) +{ + int width = 0; + while (charlen) { + int c; + int l = utf8_tounicode(str, &c); + width += utf8_width(c); + str += l; + charlen--; + } + return width; +} + +int utf8_index(const char *str, int index) +{ + const char *s = str; + while (index--) { + s += utf8_charlen(*s); + } + return s - str; +} + +int utf8_prev_len(const char *str, int len) +{ + int n = 1; + + assert(len > 0); + + /* Look up to len chars backward for a start-of-char byte */ + while (--len) { + if ((str[-n] & 0x80) == 0) { + /* Start of a 1-byte char */ + break; + } + if ((str[-n] & 0xc0) == 0xc0) { + /* Start of a multi-byte char */ + break; + } + n++; + } + return n; +} + +int utf8_tounicode(const char *str, int *uc) +{ + unsigned const char *s = (unsigned const char *)str; + + if (s[0] < 0xc0) { + *uc = s[0]; + return 1; + } + if (s[0] < 0xe0) { + if ((s[1] & 0xc0) == 0x80) { + *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80); + if (*uc >= 0x80) { + return 2; + } + /* Otherwise this is an invalid sequence */ + } + } + else if (s[0] < 0xf0) { + if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) { + *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80); + if (*uc >= 0x800) { + return 3; + } + /* Otherwise this is an invalid sequence */ + } + } + else if (s[0] < 0xf8) { + if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) { + *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80); + if (*uc >= 0x10000) { + return 4; + } + /* Otherwise this is an invalid sequence */ + } + } + + /* Invalid sequence, so just return the byte */ + *uc = *s; + return 1; +} + +struct casemap { + unsigned short code; /* code point */ + unsigned short altcode; /* alternate case code point */ +}; + +struct utf8range { + unsigned lower; /* lower inclusive */ + unsigned upper; /* upper exclusive */ +}; + + +/* Generated mapping tables */ +#include "_unicode_mapping.c" + +#define ARRAYSIZE(A) sizeof(A) / sizeof(*(A)) + +static int cmp_casemap(const void *key, const void *cm) +{ + return *(int *)key - (int)((const struct casemap *)cm)->code; +} + +static int utf8_map_case(const struct casemap *mapping, int num, int ch) +{ + /* We only support 16 bit case mapping */ + if (ch <= 0xffff) { + const struct casemap *cm = + bsearch(&ch, mapping, num, sizeof(*mapping), cmp_casemap); + + if (cm) { + return cm->altcode; + } + } + return ch; +} + +static int cmp_range(const void *key, const void *cm) +{ + const struct utf8range *range = (const struct utf8range *)cm; + unsigned ch = *(unsigned *)key; + if (ch < range->lower) { + return -1; + } + if (ch >= range->upper) { + return 1; + } + return 0; +} + +static int utf8_in_range(const struct utf8range *range, int num, int ch) +{ + const struct utf8range *r = + bsearch(&ch, range, num, sizeof(*range), cmp_range); + + if (r) { + return 1; + } + return 0; +} + +int utf8_upper(int ch) +{ + if (isascii(ch)) { + return toupper(ch); + } + return utf8_map_case(unicode_case_mapping_upper, ARRAYSIZE(unicode_case_mapping_upper), ch); +} + +int utf8_lower(int ch) +{ + if (isascii(ch)) { + return tolower(ch); + } + return utf8_map_case(unicode_case_mapping_lower, ARRAYSIZE(unicode_case_mapping_lower), ch); +} + +int utf8_title(int ch) +{ + if (!isascii(ch)) { + int newch = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch); + if (newch != ch) { + return newch ? newch : ch; + } + } + return utf8_upper(ch); +} + +int utf8_width(int ch) +{ + if (!isascii(ch)) { + if (utf8_in_range(unicode_range_combining, ARRAYSIZE(unicode_range_combining), ch)) { + return 0; + } + if (utf8_in_range(unicode_range_wide, ARRAYSIZE(unicode_range_wide), ch)) { + return 2; + } + } + return 1; +} + +#endif /* JIM_BOOTSTRAP */ diff --git a/regexp/utf8.h b/regexp/utf8.h new file mode 100644 index 000000000..99706839a --- /dev/null +++ b/regexp/utf8.h @@ -0,0 +1,150 @@ +#ifndef UTF8_UTIL_H +#define UTF8_UTIL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * UTF-8 utility functions + * + * (c) 2010-2016 Steve Bennett + * + * See LICENCE for licence details. + */ +#include + +/* Currently we support unicode points up to 2^22-1 */ +#define MAX_UTF8_LEN 4 + +/** + * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8 + * and stores the result at 'p'. + * + * Returns the number of utf-8 characters (up to MAX_UTF8_LEN). + */ +int utf8_fromunicode(char *p, unsigned uc); + +#ifndef JIM_UTF8 +#include + +/* No utf-8 support. 1 byte = 1 char */ +#define utf8_strlen(S, B) ((B) < 0 ? (int)strlen(S) : (B)) +#define utf8_strwidth(S, B) utf8_strlen((S), (B)) +#define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1) +#define utf8_getchars(CP, C) (*(CP) = (C), 1) +#define utf8_upper(C) toupper(C) +#define utf8_title(C) toupper(C) +#define utf8_lower(C) tolower(C) +#define utf8_index(C, I) (I) +#define utf8_charlen(C) 1 +#define utf8_prev_len(S, L) 1 +#define utf8_width(C) 1 + +#else +#if !defined(JIM_BOOTSTRAP) + +#define utf8_getchars utf8_fromunicode + +/** + * Returns the length of the utf-8 sequence starting with 'c'. + * + * Returns 1-4. + * If 'c' is not a valid start byte, returns 1. + */ +int utf8_charlen(int c); + +/** + * Returns the number of characters in the utf-8 + * string of the given byte length. + * + * Any bytes which are not part of an valid utf-8 + * sequence are treated as individual characters. + * + * The string *must* be null terminated. + * + * Does not support unicode code points > \u1fffff + */ +int utf8_strlen(const char *str, int bytelen); + +/** + * Calculates the display width of the first 'charlen' characters in 'str'. + * See utf8_width() + */ +int utf8_strwidth(const char *str, int charlen); + +/** + * Returns the byte index of the given character in the utf-8 string. + * + * The string *must* be null terminated. + * + * This will return the byte length of a utf-8 string + * if given the char length. + */ +int utf8_index(const char *str, int charindex); + +/** + * Returns the unicode codepoint corresponding to the + * utf-8 sequence 'str'. + * + * Stores the result in *uc and returns the number of bytes + * consumed. + * + * If 'str' is null terminated, then an invalid utf-8 sequence + * at the end of the string will be returned as individual bytes. + * + * If it is not null terminated, the length *must* be checked first. + * + * Does not support unicode code points > \u1fffff + */ +int utf8_tounicode(const char *str, int *uc); + +/** + * Returns the number of bytes before 'str' that the previous + * utf-8 character sequence starts (which may be the middle of a sequence). + * + * Looks back at most 'len' bytes backwards, which must be > 0. + * If no start char is found, returns -len + */ +int utf8_prev_len(const char *str, int len); + +/** + * Returns the upper-case variant of the given unicode codepoint. + * + * Unicode code points > \uffff are returned unchanged. + */ +int utf8_upper(int uc); + +/** + * Returns the title-case variant of the given unicode codepoint. + * + * If none, returns utf8_upper(). + * + * Unicode code points > \uffff are returned unchanged. + */ +int utf8_title(int uc); + +/** + * Returns the lower-case variant of the given unicode codepoint. + * + * NOTE: Use utf8_upper() in preference for case-insensitive matching. + * + * Unicode code points > \uffff are returned unchanged. + */ +int utf8_lower(int uc); + +/** + * Returns the width (in characters) of the given unicode codepoint. + * This is 1 for normal letters and 0 for combining characters and 2 for wide characters. + */ +int utf8_width(int ch); + +#endif /* JIM_BOOTSTRAP */ + +#endif + +#ifdef __cplusplus +} +#endif + +#endif