# define INCLUDE_CTYPE
# include "ed.h"
# include "regexp.h"
/*
* Regular expression matching. This is all fairly standard. Character classes
* are implemented as bitmaps for a fast lookup. A special case is LSTAR, which
* is like STAR except that only the longest possible match can be a match
* (quite a speedup in some cases).
*/
# define EOM 0x00 /* end of match */
# define EOL 0x01 /* pattern$ */
# define ANY 0x02 /* . */
# define SINGLE 0x03 /* 'a' */
# define CCLASS 0x04 /* [a-zA-Z] */
# define STAR 0x05 /* * */
# define LSTAR 0x06 /* * with longest possible match */
# define SOW 0x07 /* \< */
# define EOW 0x08 /* \> */
# define LBRAC 0x09 /* \( */
# define RBRAC 0x0a /* \) */
# define CCLSIZE (256 / 8)
# define CCL(ccl, op, c) ((ccl)[UCHAR(c) / 8] op (1 << ((c) & 7)))
# define CCL_BUF(rx, c) ((rx)->buffer + RXBUFSZ - CCLSIZE * UCHAR(c))
# define CCL_CODE(rx, ccl) (((rx)->buffer + RXBUFSZ - (ccl)) / CCLSIZE)
/*
* NAME: rxbuf->new()
* DESCRIPTION: Create a new regular expression buffer.
*/
rxbuf *rx_new()
{
rxbuf *rx;
rx = ALLOC(rxbuf, 1);
rx->valid = 0;
return rx;
}
/*
* NAME: rxbuf->del()
* DESCRIPTION: Delete a regular expression buffer.
*/
void rx_del(rx)
rxbuf *rx;
{
FREE(rx);
}
/*
* NAME: rxbuf->comp()
* DESCRIPTION: Compile a regular expression. Return 0 if succesful, or an
* errorstring otherwise.
* There are two gotos in this function. Reading the source
* code is considered harmful.
*/
char *rx_comp(rx, pattern)
rxbuf *rx;
char *pattern;
{
register char *p, *m, *prevcode, *prevpat, *cclass, *eoln;
char letter, c, dummy, braclist[NSUBEXP];
int brac, depth;
/* initialize */
rx->valid = 0;
rx->anchor = 0;
rx->firstc = '\0';
cclass = rx->buffer + RXBUFSZ;
eoln = (char *) NULL;
dummy = 0;
prevpat = &dummy;
prevcode = &dummy;
brac = depth = 0;
p = pattern;
m = rx->buffer;
/* process the regular expression */
while (*p) {
switch (*p) {
case '^':
if (*prevpat == 0) { /* is this the first pattern char? */
rx->anchor = 1;
} else {
goto single;
}
break;
case '.':
prevcode = prevpat = m;
*m++ = ANY;
break;
case '*':
if (*prevcode == RBRAC) {
return "Regular expression contains \\)*";
}
if (*prevcode == EOW) {
/* not really an error, but too troublesome */
return "Regular expression contains \\>*";
}
if (*prevpat == 0) {
return "* must follow pattern";
}
switch (*prevcode) {
case STAR:
case LSTAR:
/* ignore */
break;
case ANY:
--m;
prevcode = prevpat = m;
*m++ = STAR;
*m++ = ANY;
break;
case SINGLE:
case CCLASS:
letter = *--m;
c = *--m;
prevcode = prevpat = m;
*m++ = STAR;
*m++ = c;
*m++ = letter;
break;
}
break;
case '[':
cclass -= CCLSIZE;
memset(cclass, '\0', CCLSIZE);
letter = *++p;
if (letter == '^') { /* remember this for later */
p++;
}
if (*p == ']') {
return "Empty character class";
}
while (*p != ']') {
if (*p == '\\') {
p++;
}
if (*p == '\0') {
return "Unmatched [";
}
CCL(cclass, |=, *p);
if (p[1] == '-' && p[2] != ']') { /* subrange */
c = *p;
p += 2;
if (*p == '\0') {
return "Unmatched [";
}
if (UCHAR(c) > UCHAR(*p)) {
return "Invalid character class";
}
while (UCHAR(c) <= UCHAR(*p)) {
/* this could be done more efficiently. */
CCL(cclass, |=, c);
c++;
}
}
p++;
}
if (letter == '^') {
register int i;
/* invert the whole cclass */
i = CCLSIZE;
do {
cclass[i] ^= 0xff;
} while (--i > 0);
*cclass &= ~1; /* never matches 0 */
}
if (*prevpat == STAR) {
/*
* if the starred pattern is followed by something that could
* not possibly match the starred pattern, the STAR can be
* changed into an LSTAR.
*/
if (prevpat[1] == SINGLE) {
if (CCL(cclass, &, prevpat[2]) == 0) {
*prevpat = LSTAR;
}
} else if (prevpat[1] == CCLASS) {
register char *ccl2;
register int i;
i = CCLSIZE;
ccl2 = CCL_BUF(rx, i);
do {
if (cclass[i] & ccl2[i]) break;
} while (--i > 0);
if (i == 0) {
*prevpat = LSTAR;
}
}
}
prevcode = prevpat = m;
*m++ = CCLASS;
*m++ = CCL_CODE(rx, cclass);
break;
case '\\':
switch (*++p) {
case '<':
prevcode = m;
*m++ = SOW;
break;
case '>':
if (*prevpat == 0) {
return "\\> must follow pattern";
}
prevcode = m;
*m++ = EOW;
break;
case '(':
if (brac == NSUBEXP) {
return "Too many \\( \\) pairs";
}
prevcode = m;
*m++ = LBRAC;
*m++ = braclist[depth++] = brac++;
break;
case ')':
if (depth == 0) {
return "Unmatched \\)";
}
prevcode = m;
*m++ = RBRAC;
*m++ = braclist[--depth];
break;
case '\0':
return "Premature end of pattern";
default:
goto single;
}
break;
case '$':
eoln = m;
/* fall through, if it really is a EOL it will be changed later. */
default:
single:
if (*prevpat == STAR) {
/* find out if the STAR could be made LSTAR */
if (prevpat[1] == SINGLE) {
if (prevpat[2] != *p) {
*prevpat = LSTAR;
}
} else if (prevpat[1] == CCLASS) {
if (CCL(CCL_BUF(rx, prevpat[2]), &, *p) == 0) {
*prevpat = LSTAR;
}
}
}
prevcode = prevpat = m;
*m++ = SINGLE;
*m++ = *p;
break;
}
if (m >= cclass - 2) {
return "Regular expression too complex";
}
p++;
}
/* the pattern has been compiled correctly, make some final checks */
if (depth > 0) {
return "Unmatched \\(";
}
rx->start = (char *) NULL;
while (brac < NSUBEXP) {
rx->se[brac++].start = (char *) NULL; /* unused */
}
if (*prevpat == SINGLE && prevpat == eoln) {
*prevpat++ = EOL;
*prevpat = EOL; /* won't hurt */
}
*m = EOM;
if (rx->buffer[0] == SINGLE) {
rx->firstc = rx->buffer[1]; /* first char for quick search */
}
rx->valid = 1; /* buffer contains valid NFA */
return (char *) NULL;
}
/*
* NAME: match()
* DESCRIPTION: match the text (t) against the pattern (m). Return 1 if
* success.
*/
static bool match(rx, text, ic, m, t)
rxbuf *rx;
char *text;
bool ic;
register char *m, *t;
{
register char *p, *cclass, code, c;
for (;;) {
switch (code = *m++) {
case EOM:
/* found a match */
rx->start = t;
return TRUE;
case EOL:
if (*t != '\0') {
return FALSE;
}
/* cannot return at this point as a \) might still follow. */
continue;
case ANY:
if (*t == '\0') { /* any but '\0' */
return FALSE;
}
break;
case SINGLE:
/* match single character */
if (*t != *m && (!ic || tolower(*t) != tolower(*m))) {
return FALSE;
}
m++;
break;
case CCLASS:
/* match character class */
cclass = CCL_BUF(rx, *m++);
if (CCL(cclass, &, *t) == 0) {
if (ic) {
c = tolower(*t);
if (CCL(cclass, &, c)) {
break;
}
}
return FALSE;
}
break;
case STAR:
case LSTAR:
p = t;
/* match as much characters as possible */
switch (*m++) {
case ANY:
while (*p != '\0') {
p++;
}
break;
case SINGLE:
while (*p == *m || (ic && tolower(*p) == tolower(*m))) {
p++;
}
m++;
break;
case CCLASS:
cclass = CCL_BUF(rx, *m++);
if (!ic) {
while (CCL(cclass, &, *p)) {
p++;
}
} else {
c = tolower(*p);
while (CCL(cclass, &, c)) {
p++;
c = tolower(*p);
}
}
break;
}
if (code == LSTAR) {
/* only the maximum match is a match */
t = p;
} else {
/* try all possible lengths of the starred pattern */
while (p > t) {
if (match(rx, text, ic, m, p)) {
return TRUE;
}
--p;
}
}
continue;
case SOW:
/* start of word */
if ((t != text && (isalnum(t[-1]) || t[-1] == '_'))
|| (!isalpha(*t) && *t != '_')) {
return FALSE;
}
continue;
case EOW:
/* end of word */
if ((!isalnum(t[-1]) && t[-1] != '_')
|| (isalpha(*t) || *t == '_')) {
return FALSE;
}
continue;
case LBRAC:
/* start of subexpression */
rx->se[*m++].start = t;
continue;
case RBRAC:
/* end of subexpression */
rx->se[*m].size = t - rx->se[*m].start;
m++;
continue;
}
t++;
}
}
/*
* NAME: rxbuf->exec()
* DESCRIPTION: try to match a string, possibly indexed, possibly with no
* difference between lowercase and uppercase. Return -1 if the
* pattern is invalid, 0 if no match was found, or 1 if a match
* was found.
*/
int rx_exec(rx, text, idx, ic)
register rxbuf *rx;
register char *text;
register int idx;
int ic;
{
rx->start = (char *) NULL;
if (!rx->valid) {
return -1;
}
if (rx->anchor) {
/* the easy case */
if (idx || !match(rx, text, ic, rx->buffer, text)) {
return 0;
}
} else {
for (;;) {
if (rx->firstc != '\0') {
register char *p;
/* find the first character of the pattern in the string */
p = strchr(text + idx, rx->firstc);
if (ic) {
register char *q;
q = strchr(text + idx, toupper(rx->firstc));
if (q != (char*) NULL && (p == (char *) NULL || p > q)) {
p = q;
}
}
if (p != (char *) NULL) {
idx = p - text;
} else {
return 0;
}
}
if (match(rx, text + idx, ic, rx->buffer, text + idx)) {
break;
}
/* if no match, try the next character in the string */
if (text[idx++] == '\0') {
return 0;
}
}
}
/* a match was found, record its starting place and length */
rx->size = rx->start - text - idx;
rx->start -= rx->size;
return 1;
}