/* * Copyright (c) 1980 Regents of the University of California. * All rights reserved. The Berkeley software License Agreement * specifies the terms and conditions for redistribution. */ #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)regex.c 5.2 (Berkeley) 3/9/86"; #endif LIBC_SCCS and not lint # /* * routines to do regular expression matching * * Entry points: * * re_comp(s) * char *s; * ... returns 0 if the string s was compiled successfully, * a pointer to an error message otherwise. * If passed 0 or a null string returns without changing * the currently compiled re (see note 11 below). * * re_exec(s) * char *s; * ... returns 1 if the string s matches the last compiled regular * expression, * 0 if the string s failed to match the last compiled * regular expression, and * -1 if the compiled regular expression was invalid * (indicating an internal error). * * The strings passed to both re_comp and re_exec may have trailing or * embedded newline characters; they are terminated by nulls. * * The identity of the author of these routines is lost in antiquity; * this is essentially the same as the re code in the original V6 ed. * * The regular expressions recognized are described below. This description * is essentially the same as that for ed. * * A regular expression specifies a set of strings of characters. * A member of this set of strings is said to be matched by * the regular expression. In the following specification for * regular expressions the word `character' means any character but NUL. * * 1. Any character except a special character matches itself. * Special characters are the regular expression delimiter plus * \ [ . and sometimes ^ * $. * 2. A . matches any character. * 3. A \ followed by any character except a digit or ( ) * matches that character. * 4. A nonempty string s bracketed [s] (or [^s]) matches any * character in (or not in) s. In s, \ has no special meaning, * and ] may only appear as the first letter. A substring * a-b, with a and b in ascending ASCII order, stands for * the inclusive range of ASCII characters. * 5. A regular expression of form 1-4 followed by * matches a * sequence of 0 or more matches of the regular expression. * 6. A regular expression, x, of form 1-8, bracketed \(x\) * matches what x matches. * 7. A \ followed by a digit n matches a copy of the string that the * bracketed regular expression beginning with the nth \( matched. * 8. A regular expression of form 1-8, x, followed by a regular * expression of form 1-7, y matches a match for x followed by * a match for y, with the x match being as long as possible * while still permitting a y match. * 9. A regular expression of form 1-8 preceded by ^ (or followed * by $), is constrained to matches that begin at the left * (or end at the right) end of a line. * 10. A regular expression of form 1-9 picks out the longest among * the leftmost matches in a line. * 11. An empty regular expression stands for a copy of the last * regular expression encountered. */ /* * constants for re's */ #define CBRA 1 #define CCHR 2 #define CDOT 4 #define CCL 6 #define NCCL 8 #define CDOL 10 #define CEOF 11 #define CKET 12 #define CBACK 18 #define CSTAR 01 #define ESIZE 512 #define NBRA 9 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; static char circf; /* * compile the regular expression argument into a dfa */ char * re_comp(sp) register char *sp; { register int c; register char *ep = expbuf; int cclcnt, numbra = 0; char *lastep = 0; char bracket[NBRA]; char *bracketp = &bracket[0]; static char *retoolong = "Regular expression too long"; #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } if (sp == 0 || *sp == '\0') { if (*ep == 0) return("No previous regular expression"); return(0); } if (*sp == '^') { circf = 1; sp++; } else circf = 0; for (;;) { if (ep >= &expbuf[ESIZE]) comerr(retoolong); if ((c = *sp++) == '\0') { if (bracketp != bracket) comerr("unmatched \\("); *ep++ = CEOF; *ep++ = 0; return(0); } if (c != '*') lastep = ep; switch (c) { case '.': *ep++ = CDOT; continue; case '*': if (lastep == 0 || *lastep == CBRA || *lastep == CKET) goto defchar; *lastep |= CSTAR; continue; case '$': if (*sp != '\0') goto defchar; *ep++ = CDOL; continue; case '[': *ep++ = CCL; *ep++ = 0; cclcnt = 1; if ((c = *sp++) == '^') { c = *sp++; ep[-2] = NCCL; } do { if (c == '\0') comerr("missing ]"); if (c == '-' && ep [-1] != 0) { if ((c = *sp++) == ']') { *ep++ = '-'; cclcnt++; break; } while (ep[-1] < c) { *ep = ep[-1] + 1; ep++; cclcnt++; if (ep >= &expbuf[ESIZE]) comerr(retoolong); } } *ep++ = c; cclcnt++; if (ep >= &expbuf[ESIZE]) comerr(retoolong); } while ((c = *sp++) != ']'); lastep[1] = cclcnt; continue; case '\\': if ((c = *sp++) == '(') { if (numbra >= NBRA) comerr("too many \\(\\) pairs"); *bracketp++ = numbra; *ep++ = CBRA; *ep++ = numbra++; continue; } if (c == ')') { if (bracketp <= bracket) comerr("unmatched \\)"); *ep++ = CKET; *ep++ = *--bracketp; continue; } if (c >= '1' && c < ('1' + NBRA)) { *ep++ = CBACK; *ep++ = c - '1'; continue; } *ep++ = CCHR; *ep++ = c; continue; defchar: default: *ep++ = CCHR; *ep++ = c; } } } /* * match the argument string against the compiled re */ int re_exec(p1) register char *p1; { register char *p2 = expbuf; register int c; int rv; for (c = 0; c < NBRA; c++) { braslist[c] = 0; braelist[c] = 0; } if (circf) return((advance(p1, p2))); /* * fast check for first character */ if (*p2 == CCHR) { c = p2[1]; do { if (*p1 != c) continue; if (rv = advance(p1, p2)) return(rv); } while (*p1++); return(0); } /* * regular algorithm */ do if (rv = advance(p1, p2)) return(rv); while (*p1++); return(0); } /* * try to match the next thing in the dfa */ static int advance(lp, ep) register char *lp, *ep; { register char *curlp; int ct, i; int rv; for (;;) switch (*ep++) { case CCHR: if (*ep++ == *lp++) continue; return(0); case CDOT: if (*lp++) continue; return(0); case CDOL: if (*lp == '\0') continue; return(0); case CEOF: return(1); case CCL: if (cclass(ep, *lp++, 1)) { ep += *ep; continue; } return(0); case NCCL: if (cclass(ep, *lp++, 0)) { ep += *ep; continue; } return(0); case CBRA: braslist[*ep++] = lp; continue; case CKET: braelist[*ep++] = lp; continue; case CBACK: if (braelist[i = *ep++] == 0) return(-1); if (backref(i, lp)) { lp += braelist[i] - braslist[i]; continue; } return(0); case CBACK|CSTAR: if (braelist[i = *ep++] == 0) return(-1); curlp = lp; ct = braelist[i] - braslist[i]; while (backref(i, lp)) lp += ct; while (lp >= curlp) { if (rv = advance(lp, ep)) return(rv); lp -= ct; } continue; case CDOT|CSTAR: curlp = lp; while (*lp++) ; goto star; case CCHR|CSTAR: curlp = lp; while (*lp++ == *ep) ; ep++; goto star; case CCL|CSTAR: case NCCL|CSTAR: curlp = lp; while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) ; ep += *ep; goto star; star: do { lp--; if (rv = advance(lp, ep)) return(rv); } while (lp > curlp); return(0); default: return(-1); } } backref(i, lp) register int i; register char *lp; { register char *bp; bp = braslist[i]; while (*bp++ == *lp++) if (bp >= braelist[i]) return(1); return(0); } int cclass(set, c, af) register char *set, c; int af; { register int n; if (c == 0) return(0); n = *set++; while (--n) if (*set++ == c) return(af); return(! af); }