1: /* $Header: search.c,v 4.3 85/05/01 11:50:16 lwall Exp $
   2:  *
   3:  * $Log:	search.c,v $
   4:  * Revision 4.3  85/05/01  11:50:16  lwall
   5:  * Baseline for release with 4.3bsd.
   6:  *
   7:  */
   8: 
   9: /* string search routines */
  10: 
  11: /*		Copyright (c) 1981,1980 James Gosling		*/
  12: 
  13: /* Modified Aug. 12, 1981 by Tom London to include regular expressions
  14:    as in ed.  RE stuff hacked over by jag to correct a few major problems,
  15:    mainly dealing with searching within the buffer rather than copying
  16:    each line to a separate array.  Newlines can now appear in RE's */
  17: 
  18: /* Ripped to shreds and glued back together to make a search package,
  19:  * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
  20:  * Changes include:
  21:  *	Buffer, window, and mlisp stuff gone.
  22:  *	Translation tables reduced to 1 table.
  23:  *	Expression buffer is now dynamically allocated.
  24:  *	Character classes now implemented with a bitmap.
  25:  */
  26: 
  27: #include "EXTERN.h"
  28: #include "common.h"
  29: #include "util.h"
  30: #include "INTERN.h"
  31: #include "search.h"
  32: 
  33: #ifndef BITSPERBYTE
  34: #define BITSPERBYTE 8
  35: #endif
  36: 
  37: #define BMAPSIZ (127 / BITSPERBYTE + 1)
  38: 
  39: /* meta characters in the "compiled" form of a regular expression */
  40: #define CBRA    2       /* \( -- begin bracket */
  41: #define CCHR    4       /* a vanilla character */
  42: #define CDOT    6       /* . -- match anything except a newline */
  43: #define CCL 8       /* [...] -- character class */
  44: #define NCCL    10      /* [^...] -- negated character class */
  45: #define CDOL    12      /* $ -- matches the end of a line */
  46: #define CEND    14      /* The end of the pattern */
  47: #define CKET    16      /* \) -- close bracket */
  48: #define CBACK   18      /* \N -- backreference to the Nth bracketed
  49: 				   string */
  50: #define CIRC    20      /* ^ matches the beginning of a line */
  51: 
  52: #define WORD    32      /* matches word character \w */
  53: #define NWORD   34      /* matches non-word characer \W */
  54: #define WBOUND  36      /* matches word boundary \b */
  55: #define NWBOUND 38      /* matches non-(word boundary) \B */
  56: 
  57: #define STAR    01      /* * -- Kleene star, repeats the previous
  58: 				   REas many times as possible; the value
  59: 				   ORs with the other operator types */
  60: 
  61: #define ASCSIZ 0200
  62: typedef char    TRANSTABLE[ASCSIZ];
  63: 
  64: static  TRANSTABLE trans = {
  65: 0000,0001,0002,0003,0004,0005,0006,0007,
  66: 0010,0011,0012,0013,0014,0015,0016,0017,
  67: 0020,0021,0022,0023,0024,0025,0026,0027,
  68: 0030,0031,0032,0033,0034,0035,0036,0037,
  69: 0040,0041,0042,0043,0044,0045,0046,0047,
  70: 0050,0051,0052,0053,0054,0055,0056,0057,
  71: 0060,0061,0062,0063,0064,0065,0066,0067,
  72: 0070,0071,0072,0073,0074,0075,0076,0077,
  73: 0100,0101,0102,0103,0104,0105,0106,0107,
  74: 0110,0111,0112,0113,0114,0115,0116,0117,
  75: 0120,0121,0122,0123,0124,0125,0126,0127,
  76: 0130,0131,0132,0133,0134,0135,0136,0137,
  77: 0140,0141,0142,0143,0144,0145,0146,0147,
  78: 0150,0151,0152,0153,0154,0155,0156,0157,
  79: 0160,0161,0162,0163,0164,0165,0166,0167,
  80: 0170,0171,0172,0173,0174,0175,0176,0177,
  81: };
  82: static bool folding = FALSE;
  83: 
  84: static int err;
  85: static char *FirstCharacter;
  86: 
  87: void
  88: search_init()
  89: {
  90: #ifdef UNDEF
  91:     register int    i;
  92: 
  93:     for (i = 0; i < ASCSIZ; i++)
  94:     trans[i] = i;
  95: #else
  96:     ;
  97: #endif
  98: }
  99: 
 100: void
 101: init_compex(compex)
 102: register COMPEX *compex;
 103: {
 104:     /* the following must start off zeroed */
 105: 
 106:     compex->eblen = 0;
 107:     compex->brastr = Nullch;
 108: }
 109: 
 110: void
 111: free_compex(compex)
 112: register COMPEX *compex;
 113: {
 114:     if (compex->eblen) {
 115:     free(compex->expbuf);
 116:     compex->eblen = 0;
 117:     }
 118:     if (compex->brastr) {
 119:     free(compex->brastr);
 120:     compex->brastr = Nullch;
 121:     }
 122: }
 123: 
 124: static char *gbr_str = Nullch;
 125: static int gbr_siz = 0;
 126: 
 127: char *
 128: getbracket(compex,n)
 129: register COMPEX *compex;
 130: int n;
 131: {
 132:     int length = compex->braelist[n] - compex->braslist[n];
 133: 
 134:     if (!compex->nbra || n > compex->nbra || !compex->braelist[n] || length<0)
 135:     return nullstr;
 136:     growstr(&gbr_str, &gbr_siz, length+1);
 137:     safecpy(gbr_str, compex->braslist[n], length+1);
 138:     return gbr_str;
 139: }
 140: 
 141: void
 142: case_fold(which)
 143: int which;
 144: {
 145:     register int i;
 146: 
 147:     if (which != folding) {
 148:     if (which) {
 149:         for (i = 'A'; i <= 'Z'; i++)
 150:         trans[i] = tolower(i);
 151:     }
 152:     else {
 153:         for (i = 'A'; i <= 'Z'; i++)
 154:         trans[i] = i;
 155:     }
 156:     folding = which;
 157:     }
 158: }
 159: 
 160: /* Compile the given regular expression into a [secret] internal format */
 161: 
 162: char *
 163: compile (compex, strp, RE, fold)
 164: register COMPEX *compex;
 165: register char   *strp;
 166: int RE;
 167: int fold;
 168: {
 169:     register int c;
 170:     register char  *ep;
 171:     char   *lastep;
 172:     char    bracket[NBRA],
 173:        *bracketp;
 174:     char **alt = compex->alternatives;
 175:     char *retmes = "Badly formed search string";
 176: 
 177:     case_fold(compex->do_folding = fold);
 178:     if (!compex->eblen) {
 179:     compex->expbuf = safemalloc(84);
 180:     compex->eblen = 80;
 181:     }
 182:     ep = compex->expbuf;        /* point at expression buffer */
 183:     *alt++ = ep;            /* first alternative starts here */
 184:     bracketp = bracket;         /* first bracket goes here */
 185:     if (*strp == 0) {           /* nothing to compile? */
 186:     if (*ep == 0)           /* nothing there yet? */
 187:         return "Null search string";
 188:     return Nullch;          /* just keep old expression */
 189:     }
 190:     compex->nbra = 0;           /* no brackets yet */
 191:     lastep = 0;
 192:     for (;;) {
 193:     if (ep - compex->expbuf >= compex->eblen)
 194:         grow_eb(compex);
 195:     c = *strp++;            /* fetch next char of pattern */
 196:     if (c == 0) {           /* end of pattern? */
 197:         if (bracketp != bracket) {  /* balanced brackets? */
 198: #ifdef VERBOSE
 199:         retmes = "Unbalanced parens";
 200: #endif
 201:         goto cerror;
 202:         }
 203:         *ep++ = CEND;       /* terminate expression */
 204:         *alt++ = 0;         /* terminal alternative list */
 205:         /*
 206: 	    compex->eblen = ep - compex->expbuf + 1;
 207: 	    compex->expbuf = saferealloc(compex->expbuf,compex->eblen+4); */
 208:         return Nullch;      /* return success */
 209:     }
 210:     if (c != '*')
 211:         lastep = ep;
 212:     if (!RE) {          /* just a normal search string? */
 213:         *ep++ = CCHR;       /* everything is a normal char */
 214:         *ep++ = c;
 215:     }
 216:     else                /* it is a regular expression */
 217:         switch (c) {
 218: 
 219:         case '\\':      /* meta something */
 220:             switch (c = *strp++) {
 221:             case '(':
 222:             if (compex->nbra >= NBRA) {
 223: #ifdef VERBOSE
 224:                 retmes = "Too many parens";
 225: #endif
 226:                 goto cerror;
 227:             }
 228:             *bracketp++ = ++compex->nbra;
 229:             *ep++ = CBRA;
 230:             *ep++ = compex->nbra;
 231:             break;
 232:             case '|':
 233:             if (bracketp>bracket) {
 234: #ifdef VERBOSE
 235:                 retmes = "No \\| in parens";    /* Alas! */
 236: #endif
 237:                 goto cerror;
 238:             }
 239:             *ep++ = CEND;
 240:             *alt++ = ep;
 241:             break;
 242:             case ')':
 243:             if (bracketp <= bracket) {
 244: #ifdef VERBOSE
 245:                 retmes = "Unmatched right paren";
 246: #endif
 247:                 goto cerror;
 248:             }
 249:             *ep++ = CKET;
 250:             *ep++ = *--bracketp;
 251:             break;
 252:             case 'w':
 253:             *ep++ = WORD;
 254:             break;
 255:             case 'W':
 256:             *ep++ = NWORD;
 257:             break;
 258:             case 'b':
 259:             *ep++ = WBOUND;
 260:             break;
 261:             case 'B':
 262:             *ep++ = NWBOUND;
 263:             break;
 264:             case '0': case '1': case '2': case '3': case '4':
 265:             case '5': case '6': case '7': case '8': case '9':
 266:             *ep++ = CBACK;
 267:             *ep++ = c - '0';
 268:             break;
 269:             default:
 270:             *ep++ = CCHR;
 271:             if (c == '\0')
 272:                 goto cerror;
 273:             *ep++ = c;
 274:             break;
 275:             }
 276:             break;
 277:         case '.':
 278:             *ep++ = CDOT;
 279:             continue;
 280: 
 281:         case '*':
 282:             if (lastep == 0 || *lastep == CBRA || *lastep == CKET
 283:             || *lastep == CIRC
 284:             || (*lastep&STAR)|| *lastep>NWORD)
 285:             goto defchar;
 286:             *lastep |= STAR;
 287:             continue;
 288: 
 289:         case '^':
 290:             if (ep != compex->expbuf && ep[-1] != CEND)
 291:             goto defchar;
 292:             *ep++ = CIRC;
 293:             continue;
 294: 
 295:         case '$':
 296:             if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
 297:             goto defchar;
 298:             *ep++ = CDOL;
 299:             continue;
 300: 
 301:         case '[': {     /* character class */
 302:             register int i;
 303: 
 304:             if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
 305:             grow_eb(compex);    /* reserve bitmap */
 306:             for (i = BMAPSIZ; i; --i)
 307:             ep[i] = 0;
 308: 
 309:             if ((c = *strp++) == '^') {
 310:             c = *strp++;
 311:             *ep++ = NCCL;   /* negated */
 312:             }
 313:             else
 314:             *ep++ = CCL;    /* normal */
 315: 
 316:             i = 0;      /* remember oldchar */
 317:             do {
 318:             if (c == '\0') {
 319: #ifdef VERBOSE
 320:                 retmes = "Missing ]";
 321: #endif
 322:                 goto cerror;
 323:             }
 324:             if (*strp == '-' && *(++strp))
 325:                 i = *strp++;
 326:             else
 327:                 i = c;
 328:             while (c <= i) {
 329:                 ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
 330:                 if (fold && isalpha(c))
 331:                 ep[(c ^ 32) / BITSPERBYTE] |=
 332:                     1 << ((c ^ 32) % BITSPERBYTE);
 333:                     /* set the other bit too */
 334:                 c++;
 335:             }
 336:             } while ((c = *strp++) != ']');
 337:             ep += BMAPSIZ;
 338:             continue;
 339:         }
 340: 
 341:         defchar:
 342:         default:
 343:             *ep++ = CCHR;
 344:             *ep++ = c;
 345:         }
 346:     }
 347: cerror:
 348:     compex->expbuf[0] = 0;
 349:     compex->nbra = 0;
 350:     return retmes;
 351: }
 352: 
 353: void
 354: grow_eb(compex)
 355: register COMPEX *compex;
 356: {
 357:     compex->eblen += 80;
 358:     compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
 359: }
 360: 
 361: char *
 362: execute (compex, addr)
 363: register COMPEX *compex;
 364: char *addr;
 365: {
 366:     register char *p1 = addr;
 367:     register char *trt = trans;
 368:     register int c;
 369: 
 370:     if (addr == Nullch)
 371:     return Nullch;
 372:     if (compex->nbra) {         /* any brackets? */
 373:     for (c = 0; c <= compex->nbra; c++)
 374:         compex->braslist[c] = compex->braelist[c] = Nullch;
 375:     if (compex->brastr)
 376:         free(compex->brastr);
 377:     compex->brastr = savestr(p1);   /* in case p1 is not static */
 378:     p1 = compex->brastr;        /* ! */
 379:     }
 380:     case_fold(compex->do_folding);  /* make sure table is correct */
 381:     FirstCharacter = p1;        /* for ^ tests */
 382:     if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
 383:     c = trt[compex->expbuf[1]]; /* fast check for first character */
 384:     do {
 385:         if (trt[*p1] == c && advance (compex, p1, compex->expbuf))
 386:         return p1;
 387:         p1++;
 388:     } while (*p1 && !err);
 389:     return Nullch;
 390:     }
 391:     else {          /* regular algorithm */
 392:     do {
 393:         register char **alt = compex->alternatives;
 394:         while (*alt) {
 395:         if (advance (compex, p1, *alt++))
 396:             return p1;
 397:         }
 398:         p1++;
 399:     } while (*p1 && !err);
 400:     return Nullch;
 401:     }
 402: }
 403: 
 404: /* advance the match of the regular expression starting at ep along the
 405:    string lp, simulates an NDFSA */
 406: bool
 407: advance (compex, lp, ep)
 408: register COMPEX *compex;
 409: register char *ep;
 410: register char *lp;
 411: {
 412:     register char *curlp;
 413:     register char *trt = trans;
 414:     register int i;
 415: 
 416:     while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET)
 417:     switch (*ep++) {
 418: 
 419:         case CCHR:
 420:         if (trt[*ep++] != trt[*lp]) return FALSE;
 421:         lp++;
 422:         continue;
 423: 
 424:         case CDOT:
 425:         if (*lp == '\n') return FALSE;
 426:         lp++;
 427:         continue;
 428: 
 429:         case CDOL:
 430:         if (!*lp || *lp == '\n')
 431:             continue;
 432:         return FALSE;
 433: 
 434:         case CIRC:
 435:         if (lp == FirstCharacter || lp[-1]=='\n')
 436:             continue;
 437:         return FALSE;
 438: 
 439:         case WORD:
 440:         if (isalnum(*lp)) {
 441:             lp++;
 442:             continue;
 443:         }
 444:         return FALSE;
 445: 
 446:         case NWORD:
 447:         if (!isalnum(*lp)) {
 448:             lp++;
 449:             continue;
 450:         }
 451:         return FALSE;
 452: 
 453:         case WBOUND:
 454:         if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
 455:             (!*lp || !isalnum(*lp)) )
 456:             continue;
 457:         return FALSE;
 458: 
 459:         case NWBOUND:
 460:         if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
 461:             (!*lp || !isalnum(*lp)))
 462:             continue;
 463:         return FALSE;
 464: 
 465:         case CEND:
 466:         return TRUE;
 467: 
 468:         case CCL:
 469:         if (cclass (ep, *lp, 1)) {
 470:             ep += BMAPSIZ;
 471:             lp++;
 472:             continue;
 473:         }
 474:         return FALSE;
 475: 
 476:         case NCCL:
 477:         if (cclass (ep, *lp, 0)) {
 478:             ep += BMAPSIZ;
 479:             lp++;
 480:             continue;
 481:         }
 482:         return FALSE;
 483: 
 484:         case CBRA:
 485:         compex->braslist[*ep++] = lp;
 486:         continue;
 487: 
 488:         case CKET:
 489:         i = *ep++;
 490:         compex->braelist[i] = lp;
 491:         compex->braelist[0] = lp;
 492:         compex->braslist[0] = compex->braslist[i];
 493:         continue;
 494: 
 495:         case CBACK:
 496:         if (compex->braelist[i = *ep++] == 0) {
 497:             fputs("bad braces\n",stdout) FLUSH;
 498:             err = TRUE;
 499:             return FALSE;
 500:         }
 501:         if (backref (compex, i, lp)) {
 502:             lp += compex->braelist[i] - compex->braslist[i];
 503:             continue;
 504:         }
 505:         return FALSE;
 506: 
 507:         case CBACK | STAR:
 508:         if (compex->braelist[i = *ep++] == 0) {
 509:             fputs("bad braces\n",stdout) FLUSH;
 510:             err = TRUE;
 511:             return FALSE;
 512:         }
 513:         curlp = lp;
 514:         while (backref (compex, i, lp)) {
 515:             lp += compex->braelist[i] - compex->braslist[i];
 516:         }
 517:         while (lp >= curlp) {
 518:             if (advance (compex, lp, ep))
 519:             return TRUE;
 520:             lp -= compex->braelist[i] - compex->braslist[i];
 521:         }
 522:         continue;
 523: 
 524:         case CDOT | STAR:
 525:         curlp = lp;
 526:         while (*lp++ && lp[-1] != '\n');
 527:         goto star;
 528: 
 529:         case WORD | STAR:
 530:         curlp = lp;
 531:         while (*lp++ && isalnum(lp[-1]));
 532:         goto star;
 533: 
 534:         case NWORD | STAR:
 535:         curlp = lp;
 536:         while (*lp++ && !isalnum(lp[-1]));
 537:         goto star;
 538: 
 539:         case CCHR | STAR:
 540:         curlp = lp;
 541:         while (*lp++ && trt[lp[-1]] == trt[*ep]);
 542:         ep++;
 543:         goto star;
 544: 
 545:         case CCL | STAR:
 546:         case NCCL | STAR:
 547:         curlp = lp;
 548:         while (*lp++ && cclass (ep, lp[-1], ep[-1] == (CCL | STAR)));
 549:         ep += BMAPSIZ;
 550:         goto star;
 551: 
 552:     star:
 553:         do {
 554:             lp--;
 555:             if (advance (compex, lp, ep))
 556:             return TRUE;
 557:         } while (lp > curlp);
 558:         return FALSE;
 559: 
 560:         default:
 561:         fputs("Badly compiled pattern\n",stdout) FLUSH;
 562:         err = TRUE;
 563:         return -1;
 564:     }
 565:     if (*ep == CEND || *ep == CDOL) {
 566:         return TRUE;
 567:     }
 568:     return FALSE;
 569: }
 570: 
 571: bool
 572: backref (compex, i, lp)
 573: register COMPEX *compex;
 574: register int i;
 575: register char *lp;
 576: {
 577:     register char *bp;
 578: 
 579:     bp = compex->braslist[i];
 580:     while (*lp && *bp == *lp) {
 581:     bp++;
 582:     lp++;
 583:     if (bp >= compex->braelist[i])
 584:         return TRUE;
 585:     }
 586:     return FALSE;
 587: }
 588: 
 589: bool
 590: cclass (set, c, af)
 591: register char  *set;
 592: register int c;
 593: {
 594:     c &= 0177;
 595: #if BITSPERBYTE == 8
 596:     if (set[c >> 3] & 1 << (c & 7))
 597: #else
 598:     if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
 599: #endif
 600:     return af;
 601:     return !af;
 602: }

Defined functions

advance defined in line 406; used 5 times
backref defined in line 571; used 3 times
case_fold defined in line 141; used 3 times
cclass defined in line 589; used 4 times
free_compex defined in line 110; used 5 times
getbracket defined in line 127; used 2 times
grow_eb defined in line 353; used 3 times
search_init defined in line 87; used 2 times

Defined variables

FirstCharacter defined in line 85; used 4 times
err defined in line 84; used 5 times
gbr_siz defined in line 125; used 1 times
gbr_str defined in line 124; used 3 times
trans defined in line 64; used 5 times

Defined typedef's

TRANSTABLE defined in line 62; used 1 times
  • in line 64

Defined macros

ASCSIZ defined in line 61; used 2 times
BITSPERBYTE defined in line 34; used 9 times
BMAPSIZ defined in line 37; used 6 times
CBACK defined in line 48; used 2 times
CBRA defined in line 40; used 2 times
CCHR defined in line 41; used 5 times
CCL defined in line 43; used 3 times
CDOL defined in line 45; used 2 times
CDOT defined in line 42; used 2 times
CEND defined in line 46; used 4 times
CIRC defined in line 50; used 3 times
CKET defined in line 47; used 3 times
NCCL defined in line 44; used 2 times
NWBOUND defined in line 55; used 1 times
NWORD defined in line 53; used 3 times
STAR defined in line 57; used 4 times
WBOUND defined in line 54; used 1 times
WORD defined in line 52; used 2 times
Last modified: 1987-03-16
Generated: 2016-12-26
Generated by src2html V0.67
page hit count: 3779
Valid CSS Valid XHTML 1.0 Strict