/*
 * The lexical analyzer.
 */

#include "itran.h"
#include "token.h"
#include "lex.h"
#include "char.h"
#include "tree.h"

int tline;      /* line of the token most recently scanned */
int tcol;       /* column of the token most recently scanned */

/*
 * yylex - find the next token in the input stream, and return its token
 *  type and value to the parser.  The token value is left in yylval.
 *
 * Variables of interest:
 *
 *  cc - character following last token.
 *  comflag - set if in a comment.
 *  nlflag - set if a newline was between the last token and the current
 *   token.
 *  lastend - set if the last token was an ENDER.
 *  lasttok - when a semicolon is inserted and returned, lasttok gets the
 *   token that was scanned but held back; it is returned on the next call.
 *  lastval - when a semicolon is inserted and returned, lastval gets the
 *   token value that would have been returned if the semicolon hadn't
 *   been inserted.
 */
yylex()
{
    register struct toktab *t;
    register int c;
    int nlflag;
    int comflag;
    static struct toktab *lasttok = NULL;
    static nodeptr lastval;
    static int lastend = 0;
    static int eofflag = 0;
    static int lastline = 0;
    static int cc = '\n';
    extern struct toktab *getident(), *getnum(), *getstring(), *getop();

    if (lasttok != NULL) {
        /*
         * A semicolon was inserted and returned on the last call to yylex.
         * Instead of going to the input, return lasttok and set the
         * appropriate variables.
         */
        yylval = lastval;
        tline = LINE(lastval);
        tcol = COL(lastval);
        t = lasttok;
        goto ret;
    }
    nlflag = 0;
    comflag = 0;

loop:
    /*
     * Re-entered when a gathering routine returns NULL (no token produced).
     * nlflag and comflag are deliberately not reset here.
     */
    c = cc;

    /*
     * Skip whitespace and comments.  A comment runs from COMMENT to the
     * next newline.
     */
    while (c != EOF && (comflag || c == COMMENT || isspace(c))) {
        if (c == '\n') {
            nlflag++;
            comflag = 0;
        }
        else if (c == COMMENT)
            comflag++;
        c = NEXTCHAR;
    }

    /*
     * A token is the next thing in the input.  Record the last line number
     * and set tline and tcol to the current line and column.
     */
    lastline = tline;
    tline = inline;
    tcol = incol;

    if (c == EOF) {
        /*
         * End of file has been reached.  The first time, set eofflag, set
         * cc to EOF so the next scan also lands here, and return T_EOF.
         * The second time, reset eofflag and cc to their initial values
         * and return 0.
         */
        if (eofflag++) {
            eofflag = 0;
            cc = '\n';
            return (int) (yylval = 0);
        }
        cc = EOF;
        t = T_EOF;
        yylval = 0;
        goto ret;
    }

    /*
     * Look at current input character to determine what class of token
     * is next and take the appropriate action.  Note that the various
     * token gathering routines write a value into cc.
     */
    c = ctran[c];
    if (isalpha(c)) {
        /* gather ident or reserved word */
        if ((t = getident(c, &cc)) == NULL)
            goto loop;
    }
    else if (isdigit(c)) {
        /* gather numeric literal */
        if ((t = getnum(c, &cc)) == NULL)
            goto loop;
    }
    else if (c == '"' || c == '\'') {
        /* gather string or cset literal */
        if ((t = getstring(c, &cc)) == NULL)
            goto loop;
    }
    else {
        /* gather longest legal operator */
        if ((t = getop(c, &cc)) == NULL)
            goto loop;
        yylval = OPNODE(t->t_type);
    }

    if (nlflag && lastend && (t->t_flags & BEGINNER)) {
        /*
         * A newline was encountered between the current token and the last,
         * the last token was an ENDER, and the current token is a BEGINNER.
         * Return a semicolon and save the current token in lasttok and its
         * value in lastval.
         */
        lastval = yylval;
        lasttok = t;
        tline = lastline;
        tcol = 0;
        yylval = OPNODE(SEMICOL);
        return (SEMICOL);
    }

ret:
    /*
     * Clear lasttok, set lastend if the token being returned is an
     * ENDER, and return the token.
     */
    lasttok = NULL;
    lastend = t->t_flags & ENDER;
    return (t->t_type);
}

/*
 * getident - gather an identifier beginning with ac.  The character
 *  following the identifier goes in *cc.  The identifier text is copied,
 *  NUL-terminated, into the string space starting at sfree.  Returns the
 *  token table entry for a reserved word, or T_IDENT otherwise, and sets
 *  yylval to the corresponding node.
 */
struct toktab *getident(ac, cc)
char ac;
int *cc;
{
    register c;
    register char *p;
    register struct toktab *t;
    extern char *putident();
    extern struct toktab *findres();

    c = ac;
    p = sfree;

    /*
     * Copy characters into string space until a non-alphanumeric character
     * is found.
     * NOTE(review): NEXTCHAR may yield EOF, which is used directly as an
     * index into ctran here -- confirm ctran tolerates that.
     */
    do {
        if (p >= send)
            syserr("out of string space");
        *p++ = c;
        c = ctran[NEXTCHAR];
    } while (isalnum(c));
    if (p >= send)
        syserr("out of string space");
    *p++ = 0;
    *cc = c;

    /*
     * If the identifier is a reserved word, make a RESNODE for it and return
     * the token value.  Otherwise, install it with putident, make an
     * IDNODE for it, and return.  The length passed to putident counts the
     * terminating NUL.
     */
    if ((t = findres()) != NULL) {
        yylval = RESNODE(t->t_type);
        return (t);
    }
    else {
        yylval = IDNODE((int)putident(p-sfree));
        return (T_IDENT);
    }
}

/*
 * findres - if the string just copied into the string space by getident
 *  is a reserved word, return a pointer to its entry in the token table.
 *  Return NULL if the string isn't a reserved word.
 */
struct toktab *findres()
{
    register struct toktab *t;
    register char c, *p;

    p = sfree;
    c = *p;
    /* Only strings starting with a lower-case letter are looked up. */
    if (!islower(c))
        return (NULL);

    /*
     * Point t at first reserved word that starts with c (if any).
     * restab is indexed by the first character's offset from '_'.
     */
    if ((t = restab[c - '_']) == NULL)
        return (NULL);

    /*
     * Search through reserved words, stopping when a match is found
     * or when the current reserved word doesn't start with c.
     */
    while (t->t_word[0] == c) {
        if (strcmp(t->t_word, p) == 0)
            return (t);
        t++;
    }
    return (NULL);
}

/*
 * getnum - gather a numeric literal starting with ac and put the
 *  character following the literal into *cc.  The literal text is copied,
 *  NUL-terminated, into the string space starting at sfree.  Returns T_REAL
 *  if a '.' or exponent was seen, T_INT otherwise, and sets yylval.
 *
 * The scan is a small state machine:
 *  0 - integer part (r accumulates its decimal value, used as the radix)
 *  1 - fractional part
 *  2 - optional exponent sign
 *  3 - first digit after e, e+, or e-
 *  4 - remaining exponent digits
 *  5 - first digit after r (radix literal)
 *  6 - remaining digits after r
 */
struct toktab *getnum(ac, cc)
char ac;
int *cc;
{
    register c;
    register r;
    register state;
    char *p;
    int realflag;
    extern char *putident();

    c = ac;
    r = tonum(c);
    p = sfree;
    state = 0;
    realflag = 0;
    for (;;) {
        /* Store the accepted character, then classify the next one. */
        if (p >= send)
            syserr("out of string space");
        *p++ = c;
        c = ctran[NEXTCHAR];
        switch (state) {
            case 0:     /* integer part */
                if (isdigit(c)) {
                    r = r * 10 + tonum(c);
                    continue;
                }
                if (c == '.') {
                    state = 1;
                    realflag++;
                    continue;
                }
                if (tolower(c) == 'e') {
                    state = 2;
                    realflag++;
                    continue;
                }
                if (tolower(c) == 'r') {
                    state = 5;
                    if (r < 2 || r > 36)
                        err("invalid radix for integer literal", 0);
                    continue;
                }
                break;
            case 1:     /* fractional part */
                if (isdigit(c))
                    continue;
                if (tolower(c) == 'e') {
                    state = 2;
                    continue;
                }
                break;
            case 2:     /* optional exponent sign */
                if (c == '+' || c == '-') {
                    state = 3;
                    continue;
                }
                /* FALLTHROUGH: no sign, so c must be the first digit */
            case 3:     /* first digit after e, e+, or e- */
                if (isdigit(c)) {
                    state = 4;
                    continue;
                }
                err("invalid real literal", 0);
                break;
            case 4:     /* remaining digits after e */
                if (isdigit(c))
                    continue;
                break;
            case 5:     /* first digit after r */
                if ((isdigit(c) || isletter(c)) && tonum(c) < r) {
                    state = 6;
                    continue;
                }
                err("invalid integer literal", 0);
                break;
            case 6:     /* remaining digits after r */
                if (isdigit(c) || isletter(c)) {
                    if (tonum(c) >= r) {    /* illegal digit for radix r */
                        err("invalid digit in integer literal", 0);
                        r = tonum('z');     /* prevent more messages */
                    }
                    continue;
                }
                break;
        }
        /* Any state that did not `continue' has reached the literal's end. */
        break;
    }
    if (p >= send)
        syserr("out of string space");
    *p++ = 0;
    *cc = c;
    if (realflag) {
        yylval = REALNODE((int)putident(p-sfree));
        return (T_REAL);
    }
    yylval = INTNODE((int)putident(p-sfree));
    return (T_INT);
}

/*
 * getstring - gather a string literal starting with ac and place the
 *  character following the literal in *cc.  ac is the opening quote; a
 *  double quote yields T_STRING, anything else T_CSET.  Escape sequences
 *  are translated as the characters are copied into the string space at
 *  sfree.
 *
 * lc remembers where an underscore was stored as long as only whitespace
 *  has followed it.  If the line then ends, the literal continues on the
 *  next line: the text is cut back to lc and the next line's leading
 *  whitespace is skipped.
 */
struct toktab *getstring(ac, cc)
char ac;
int *cc;
{
    register c, sc;
    register char *p;
    char *lc;
    extern char *putident();

    sc = c = ac;
    p = sfree;
    lc = 0;
    while ((c = NEXTCHAR) != sc && c != '\n' && c != EOF) {
contin:
        /* Entered by goto when a literal is continued onto a new line. */
        if (c == '_')
            lc = p;
        else if (!isspace(c))
            lc = 0;
        if (ctran[c] == ESCAPE) {
            c = NEXTCHAR;
            /*
             * End of file right after the escape character: stop before
             * EOF is used as an index into ctran[] or esctab[].
             */
            if (c == EOF)
                goto noquote;
            if (isoctal(c))
                c = octesc(c);
            else if (ctran[c] == 'x')
                c = hexesc();
            else if (ctran[c] == '^')
                c = ctlesc();
            else
                c = esctab[c];
            /* The escape helpers return EOF if the input ran out. */
            if (c == EOF)
                goto noquote;
        }
        if (p >= send)
            syserr("out of string space");
        *p++ = c;
    }
    if (p >= send)
        syserr("out of string space");
    *p++ = 0;
    if (c == sc)
        *cc = ' ';      /* closing quote consumed; nothing is pending */
    else {
        if (c == '\n' && lc) {
            /* Continuation: drop the underscore and what followed it. */
            p = lc;
            while ((c = NEXTCHAR) != EOF && isspace(c))
                ;
            if (c != EOF)
                goto contin;
        }
noquote:
        /*
         * NOTE(review): when reached by goto from inside the loop, or
         * after p was cut back to lc, no terminating NUL has been stored
         * at p -- confirm whether putident depends on one.
         */
        err("unclosed quote", 0);
        *cc = c;
    }
    if (ac == '"') {    /* a string literal */
        yylval = STRNODE((int)putident(p-sfree), p-sfree);
        return (T_STRING);
    }
    else {              /* a cset literal */
        yylval = CSETNODE((int)putident(p-sfree), p-sfree);
        return (T_CSET);
    }
}

/*
 * ctlesc - translate a control escape -- backslash followed by
 *  caret and one character.  Returns the character's low five bits, or
 *  EOF if the input ended.
 */
ctlesc()
{
    register c;

    c = NEXTCHAR;
    if (c == EOF)
        return (EOF);
    return (c & 037);
}

/*
 * octesc - translate an octal escape -- backslash followed by
 *  one, two, or three octal digits.  ac is the first digit.  The first
 *  character that is not part of the escape is pushed back.  Returns the
 *  value masked to eight bits, or EOF if the input ended.
 */
octesc(ac)
char ac;
{
    register c, nc, i;

    c = 0;
    nc = ac;
    i = 1;
    do {
        c = (c << 3) | (nc - '0');
        nc = NEXTCHAR;
        if (nc == EOF)
            return (EOF);
    } while (isoctal(nc) && i++ < 3);
    PUSHCHAR(nc);
    return (c & 0377);
}

/*
 * hexesc - translate a hexadecimal escape -- backslash-x
 *  followed by one or two hexadecimal digits.  A character that is not a
 *  hexadecimal digit ends the escape and is pushed back.  Returns the
 *  accumulated value (0 if no digit was seen), or EOF if the input ended.
 */
hexesc()
{
    register c, nc, i;

    c = 0;
    i = 0;
    while (i++ < 2) {
        nc = NEXTCHAR;
        if (nc == EOF)
            return (EOF);
        if (nc >= 'a' && nc <= 'f')
            nc -= 'a' - 10;
        else if (nc >= 'A' && nc <= 'F')
            nc -= 'A' - 10;
        else if (isdigit(nc))
            nc -= '0';
        else {
            PUSHCHAR(nc);
            break;
        }
        c = (c << 4) | nc;
    }
    return (c);
}

/*
 * getop - find the longest legal operator and return a pointer
 *  to its entry in the token table.  The tour describes the
 *  operator recognition process in detail.
 *
 * The recognizer walks the optab state table starting at state0.  In each
 *  state the entries are scanned until one matches the current character
 *  or an entry with a zero o_input is reached; that entry's action is then
 *  taken.  Returns NULL, after reporting an error, for an invalid
 *  character.
 */
struct toktab *getop(ac, cc)
char ac;
int *cc;
{
    register struct optab *state;
    register char c, i;

    state = state0;
    c = ac;
    for (;;) {
        while ((i = state->o_input) && c != i)
            state++;
        switch (state->o_action) {
            case A_GOTO:        /* move to another state, read more input */
                state = (struct optab *) state->o_val;
                c = ctran[NEXTCHAR];
                continue;
            case A_ERROR:
                err("invalid character", 0);
                *cc = ' ';
                return (NULL);
            case A_RETURN:      /* c is not part of the operator */
                *cc = c;
                return (struct toktab *) (state->o_val);
            case A_IMMRET:      /* c was the operator's last character */
                *cc = ' ';
                return (struct toktab *) (state->o_val);
        }
    }
}

/*
 * nextchar - return the next character in the input, or EOF at end of
 *  file.  A character left in peekc is returned first.  The position
 *  counters inline and incol are updated for characters read from infile.
 */
nextchar()
{
    /*
     * c must be an int: getc returns an int so that EOF can be told apart
     * from every valid character value.
     */
    register int c;

    if ((c = peekc) != 0) {
        peekc = 0;
        return (c);
    }
    c = getc(infile);
    switch (c) {
        case EOF:
            inline = 0;
            incol = 0;
            break;
        case '\n':
            inline++;
            incol = 0;
            break;
        case '\t':
            /* advance to the next multiple of eight */
            incol = (incol | 7) + 1;
            break;
        case '\b':
            if (incol)
                incol--;
            break;
        default:
            incol++;
    }
    return (c);
}