1: #ifndef lint 2: static char sccsid[] = "@(#)invert.c 2.5 9/10/85"; 3: #endif not lint 4: # 5: /* input: records of lines, separated by blank lines 6: output: key:file1 start/length ... start/length:file2 start/length ... 7: */ 8: 9: # include "stdio.h" 10: # include "streams.h" 11: # include "bib.h" 12: # define isnull(x) (*(x) == NULL) 13: # define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c) 14: 15: int max_kcnt = 100; /* max number of keys */ 16: int max_klen = 6; /* max length of keys */ 17: char *ignore = /* string of line starts to ignore */ 18: "CNOPVX"; 19: char *common = /* name of file of common words */ 20: COMFILE; 21: char *INDEX= /* name of output file */ 22: INDXFILE; 23: 24: char *tmpfile = /* name of temporary file */ 25: INVTEMPFILE; 26: 27: int silent = 0; /* 0 => statistics printed */ 28: /* 1 => no statisitics printed */ 29: 30: char *sort_it = 31: "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s"; 32: char sortcmd[maxstr]; 33: 34: int argc; 35: char **argv; 36: 37: main(argcount,arglist) 38: int argcount; 39: char **arglist; 40: { char *filename; 41: FILE *input, *output; 42: long int start,length; 43: char word[maxstr]; 44: int kcnt; 45: char tag_line[maxstr]; 46: 47: long int records = 0; /* number of records read */ 48: long int keys = 0; /* number of keys read (occurences) */ 49: long int distinct; /* number of distinct keys */ 50: long int shorten(); 51: 52: strcpy(COMFILE, N_COMFILE); 53: strcpy(BMACLIB, N_BMACLIB); 54: 55: argc= argcount-1; 56: argv= arglist+1; 57: mktemp(tmpfile); 58: output= fopen(tmpfile,"w"); 59: 60: for ( flags() ; argc>0 ; argc--, argv++ ,flags() ) 61: { /* open input file */ 62: filename= *argv; 63: input= fopen(filename,"r"); 64: if (input==NULL) 65: { fprintf(stderr, "invert: error in open of %s\n", filename); 66: continue; 67: } 68: start= 0L; 69: length= 0L; 70: 71: for(;;) /* each record */ 72: { /* find start of next record (exit if none) */ 73: start= nextrecord(input,start+length); 74: if (start==EOF) break; 75: records++; 76: kcnt= 0; 77: length= recsize(input,start); 78: sprintf(tag_line, " %s %d %d\n", filename, start, length); 79: 80: while (ftell(input) < start+length && kcnt < max_kcnt) 81: { getword(input,word,ignore); 82: makekey(word,max_klen,common); 83: if (!isnull(word)) 84: { fputs(word,output); fputs(tag_line,output); 85: kcnt++; keys++; 86: } 87: } 88: } 89: fclose(input); 90: } 91: fclose(output); 92: 93: sprintf(sortcmd, sort_it, tmpfile, tmpfile); 94: system(sortcmd); 95: 96: distinct = shorten(tmpfile,INDEX); 97: if( silent == 0 ) 98: fprintf(stderr, 99: "%d documents %d distinct keys %d key occurrences\n", 100: records, distinct, keys); 101: exit(0); 102: } 103: 104: 105: 106: /* Flag Meaning Default 107: -ki Keys per record 100 108: -li max Length of keys 6 109: -%str ignore lines that begin with %x CNOPVX 110: where x is in str 111: str is a seq of chars 112: -cfile file contains Common words /usr/new/lib/bib/common 113: do not use common words as keys 114: -pfile name of output file INDEX 115: -s do not print statistics statistics printed 116: */ 117: 118: # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) 119: 120: flags() 121: { 122: char *p; 123: for (; argc>0 && *argv[0]=='-'; argc--,argv++) 124: { switch ((*argv)[1]) 125: { case 'k': max_kcnt= atoi(operand); 126: break; 127: case 'l': max_klen= atoi(operand); 128: break; 129: case 'c': common= operand; 130: break; 131: case '%': ignore= *argv+2; 132: break; 133: case 'p': INDEX= operand; 134: break; 135: case 's': silent= 1; 136: break; 137: case 'd': 138: p = &argv[0][2]; 139: if (!p) { 140: argv++; 141: p = &argv[0][0]; 142: } 143: strreplace(COMFILE, BMACLIB, p); 144: strcpy(BMACLIB, p); 145: break; 146: default: fprintf(stderr, "unknown flag '%s'\n", *argv); 147: } 148: } 149: } 150: 151: 152: /* shorten(inf,outf): file "inf" consists of lines of the form: 153: key file start length 154: sorted by key and file. replace lines with the same key 155: with one line of the form: 156: key:file1 start/length ... start/length:file2 start/length ... 157: rename as file "outf" 158: returns number of lines in output 159: */ 160: long shorten(inf,outf) 161: char *inf, *outf; 162: { FILE *in, *out; 163: char line[maxstr]; 164: char key[maxstr], newkey[maxstr], 165: file[maxstr], newfile[maxstr]; 166: long int start, length; 167: long int lines = 0; 168: 169: in= fopen(inf, "r"); 170: out= fopen(outf, "w"); 171: if (in==NULL || out==NULL) 172: { fprintf(stderr, "invert: error in opening file for compression\n"); 173: return(0); 174: } 175: 176: getline(in,line); 177: sscanf(line,"%s%s%d%d", key, file, &start, &length); 178: fprintf(out, "%s :%s %d/%d", key, file, start, length); 179: for ( getline(in, line) ; !feof(in); getline(in, line)) 180: { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length); 181: if (strcmp(key,newkey)!=0) 182: { strcpy(key, newkey); 183: strcpy(file, newfile); 184: fprintf(out, "\n%s :%s %d/%d", key, file, start, length); 185: lines++; 186: } 187: else if (strcmp(file,newfile)!=0) 188: { strcpy(file,newfile); 189: fprintf(out, ":%s %d/%d", file, start, length); 190: } 191: else 192: fprintf(out, " %d/%d", start, length); 193: } 194: fprintf(out, "\n"); 195: lines++; 196: 197: fclose(in); fclose(out); 198: unlink(inf); 199: return (lines); 200: }