#include "stdio.h" #include "ctype.h" #include "assert.h" #include "err.h" #include "re.h" #ifdef _tolower /* thank you, system V */ #undef tolower #define tolower _tolower #endif #ifndef D2 #define D2 0 #endif void *zalloc(unsigned int, unsigned int); /* * fgrep -- print all lines containing any of a set of keywords * * status returns: * 0 - ok, and some matches * 1 - ok, but no matches * 2 - some error */ #define MAXSIZ 1000 #define QSIZE 400 typedef struct Trans Trans; typedef struct State State; struct State { State *fail; Trans *trans; int out; }; struct Trans { State *nst; Trans *link; int inp; }; typedef union { State s; Trans t; } Heapobj; struct re_ac { /*unsigned char *map;*/ Heapobj *smax, *lim; Heapobj space[MAXSIZ]; }; static int nfound; static int isnew(State *); int re_run(re_ac *fg, char *instr, int ccount, int need) { unsigned char *p; State *c, *s0; int ch; nfound = 0; p = (unsigned char *)instr; c = s0 = &fg->space[0].s; /* start state */ #if D2 fprintf(stderr, "in execute ccount %d\n", ccount); #endif while (--ccount >= 0) { ch = *p; if (isupper(ch)) ch = tolower(ch); for (;; c = c->fail) { Trans *t; #if D2 fprintf(stderr, "g(0x%x,'%c')", c, ch); #endif for (t = c->trans; t != 0; t = t->link) if (t->inp == ch) { c = t->nst; goto Found; } /* failed */ if (c == s0) break; #if D2 fprintf(stderr, " =\n"); #endif } Found: #if D2 fprintf(stderr, " = 0x%x [%d]\n", c, c->out); #endif if (c->out && isnew(c)) { nfound++; #if D2 fprintf(stderr, " found: nfound %d need %d\n", nfound, need); #endif if (nfound >= need) break; ccount++; continue; /* note: restarts machine on last char of last match */ } p++; } return nfound; } #define NFINAL 100 static State *seen[NFINAL]; static int isnew(State *x) { int i; for (i = 0; i < nfound; i++) if (seen[i] == x) return 0; if (i >= NFINAL) err("too many final states (fgrep)"); seen[i] = x; return 1; } /* * Aho/Corasick keyword matching algorithm: * CACM Vol. 18, No. 6, June 1975, pp 333-340 */ /* * initialise a structure describing an empty machine */ re_ac *re_acinit(void) { static re_ac *sfg; /* static in refer's application, to save time */ State *s0; if (sfg == 0) { sfg = (re_ac *) zalloc(1, sizeof(*sfg)); sfg->lim = &sfg->space[MAXSIZ]; } sfg->smax = sfg->space; s0 = &sfg->smax->s; s0->out = 0; s0->trans = 0; s0->fail = 0; return sfg; } static State *fgoto(State *s, int c) { Trans *t; for (t = s->trans; t != 0; t = t->link) if (t->inp == c) return t->nst; /* non-zero by construction */ return 0; } /* * add the new keyword represented by the string ab..ae to machine fg * * this is algorithm 2 (construction of the goto function); p 336 * returns 0 if construction failed (no space) */ int re_acadd(re_ac *fg, char *ab, char *ae) { unsigned char *b = (unsigned char *)ab; unsigned char *e = (unsigned char *)ae; State *s, *ns; Trans *t; s = &fg->space[0].s; /* start state */ for (; b != e && (ns = fgoto(s, *b)) != 0; b++, s = ns) /* while g(state,*b) != fail */ ; for (; b != e; b++) { /* add new trans to new state for each char */ t = &(++fg->smax)->t; ns = &(++fg->smax)->s; if (fg->smax >= fg->lim) return 0; t->nst = ns; t->inp = *b; t->link = s->trans; s->trans = t; ns->out = 0; ns->fail = 0; ns->trans = 0; s = ns; } s->out = 1; return 1; } #define enq(w) { \ *qw++ = (w); \ if (qw >= &queue[QSIZE]) qw = queue; \ if (qw == qr) return 0; \ } /* * complete the machine * * this is algorithm 3 (construction of failure function), page 336 * returns 0 if construction failed (no space) */ int re_accomp(re_ac *fg) { State *queue[QSIZE]; State **qr, **qw; State *state, *r, *ns, *s, *s0; Trans *t; int c; qr = qw = queue; s0 = &fg->space[0].s; /* start state; state 0 */ for (t = s0->trans; t != 0; t = t->link) { /* queue states of depth 1 */ enq(t->nst); t->nst->fail = s0; } /* calculate states at each depth d from those at d-1: */ while (qr != qw) { r = *qr++; if (qr >= &queue[QSIZE]) qr = queue; for (t = r->trans; t != 0; t = t->link) { /* each char in state r */ s = t->nst; enq(s); state = r->fail; c = t->inp; while ((ns = fgoto(state,c)) == 0 && state != s0) state = state->fail; if ((s->fail = ns) == 0) s->fail = s0; s->out |= s->fail->out; } } return 1; }