#include #include #include #include #include "hash.h" enum { MAXTAB = 256, MAXBEST = 32, }; typedef struct Table Table; struct Table { char *file; Hash *hash; int nmsg; }; typedef struct Word Word; struct Word { Stringtab *s; /* from hmsg */ int count[MAXTAB]; /* counts from each table */ double p[MAXTAB]; /* probabilities from each table */ double mp; /* max probability */ int mi; /* w.p[w.mi] = w.mp */ }; Table tab[MAXTAB]; int ntab; Word best[MAXBEST]; int mbest; int nbest; int debug; void usage(void) { fprint(2, "usage: bayes [-D] [-m maxword] boxhash ... ~ msghash ...\n"); exits("usage"); } void* emalloc(int n) { void *v; v = mallocz(n, 1); if(v == nil) sysfatal("out of memory"); return v; } void noteword(Word *w) { int i; for(i=nbest-1; i>=0; i--) if(w->mp < best[i].mp) break; i++; if(i >= mbest) return; if(nbest == mbest) nbest--; if(i < nbest) memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0])); best[i] = *w; nbest++; } Hash* hread(char *s) { Hash *h; Biobuf *b; if((b = Bopenlock(s, OREAD)) == nil) sysfatal("open %s: %r", s); h = emalloc(sizeof(Hash)); Breadhash(b, h, 1); Bterm(b); return h; } void main(int argc, char **argv) { int i, j, a, mi, oi, tot, keywords; double totp, p, xp[MAXTAB]; Hash *hmsg; Word w; Stringtab *s, *t; Biobuf bout; mbest = 15; keywords = 0; ARGBEGIN{ case 'D': debug = 1; break; case 'k': keywords = 1; break; case 'm': mbest = atoi(EARGF(usage())); if(mbest > MAXBEST) sysfatal("cannot keep more than %d words", MAXBEST); break; default: usage(); }ARGEND for(i=0; i MAXTAB) sysfatal("cannot handle more than %d tables", MAXTAB); if(i+1 >= argc) usage(); for(i=0; icount == 0) tab[ntab].nmsg = 1; else tab[ntab].nmsg = s->count; ntab++; } Binit(&bout, 1, OWRITE); oi = ++i; for(a=i; aall; s; s=s->link){ w.s = s; tot = 0; totp = 0.0; for(i=0; istr, s->n, 0); if(t == nil) w.count[i] = 0; else w.count[i] = t->count; tot += w.count[i]; p = w.count[i]/(double)tab[i].nmsg; if(p >= 1.0) p = 1.0; w.p[i] = p; totp += p; } if(tot < 5){ /* word does not appear enough; give to box 0 */ w.p[0] = 0.5; for(i=1; i 0.99) p = 0.99; if(p > w.mp){ w.mp = p; w.mi = i; } w.p[i] = p; } noteword(&w); } totp = 0.0; for(i=0; i xp[mi]) mi = i; if(oi != argc-1) Bprint(&bout, "%s: ", argv[a]); Bprint(&bout, "%s %f", tab[mi].file, xp[mi]); if(keywords){ for(i=0; istr, best[i].s->n); Bprint(&bout, " %f", best[i].p[mi]); } } freehash(hmsg); Bprint(&bout, "\n"); if(debug){ for(i=0; istr, best[i].s->n); Bprint(&bout, " %f", best[i].p[mi]); if(best[i].p[mi] < best[i].mp) Bprint(&bout, " (%f %s)", best[i].mp, tab[best[i].mi].file); Bprint(&bout, "\n"); } } } Bterm(&bout); }