/* * suggest.c - generate suggestions for spelling errors. * * term% spell doc.ms | suggest * * The triagram permute validation seems a good idea, however in * testing it rejected only 20% of the initial suggestions. This * is comperable in cpu terms to the overhead in generating the * probability tables in the first place. * %A V. I. Levenshtein %T Binary codes capable of correcting deletions, insertions, and reversals %J Soviet Physics Doklady 10 %D 1966 %V 707710 %T Hanging on the Metaphone %A Lawrence Philips %J Computer Language %D December 1990 %P 38 */ #include #include #include #include #include /* * These constants are rather arbitary but * to work for my (rather small) corpus. */ enum{ Idx = 3, // apply Dist below after this many suggestions Dist = 2, // stop offering suggestions if levenshtein is above this Maxidx = 8, // give up after this many suggestions // triagramme table Trilen = 28, // side length Begword = 0, // code for begining of word Endword = 26, // code for end of word Nalpha = 27, // code for non alpha character Ncode = 4 // length of metaphone hash used }; char *Wordfile = "/lib/words"; Biobuf *Words = nil; uvlong Unknown = 0; // stats uvlong Rejected = 0; uvlong Generated = 0; uvlong Ntri = 0; // number of triagrammes uint ***Tri = nil; // triagramme counts typedef struct Table Table; typedef struct Result Result; struct Table { char code[Ncode]; vlong off; }; struct Result { Result *next; char code[Ncode]; char *word; int dist; }; Table *Tab = nil; int Ntab = 0; void metaphone(char *name, char *buf, int len) { int ii, jj, silent, hard, Lng, lastChr; char curLtr, prevLtr, nextLtr, nextLtr2, nextLtr3; int vowelAfter, vowelBefore, frontvAfter; char *p, *p1; char wname[60]; char *ename = wname; static char *VOWELS = "AEIOU"; static char *FRONTV = "EIY"; /* special cases for letters in FRONT of these */ static char *VARSON = "CSPTG"; /* variable sound--those modified by adding an "h" */ static char *DOUBLE = "."; /* let these double letters through */ static char *excpPAIR = "AGKPW"; /* exceptions "ae-", "gn-", "kn-", "pn-", "wr-" */ static char *nextLTR = "ENNNR"; memset(buf, 0, len); jj = 0; for(ii = 0; name[ii] != '\0'; ii++) { if(isalpha(name[ii])) { ename[jj] = toupper(name[ii]); jj++; } } ename[jj] = '\0'; if(strlen(ename) == 0) return; /* if ae, gn, kn, pn, wr then drop the first letter */ if((p = strchr(excpPAIR, ename[0])) != nil) { p1 = nextLTR + (p - excpPAIR); if(*p1 == ename[1]) strcpy(ename, &ename[1]); } /* change x to s */ if(ename[0] == 'X') ename[0] = 'S'; /* get rid of the "h" in "wh" */ if(strncmp(ename, "WH", 2) == 0) strcpy(&ename[1], &ename[2]); Lng = strlen(ename); lastChr = Lng - 1; /* index to last character in string makes code easier */ /* Remove an S from the end of the string */ if(ename[lastChr] == 'S') { ename[lastChr] = '\0'; Lng = strlen(ename); lastChr = Lng - 1; } for(ii = 0; ((strlen(buf) < len) && (ii < Lng)); ii++) { curLtr = ename[ii]; vowelBefore = 0; prevLtr = ' '; if(ii > 0) { prevLtr = ename[ii - 1]; if(strchr(VOWELS, prevLtr) != nil) vowelBefore = 1; } /* if first letter is a vowel KEEP it */ if(ii == 0 && (strchr(VOWELS, curLtr) != nil)) { strncat(buf, &curLtr, 1); continue; } vowelAfter = 0; frontvAfter = 0; nextLtr = ' '; if(ii < lastChr) { nextLtr = ename[ii + 1]; if(strchr(VOWELS, nextLtr) != nil) vowelAfter = 1; if(strchr(FRONTV, nextLtr) != nil) frontvAfter = 1; } /* skip double letters except ones in list */ if(curLtr == nextLtr && (strchr(DOUBLE, nextLtr) == nil)) continue; nextLtr2 = ' '; if(ii < (lastChr - 1)) nextLtr2 = ename[ii + 2]; nextLtr3 = ' '; if(ii < (lastChr - 2)) nextLtr3 = ename[ii + 3]; switch (curLtr) { case 'B': silent = 0; if(ii == lastChr && prevLtr == 'M') silent = 1; if(!silent) strncat(buf, &curLtr, 1); break; /* silent -sci-,-sce-,-scy-; sci-, etc OK */ case 'C': if(!(ii > 1 && prevLtr == 'S' && frontvAfter)) if(ii > 0 && nextLtr == 'I' && nextLtr2 == 'A') strncat(buf, "X", 1); else if(frontvAfter) strncat(buf, "S", 1); else if(ii > 1 && prevLtr == 'S' && nextLtr == 'H') strncat(buf, "K", 1); else if(nextLtr == 'H') if(ii == 0 && (strchr(VOWELS, nextLtr2) == nil)) strncat(buf, "K", 1); else strncat(buf, "X", 1); else if(prevLtr == 'C') strncat(buf, "C", 1); else strncat(buf, "K", 1); break; case 'D': if(nextLtr == 'G' && (strchr(FRONTV, nextLtr2) != nil)) strncat(buf, "J", 1); else strncat(buf, "T", 1); break; case 'G': silent = 0; /* SILENT -gh- except for -gh and no vowel after h */ if((ii < (lastChr - 1) && nextLtr == 'H') && (strchr(VOWELS, nextLtr2) == nil)) silent = 1; if((ii == (lastChr - 3)) && nextLtr == 'N' && nextLtr2 == 'E' && nextLtr3 == 'D') silent = 1; else if((ii == (lastChr - 1)) && nextLtr == 'N') silent = 1; if(prevLtr == 'D' && frontvAfter) silent = 1; if(prevLtr == 'G') hard = 1; else hard = 0; if(!silent) if(frontvAfter && (!hard)) strncat(buf, "J", 1); else strncat(buf, "K", 1); break; case 'H': silent = 0; if(strchr(VARSON, prevLtr) != nil) silent = 1; if(vowelBefore && !vowelAfter) silent = 1; if(!silent) strncat(buf, &curLtr, 1); break; case 'F': case 'J': case 'L': case 'M': case 'N': case 'R': strncat(buf, &curLtr, 1); break; case 'K': if(prevLtr != 'C') strncat(buf, &curLtr, 1); break; case 'P': if(nextLtr == 'H') strncat(buf, "F", 1); else strncat(buf, "P", 1); break; case 'Q': strncat(buf, "K", 1); break; case 'S': if(ii > 1 && nextLtr == 'I' && (nextLtr2 == 'O' || nextLtr2 == 'A')) strncat(buf, "X", 1); else if(nextLtr == 'H') strncat(buf, "X", 1); else strncat(buf, "S", 1); break; case 'T': if(ii > 1 && nextLtr == 'I' && (nextLtr2 == 'O' || nextLtr2 == 'A')) strncat(buf, "X", 1); else if(nextLtr == 'H') /* The=0, Tho=T, Withrow=0 */ if(ii > 0 || (strchr(VOWELS, nextLtr2) != nil)) strncat(buf, "0", 1); else strncat(buf, "T", 1); else if(!(ii < (lastChr - 2) && nextLtr == 'C' && nextLtr2 == 'H')) strncat(buf, "T", 1); break; case 'V': strncat(buf, "F", 1); break; case 'W': case 'Y': if(ii < lastChr && vowelAfter) strncat(buf, &curLtr, 1); break; case 'X': strncat(buf, "KS", 2); break; case 'Z': strncat(buf, "S", 1); break; } } /* * DON'T DO THIS NOW, REMOVING "S" IN BEGINNING HAS the same effect * with plurals, in addition imbedded S's in the Metaphone are included * * Lng = strlen(buf); * lastChr = Lng -1; if (buf[lastChr] == 'S' && Lng >= 3 ) * buf[lastChr] = '\0'; */ } /**********************************************************/ /* * Lifted from /sys/src/ape/lib/ap/gen/bsearch.c * and stripped of it's ANSI-isms */ void* bsearch(void* key, void* base, int nmemb, int size, int (*compar)(void*, void*)) { long i, bot, top, new; void *p; bot = 0; top = bot + nmemb - 1; while(bot <= top){ new = (top + bot)/2; p = (char *)base+new*size; i = (*compar)(key, p); if(i == 0) return p; if(i > 0) bot = new + 1; else top = new - 1; } return 0; } /**********************************************************/ int min3(int a, int b, int c) { int min = a; if(b < min) min = b; if(c < min) min = c; return min; } int levenshtein(char *s, char *t) { static int *d = nil; static int alloc = 0; int k, i, j, n, m, cost, distance; n = strlen(s); m = strlen(t); if(n == 0 || m == 0) return -1; /* * This leaks once on exit, but who cares */ if((m+1)*(n+1) > alloc){ free(d); alloc = (m+1)*(n+1)+32; if((d = malloc(sizeof(int)*alloc)) == nil) sysfatal("no memory\n"); } m++; n++; for(k = 0; k < n; k++) d[k] = k; for(k = 0; k < m; k++) d[k*n] = k; for(i = 1; i < n; i++) for(j = 1; j < m; j++) { cost = (s[i-1] == t[j-1])? 0: 1; d[j*n+i] = min3(d[(j-1)*n+i]+1, d[j*n+i-1]+1, d[(j -1)*n+i-1]+cost); #ifdef NOT_DEFINED if(i>=2 && j>=2 && (d[j*n+i]-d[(j-2)*n+i-2]==2) && (s[i-2]==t[j-1]) && (s[i-1]==t[j-2])) d[j*n+i]--; #endif } distance = d[n*m-1]; return distance; } /**********************************************************/ int enc(char c) { if(! isalpha(c)) return Nalpha; if(isupper(c)) c = tolower(c); return c - 'a'; } void triagrams(char *word) { char *p; if(word[0] == 0 || word[1] == 0) // too short return; Ntri++; p = word; Tri[Begword][enc(p[0])][enc(p[1])]++; for(; p[0] && p[1] && p[2]; p++) Tri[enc(p[0])][enc(p[1])][enc(p[2])]++; Tri[enc(p[0])][enc(p[1])][Endword]++; } double triprob(char *p, int l, int i) { double n; if(i > l-2) return 0.5; // too short if(i == 0) n = Tri[Begword][enc(p[0])][enc(p[1])]; else if(i == l-1) n = Tri[enc(p[i])][enc(p[i+1])][Endword]; else n = Tri[enc(p[i-1])][enc(p[i+0])][enc(p[i+1])]; return n / (double)Ntri; } /**********************************************************/ int tabcmp(void *a, void *b) { Table *x = (Table *)a; Table *y = (Table *)b; return memcmp(x->code, y->code, Ncode); } int rescmp(void *a, void *b) { Result **x = (Result **)a; Result **y = (Result **)b; if((*x)->dist == (*y)->dist) return strcmp((*x)->word, (*y)->word); return (*x)->dist - (*y)->dist; } void ingest(void) { char *p; vlong off; int alloc; off = 0; Tab = nil; Ntab = alloc = 0; while((p = Brdline(Words, '\n')) != nil){ p[Blinelen(Words) -1] = 0; triagrams(p); if(Ntab >= alloc){ alloc += 128; if((Tab = realloc(Tab, alloc*sizeof(Table))) == nil) sysfatal("No memory: %r\n"); } metaphone(p, Tab[Ntab].code, Ncode); Tab[Ntab].off = off; Ntab++; off = Bseek(Words, 0, 1); } qsort(Tab, Ntab, sizeof(Table), tabcmp); } void results(char *word, Result *root) { int n, i; char *prev; Result **idx, *r; n = 0; for(r = root; r; r = r->next) n++; if((idx = malloc(n * sizeof(Result *))) == nil) sysfatal("No memory: %r\n"); i = 0; for(r = root; r; r = r->next) idx[i++] = r; qsort(idx, n, sizeof(Result *), rescmp); print("%-16s → ", word); prev = ""; for(i = 0; i < n; i++){ r = idx[i]; if(strcmp(r->word, prev) == 0) continue; print("%s ", r->word); if(r->dist == 0) break; if((i > Idx && r->dist > Dist) || i > Maxidx) break; prev = r->word; } print("\n"); free(idx); } int lookup(Result **root, char *word, Table *tp) { Result *r; char l, *p; Table *first, *last; for(r = *root; r; r = r->next) if(memcmp(r->code, tp->code, Ncode) == 0) // already tested return 0; for(first = tp; tabcmp(tp, first) == 0 && first >= Tab; first--) continue; first++; for(last = tp; tabcmp(tp, last) == 0; last++) continue; for(tp = first; tp < last; tp++){ Bseek(Words, tp->off, 0); if((p = Brdline(Words, '\n')) == nil){ fprint(2, "%s - read failed at off=%lld: %r\n", Wordfile, tp->off); continue; } l = Blinelen(Words) -2; p[l+1] = 0; if((r = malloc(sizeof(Result)+strlen(p)+1)) == nil) sysfatal("No memory"); r->dist = levenshtein(word, p); memcpy(r->code, tp->code, Ncode); r->word = ((char *)r)+sizeof(Result); strcpy(r->word, p); r->next = *root; *root = r; if(r->dist == 0) return 1; /* * Workaround for feature of deroff where it strips * trailing periods from all words, even abbreviations. */ if(p[l] == '.' && strncmp(p, word, l-1) == 0){ r->dist = 0; return 1; } } return 0; } void permute(char *word) { char *buf; int i, j, l; Table t, *tp; Result *r, *nr, *root; root = nil; l = strlen(word); if((buf = malloc(l+2)) == nil) // 2 == 1 insertion + string terminator sysfatal("No memory %r\n"); /* unchanged */ metaphone(word, t.code, Ncode); if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil){ Generated++; if(lookup(&root, word, tp) == 1) Unknown++; } /* deletion */ for(i = 0; i < l; i++){ Generated++; strncpy(buf, word, i); strcpy(buf+i, word+i+1); if(triprob(buf, l, i) == 0){ Rejected++; continue; } metaphone(buf, t.code, Ncode); if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil) lookup(&root, word, tp); Unknown++; } /* modification */ for(j = 'a'; j <= 'z'; j++){ for(i = 0; i < l; i++){ Generated++; strcpy(buf, word); buf[i] = j; if(triprob(buf, l, i) == 0){ Rejected++; continue; } metaphone(buf, t.code, Ncode); if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil) lookup(&root, word, tp); Unknown++; } } /* transposition */ for(j = 0; j < l-1; j++){ for(i = j+1; i < l; i++){ Generated++; strcpy(buf, word); buf[i] = word[j]; buf[j] = word[i]; if(triprob(buf, l, i) == 0 || triprob(buf, l, j) == 0){ Rejected++; continue; } metaphone(buf, t.code, Ncode); if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil) lookup(&root, word, tp); Unknown++; } } /* insertion */ for(j = 'a'; j <= 'z'; j++){ for(i = 0; i < l; i++){ Generated++; strncpy(buf, word, i); buf[i+1] = j; strcpy(buf+i+1, word+i); if(triprob(buf, l, i+1) == 0){ Rejected++; continue; } metaphone(buf, t.code, Ncode); if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil) lookup(&root, word, tp); Unknown++; } } results(word, root); free(buf); for(r = root; r; r = nr){ nr = r->next; free(r); } } void usage(void) { fprint(2, "usage: %s [-s] [-d dict] [words]\n", argv0); fprint(2, " -d dict use dict rather than %s\n", Wordfile); fprint(2, " -s print stats at exit\n"); exits("usage"); } void main(int argc, char *argv[]) { char *p; Biobuf bi; int i, j, stats; stats = 0; ARGBEGIN{ case 's': stats++; break; case 'd': Wordfile = EARGF(usage()); break; default: usage(); break; }ARGEND; if((Words = Bopen(Wordfile, OREAD)) == nil) sysfatal("%s - cannot open: %r\n", Wordfile); if((Tri = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil) sysfatal("No memory: %r\n"); for(i = 0; i < Trilen; i++){ if((Tri[i] = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil) sysfatal("No memory: %r\n"); for(j = 0; j < Trilen; j++) if((Tri[i][j] = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil) sysfatal("No memory: %r\n"); } ingest(); if(argc){ for(i = 0; i < argc; i++) permute(argv[i]); } else{ Binit(&bi, OREAD, 0); while((p = Brdline(&bi, '\n')) != nil){ for(i = Blinelen(&bi) -1; isspace(p[i]) && i >= 0; i--) p[i] = 0; for(; isspace(*p); p++) continue; if(*p == 0) continue; permute(p); } Bterm(&bi); } free(Tab); Bterm(Words); if(stats) print("generated: %llud rejected: %llud unknown: %llud\n", Generated, Rejected, Unknown); exits(0); }