/* * sherlock.c - * Originally written by Loki from Rob Pike's * sig and comp programs. * Ported to Plan 9 by Akshat Kumar. */ #include #include #include int Ntoken = 3; int Zerobits = 4; ulong zeromask; char ** token; int Thresh = 20; /* characters to ignore at start and end of each word */ char * Ignore = " \t\n"; /* characters to treat as word-separators or words on their own */ char * Punct_full = ",.<>/?;:'\"`~[]{}\\|!@#$%^&*()-+_="; char * Punct = ""; typedef struct Sig Sig; struct Sig { int nval; ulong *val; }; Sig * signature(Biobuf *); int compare(Sig *, Sig *); void usage(void) { fprint(2, "usage: %s [-t thresh] [-z zbits] [-n ntoks]" " file1 file2 ...\n", argv0); exits("usage"); } void main(int argc, char *argv[]) { int f, i, j, percent; Biobuf bin; Sig **sig; char *err; ARGBEGIN { case 't': Thresh = atoi(EARGF(usage())); break; case 'z': Zerobits = atoi(EARGF(usage())); break; case 'n': Ntoken = atoi(EARGF(usage())); break; default: usage(); } ARGEND; if (Thresh < 0 || Thresh > 100) { fprint(2, "%s: threshold must be between 0 and 100\n", argv0); exits("threshold"); } if (Zerobits < 0 || Zerobits > 31) { fprint(2, "%s: zerobits must be between 0 and 31\n", argv0); exits("zerobits"); } if (Ntoken <= 0) { fprint(2, "%s: Ntoken must be greater than 0\n", argv0); exits("ntoken"); } if (argc < 2) usage(); token = mallocz(Ntoken * sizeof(*token), 1); zeromask = (1<= Thresh) print("%s and %s: %d%%\n", argv[i], argv[j], percent); } } exits(err); } char * read_word(Biobuf *bin, int *length, char *ignore, char *punct) { long max; char *word; long pos; char *c; int ch, is_ignore, is_punct; pos = 0; max = 128; word = malloc(sizeof(*word) * max); c = &word[pos]; if (!ignore) ignore = ""; if (!punct) punct = ""; /* read characters into the buffer, resizing it if necessary */ while ((ch = Bgetc(bin)) >= 0) { is_ignore = (strchr(ignore, ch) != nil); if (pos == 0) { if (is_ignore) continue; } if (is_ignore) break; is_punct = (strchr(punct, ch) != nil); if (is_punct && (pos > 0)) { Bungetc(bin); break; } *c = ch; c++; pos++; if (is_punct) break; if (pos == max) { max += max; word = realloc(word, max); c = & word[pos]; } } /* set length and check for EOF condition */ *length = pos; if (pos == 0) { free(word); return nil; } /* terminate the string and shrink to smallest required space */ *c = '\0'; word = realloc(word, pos+1); return word; } /* ulcmp: compare *p1 and *p2 */ int ulcmp(const void *p1, const void *p2) { ulong v1, v2; v1 = *(ulong *) p1; v2 = *(ulong *) p2; if (v1 < v2) return -1; else if (v1 == v2) return 0; else return 1; } ulong hash(char *tok[]) { ulong h; uchar *s; int i; h = 0; for (i=0; i < Ntoken; i++) for (s=(uchar*)tok[i]; *s; s++) h = h*31 + *s; return h; } Sig * signature(Biobuf *bin) { int nv, na; ulong *v, h; char *str; int i, ntoken; Sig *sig; v = nil; na = 0; nv = 0; ntoken = 0; while ((str = read_word(bin, &i, Ignore, Punct)) != nil) { free(token[0]); for (i=0; i < Ntoken-1; i++) token[i] = token[i+1]; token[Ntoken-1] = str; ntoken++; if (ntoken < Ntoken) continue; h = hash(token); if ((h & zeromask) != 0) continue; h = h >> Zerobits; if (nv == na) { na += 100; v = realloc(v, na*sizeof(ulong)); } v[nv++] = h; } /* sort the array of hash values for speed */ qsort(v, nv, sizeof(v[0]), ulcmp); sig = malloc(sizeof(Sig)); sig->nval = nv; sig->val = v; return sig; } int compare(Sig *s0, Sig *s1) { int i0, i1, nboth, nsimilar; ulong v; i0 = 0; i1 = 0; nboth = 0; while (i0 < s0->nval && i1 < s1->nval) { if (s0->val[i0] == s1->val[i1]) { v = s0->val[i0]; while (i0 < s0->nval && v == s0->val[i0]) { i0++; nboth++; } while (i1 < s1->nval && v == s1->val[i1]) { i1++; nboth++; } continue; } if (s0->val[i0] < s1->val[i1]) i0++; else i1++; } if (s0->nval + s1->nval == 0) return 0; /* ignore if both are empty files */ if (s0->nval + s1->nval == nboth) return 100; /* perfect match if all hash codes match */ nsimilar = nboth / 2; return 100 * nsimilar / (s0->nval + s1->nval - nsimilar); }