orig/sherlock.c:2,13 c sherlock.c:2,6 < * sherlock.c - written by Loki from Rob Pike's sig and comp programs. < * < * This program takes filenames given on the command line, < * and reads those files into memory, then compares them < * all pairwise to find those which are most similar. < * < * It uses a digital signature generation scheme to randomly < * discard information, thus allowing a better match. < * Essentially it hashes up N adjacent 'words' of input, < * and semi-randomly throws away many of the hashed values < * so that it become hard to hide the plagiarised text. < */ --- > * sherlock.c - > * Originally written by Loki from Rob Pike's > * sig and comp programs. > * Ported to Plan 9 by Akshat Kumar. > */ orig/sherlock.c:15,17 c sherlock.c:8,10 < #include < #include < #include --- > #include > #include > #include orig/sherlock.c:19 d sherlock.c:11 < char * Progname = "sherlock"; orig/sherlock.c:22,25 c sherlock.c:14,15 < unsigned long zeromask; < int ntoken = 0; < char ** token; < FILE * Outfile; --- > ulong zeromask; > char ** token; orig/sherlock.c:39 c sherlock.c:29 < unsigned long *val; --- > ulong *val; orig/sherlock.c:42,43 c sherlock.c:32 < void init_token_array(void); < Sig * signature(FILE *); --- > Sig * signature(Biobuf *); orig/sherlock.c:48,66 c sherlock.c:37,39 < fprintf(stderr, "%s: find similar files\n", Progname); < < fprintf(stderr, "usage: %s", Progname); < fprintf(stderr, " [options] file1 file2 ...\n"); < < fprintf(stderr, "options:"); < fprintf(stderr, " [-t threshold%%]"); < fprintf(stderr, " [-z zerobits]"); < fprintf(stderr, " [-n chainlength]"); < fprintf(stderr, " [-o outfile]"); < fprintf(stderr, "\n"); < < fprintf(stderr, "defaults:"); < fprintf(stderr, " threshold=20%%"); < fprintf(stderr, " zerobits=3"); < fprintf(stderr, " chainlength=4"); < fprintf(stderr, " outfile=the screen"); < fprintf(stderr, "\n"); < exit(2); --- > fprint(2, "usage: %s [-t thresh] [-z zbits] [-n ntoks]" > " file1 file2 ...\n", argv0); > exits("usage"); orig/sherlock.c:69 c sherlock.c:42 < int main(int argc, char *argv[]) --- > void main(int argc, char *argv[]) orig/sherlock.c:71,73 c sherlock.c:44,45 < FILE *f; < int i, j, nfiles, start, percent; < char *s, *outname; --- > int f, i, j, percent; > Biobuf bin; orig/sherlock.c:74 a sherlock.c:47 > char *err; orig/sherlock.c:76,83 c sherlock.c:49 < Outfile = stdout; < outname = NULL; < < /* handle options */ < for (start=1; start < argc; start++) { < if (argv[start][0] != '-') < break; < switch (argv[start][1]) { --- > ARGBEGIN { orig/sherlock.c:85,90 c sherlock.c:51 < s = argv[++start]; < if (s == NULL) < usage(); < Thresh = atoi(s); < if (Thresh < 0 || Thresh > 100) < usage(); --- > Thresh = atoi(EARGF(usage())); orig/sherlock.c:93,98 c sherlock.c:54 < s = argv[++start]; < if (s == NULL) < usage(); < Zerobits = atoi(s); < if (Zerobits < 0 || Zerobits > 31) < usage(); --- > Zerobits = atoi(EARGF(usage())); orig/sherlock.c:101,106 c sherlock.c:57 < s = argv[++start]; < if (s == NULL) < usage(); < Ntoken = atoi(s); < if (Ntoken <= 0) < usage(); --- > Ntoken = atoi(EARGF(usage())); orig/sherlock.c:108,113 d sherlock.c:58 < case 'o': < s = argv[++start]; < if (s == NULL) < usage(); < outname = s; < break; orig/sherlock.c:116 c sherlock.c:61,65 < } --- > } ARGEND; > > if (Thresh < 0 || Thresh > 100) { > fprint(2, "%s: threshold must be between 0 and 100\n", argv0); > exits("threshold"); orig/sherlock.c:119,120 c sherlock.c:68,78 < nfiles = argc - start; < if (nfiles < 2) --- > if (Zerobits < 0 || Zerobits > 31) { > fprint(2, "%s: zerobits must be between 0 and 31\n", argv0); > exits("zerobits"); > } > > if (Ntoken <= 0) { > fprint(2, "%s: Ntoken must be greater than 0\n", argv0); > exits("ntoken"); > } > > if (argc < 2) orig/sherlock.c:123,126 c sherlock.c:81 < /* initialise */ < if (outname != NULL) < Outfile = fopen(outname, "w"); < init_token_array(); --- > token = mallocz(Ntoken * sizeof(*token), 1); orig/sherlock.c:128 c sherlock.c:83 < sig = malloc(nfiles * sizeof(Sig *)); --- > sig = mallocz(argc * sizeof(*sig), 1); orig/sherlock.c:130,138 c sherlock.c:85,94 < /* generate signatures for each file */ < for (i=0; i < nfiles; i++) { < /* fprintf(stderr, "%s: Reading %s\n", Progname, argv[i+start]); */ < f = fopen(argv[i+start], "r"); < if (f == NULL) { < fprintf(stderr, "%s: can't open %s:", < Progname, argv[i+start]); < perror(NULL); < continue; --- > err = nil; > for (i=0; i < argc; i++) { > f = open(argv[i], OREAD); > if (f < 0) { > fprint(2, "%s: can't open %s: %r\n", argv0, argv[i]); > err = "open"; > } else { > Binit(&bin, f, OREAD); > sig[i] = signature(&bin); > Bterm(&bin); orig/sherlock.c:140,141 d sherlock.c:95 < sig[i] = signature(f); < fclose(f); orig/sherlock.c:145,146 c sherlock.c:99,102 < for (i=0; i < nfiles; i++) < for (j=i+1; j < nfiles; j++) { --- > for (i=0; i < argc; i++) { > for (j=i+1; j < argc; j++) { > if (sig[i] == nil || sig[j] == nil) > continue; orig/sherlock.c:149,150 c sherlock.c:105,106 < fprintf(Outfile, "%s and %s: %d%%\n", < argv[i+start], argv[j+start], percent); --- > print("%s and %s: %d%%\n", > argv[i], argv[j], percent); orig/sherlock.c:151 a sherlock.c:108 > } orig/sherlock.c:153 c sherlock.c:110 < return 0; --- > exits(err); orig/sherlock.c:156,162 c sherlock.c:113 < /* read_word: read a 'word' from the input, ignoring leading characters < which are inside the 'ignore' string, and stopping if one of < the 'ignore' or 'punct' characters is found. < Uses memory allocation to avoid buffer overflow problems. < */ < < char * read_word(FILE *f, int *length, char *ignore, char *punct) --- > char * read_word(Biobuf *bin, int *length, char *ignore, char *punct) orig/sherlock.c:170,176 d sherlock.c:120 < /* check for EOF first */ < if (feof(f)) { < length = 0; < return NULL; < } < < /* allocate a buffer to hold the string */ orig/sherlock.c:179,180 c sherlock.c:123,124 < word = malloc(sizeof(char) * max); < c = & word[pos]; --- > word = malloc(sizeof(*word) * max); > c = &word[pos]; orig/sherlock.c:182,183 c sherlock.c:126 < /* initialise some defaults */ < if (ignore == NULL) --- > if (!ignore) orig/sherlock.c:185 c sherlock.c:128 < if (punct == NULL) --- > if (!punct) orig/sherlock.c:189,190 c sherlock.c:132,133 < while ((ch = getc(f)) != EOF) { < is_ignore = (strchr(ignore, ch) != NULL); --- > while ((ch = Bgetc(bin)) >= 0) { > is_ignore = (strchr(ignore, ch) != nil); orig/sherlock.c:193 d sherlock.c:135 < /* ignorable char found at start, skip it */ orig/sherlock.c:197 d sherlock.c:138 < /* ignorable char found after start, stop */ orig/sherlock.c:199 c sherlock.c:140 < is_punct = (strchr(punct, ch) != NULL); --- > is_punct = (strchr(punct, ch) != nil); orig/sherlock.c:201 c sherlock.c:142 < ungetc(ch, f); --- > Bungetc(bin); orig/sherlock.c:210 d sherlock.c:150 < /* realloc buffer twice the size */ orig/sherlock.c:221 c sherlock.c:161 < return NULL; --- > return nil; orig/sherlock.c:233 c sherlock.c:173 < unsigned long v1, v2; --- > ulong v1, v2; orig/sherlock.c:235,236 c sherlock.c:175,176 < v1 = *(unsigned long *) p1; < v2 = *(unsigned long *) p2; --- > v1 = *(ulong *) p1; > v2 = *(ulong *) p2; orig/sherlock.c:245,246 c sherlock.c:185 < /* hash: hash an array of char* into an unsigned long hash code */ < unsigned long hash(char *tok[]) --- > ulong hash(char *tok[]) orig/sherlock.c:248,249 c sherlock.c:187,188 < unsigned long h; < unsigned char *s; --- > ulong h; > uchar *s; orig/sherlock.c:254 c sherlock.c:193 < for (s=(unsigned char*)tok[i]; *s; s++) --- > for (s=(uchar*)tok[i]; *s; s++) orig/sherlock.c:259 c sherlock.c:198 < void init_token_array(void) --- > Sig * signature(Biobuf *bin) orig/sherlock.c:261,270 d sherlock.c:199 < int i; < < /* create global array of char* and initialise all to NULL */ < token = malloc(Ntoken * sizeof(char*)); < for (i=0; i < Ntoken; i++) < token[i] = NULL; < } < < Sig * signature(FILE *f) < { orig/sherlock.c:272 c sherlock.c:201 < unsigned long *v, h; --- > ulong *v, h; orig/sherlock.c:277,278 c sherlock.c:206 < /* start loading hash values, after we have Ntoken of them */ < v = NULL; --- > v = nil; orig/sherlock.c:282 c sherlock.c:210 < while ((str = read_word(f, &i, Ignore, Punct)) != NULL) --- > while ((str = read_word(bin, &i, Ignore, Punct)) != nil) orig/sherlock.c:284 d sherlock.c:211 < /* step words down by one */ orig/sherlock.c:288 d sherlock.c:214 < /* add new word into array */ orig/sherlock.c:291 d sherlock.c:216 < /* if we don't yet have enough words in the array continue */ orig/sherlock.c:296 d sherlock.c:220 < /* hash the array of words */ orig/sherlock.c:301 d sherlock.c:224 < /* discard zeros from end of hash value */ orig/sherlock.c:304 d sherlock.c:226 < /* add value into the signature array, resizing if needed */ orig/sherlock.c:307 c sherlock.c:229 < v = realloc(v, na*sizeof(unsigned long)); --- > v = realloc(v, na*sizeof(ulong)); orig/sherlock.c:315 d sherlock.c:236 < /* allocate and return the Sig structure for this file */ orig/sherlock.c:325 c sherlock.c:246 < unsigned long v; --- > ulong v; orig/sherlock.c:358,377 d sherlock.c:278 < < /* < * Let f1 == filesize(file1) == A+B < * and f2 == filesize(file2) == A+C < * where A is the similar section and B or C are dissimilar < * < * Similarity = 100 * A / (f1 + f2 - A) < * = 100 * A / (A+B + A+C - A) < * = 100 * A / (A+B+C) < * < * Thus if A==B==C==n the similarity will be 33% (one third) < * This is desireable since we are finding the ratio of similarities < * as a fraction of (similarities+dissimilarities). < * < * The other way of doing things would be to find the ratio of < * the sum of similarities as a fraction of total file size: < * Similarity = 100 * (A+A) / (A+B + A+C) < * This produces higher percentages and more false matches. < */ <