/*
  * sherlock.c - 
  *	Originally written by Loki from Rob Pike's
  *	sig and comp programs. 
  *	Ported to Plan 9 by Akshat Kumar.
  */

#include <u.h>
#include <libc.h>
#include <bio.h>

int		Ntoken = 3;
int		Zerobits = 4;
ulong	zeromask;
char **	token;
int		Thresh = 20;

/* characters to ignore at start and end of each word */
char *		Ignore = " \t\n";

/* characters to treat as word-separators or words on their own */
char *		Punct_full = ",.<>/?;:'\"`~[]{}\\|!@#$%^&*()-+_=";
char *		Punct = "";

typedef struct Sig Sig;
struct Sig
{
	int		nval;
	ulong	*val;
};

Sig *	signature(Biobuf *);
int	compare(Sig *, Sig *);

void usage(void)
{
	fprint(2, "usage: %s [-t thresh] [-z zbits] [-n ntoks]"
		" file1 file2 ...\n", argv0);
	exits("usage");
}

void main(int argc, char *argv[])
{
	int f, i, j, percent;
	Biobuf bin;
	Sig **sig;
	char *err;

	ARGBEGIN {
		case 't':
			Thresh = atoi(EARGF(usage()));
			break;
		case 'z':
			Zerobits = atoi(EARGF(usage()));
			break;
		case 'n':
			Ntoken = atoi(EARGF(usage()));
			break;
		default:
			usage();
	} ARGEND;

	if (Thresh < 0 || Thresh > 100) {
		fprint(2, "%s: threshold must be between 0 and 100\n", argv0);
		exits("threshold");
	}

	if (Zerobits < 0 || Zerobits > 31) {
		fprint(2, "%s: zerobits must be between 0 and 31\n", argv0);
		exits("zerobits");
	}

	if (Ntoken <= 0) {
		fprint(2, "%s: Ntoken must be greater than 0\n", argv0);
		exits("ntoken");
	}

	if (argc < 2)
		usage();

	token = mallocz(Ntoken * sizeof(*token), 1);
	zeromask = (1<<Zerobits)-1;
	sig = mallocz(argc * sizeof(*sig), 1);

	err = nil;
	for (i=0; i < argc; i++) {
		f = open(argv[i], OREAD);
		if (f < 0) {
			fprint(2, "%s: can't open %s: %r\n", argv0, argv[i]);
			err = "open";
		} else {
			Binit(&bin, f, OREAD);
			sig[i] = signature(&bin);
			Bterm(&bin);
		}
	}

	/* compare each signature pair-wise */
	for (i=0; i < argc; i++) {
		for (j=i+1; j < argc; j++) {
			if (sig[i] == nil || sig[j] == nil)
				continue;
			percent = compare(sig[i], sig[j]);
			if (percent >= Thresh)
				print("%s and %s: %d%%\n",
					argv[i], argv[j], percent);
		}
	}

	exits(err);
}

char * read_word(Biobuf *bin, int *length, char *ignore, char *punct)
{
	long max;
        char *word;
        long pos;
        char *c;
        int ch, is_ignore, is_punct;

        pos = 0;
	max = 128;
        word = malloc(sizeof(*word) * max);
        c = &word[pos];

	if (!ignore)
		ignore = "";
	if (!punct)
		punct = "";

        /* read characters into the buffer, resizing it if necessary */
        while ((ch = Bgetc(bin)) >= 0) {
		is_ignore = (strchr(ignore, ch) != nil);
		if (pos == 0) {
			if (is_ignore)
				continue;
		}
		if (is_ignore)
			break;
		is_punct = (strchr(punct, ch) != nil);
		if (is_punct && (pos > 0)) {
			Bungetc(bin);
			break;
		}
                *c = ch;
                c++;
                pos++;
		if (is_punct)
			break;
                if (pos == max) {
                        max += max;
                        word = realloc(word, max);
                        c = & word[pos];
                }
        }

        /* set length and check for EOF condition */
        *length = pos;
        if (pos == 0) {
                free(word);
                return nil;
        }

        /* terminate the string and shrink to smallest required space */
        *c = '\0';
        word = realloc(word, pos+1);
        return word;
}

/* ulcmp:  compare *p1 and *p2 */
int ulcmp(const void *p1, const void *p2)
{
	ulong v1, v2;

	v1 = *(ulong *) p1;
	v2 = *(ulong *) p2;
	if (v1 < v2)
		return -1;
	else if (v1 == v2)
		return 0;
	else
		return 1;
}

ulong hash(char *tok[])
{
	ulong h;
	uchar *s;
	int i;

	h = 0;
	for (i=0; i < Ntoken; i++)
		for (s=(uchar*)tok[i]; *s; s++)
			h = h*31 + *s;
	return h;
}

Sig * signature(Biobuf *bin)
{
	int nv, na;
	ulong *v, h;
	char *str;
	int i, ntoken;
	Sig *sig;

	v = nil;
	na = 0;
	nv = 0;
	ntoken = 0;
	while ((str = read_word(bin, &i, Ignore, Punct)) != nil)
	{
		free(token[0]);
		for (i=0; i < Ntoken-1; i++)
			token[i] = token[i+1];
		token[Ntoken-1] = str;

		ntoken++;
		if (ntoken < Ntoken)
			continue;

		h = hash(token);
		if ((h & zeromask) != 0)
			continue;

		h = h >> Zerobits;

		if (nv == na) {
			na += 100;
			v = realloc(v, na*sizeof(ulong));
		}
		v[nv++] = h;
	}

	/* sort the array of hash values for speed */
	qsort(v, nv, sizeof(v[0]), ulcmp);

	sig = malloc(sizeof(Sig));
	sig->nval = nv;
	sig->val = v;
	return sig;
}

int compare(Sig *s0, Sig *s1)
{
	int i0, i1, nboth, nsimilar;
	ulong v;

	i0 = 0;
	i1 = 0;
	nboth = 0;
	while (i0 < s0->nval && i1 < s1->nval) {
		if (s0->val[i0] == s1->val[i1]) {
			v = s0->val[i0];
			while (i0 < s0->nval && v == s0->val[i0]) {
				i0++;
				nboth++;
			}
			while (i1 < s1->nval && v == s1->val[i1]) {
				i1++;
				nboth++;
			}
			continue;
		}
		if (s0->val[i0] < s1->val[i1])
			i0++;
		else
			i1++;
	}

	if (s0->nval + s1->nval == 0)
		return 0;	/* ignore if both are empty files */

	if (s0->nval + s1->nval == nboth)
		return 100;	/* perfect match if all hash codes match */

	nsimilar = nboth / 2;
	return 100 * nsimilar / (s0->nval + s1->nval - nsimilar);
}