/*
 * NOTE(review): the four #include directives below carry no header name
 * in this copy of the file; the names appear to have been lost and must
 * be restored from the original source -- confirm.
 */
#include
#include
#include
#include
#include "dfa.h"

/*
 * Regular expressions used for matching.
 *
 * ignore[] lists patterns for text that is to be skipped; keywords[]
 * lists patterns for the tokens of interest.  How the two tables are
 * combined is decided by buildre(), which is only partly visible here.
 */
char *ignore[] = {
	/*
	 * HTML tags other than A, IMG, or FONT.
	 * The tag must contain whitespace somewhere before the closing '>'.
	 * NOTE(review): the original comment read "Must have a space
	 * somewhere to avoid catching" and stops there; what it avoids
	 * catching is not stated in this copy -- confirm.
	 */
	"<[ \n\r]*("
		"[^aif]|"
		"a[^> \t\r\n]|"
		"i[^mM \t\r\n]|"
		"im[^gG \t\r\n]|"
		"img[^> \t\r\n]|"
		"f[^oO \t\r\n]|"
		"fo[^Nn \t\r\n]|"
		"fon[^tT \t\r\n]|"
		"font[^> \r\t\n]"
	")[^>]*[ \t\n\r][^>]*>",

	/* tags named exactly i, im, f, fo, or fon, followed by whitespace */
	"<[ \n\r]*("
		"i|im|f|fo|fon"
	")[ \t\r\n][^>]*>",

	/*
	 * ignore html comments
	 * NOTE(review): the pattern here is the empty string; it looks as
	 * if the real pattern was lost -- confirm against the original.
	 */
	"",

	/*
	 * random mail strings: selected header lines together with any
	 * following lines that begin with a space, plus assorted
	 * transport and MIME boilerplate.
	 */
	"^message-id:.*\n([ ].*\n)*",
	"^in-reply-to:.*\n([ ].*\n)*",
	"^references:.*\n([ ].*\n)*",
	"^date:.*\n([ ].*\n)*",
	"^delivery-date:.*\n([ ].*\n)*",
	"e?smtp id .*",
	"^ id.*",
	"boundary=.*",
	"name=\"",
	"filename=\"",
	"news:<[^>]+>",
	"^--[^ ]*$",

	/* base64 encoding: a whole line of base64 alphabet characters */
	"^[0-9a-zA-Z+\\-=/]+$",

	/* uu encoding: a whole line of characters in the range '!' to 'Z' */
	"^[!-Z]+$",

	/* little things: any single character, or a newline */
	".",
	"\n",
};

/*
 * A keyword is one or more of: an ASCII letter, one of the listed
 * punctuation marks, a character in the high range given in the class,
 * or a digit followed by any number of ('.' or ',' then digit) pairs.
 */
char *keywords[] = {
	"([a-zA-Z'`$!¡-￿]|[0-9]([.,][0-9])*)+",
};

/* when non-zero, dregcomp() prints every pattern before compiling it */
int debug;

/*
 * Compile the regular expression text in buf and convert the result
 * with dregcvt().
 *
 * Calls sysfatal() if either regcomp() or dregcvt() returns nil.
 * The intermediate Reprog is freed; the converted Dreprog is returned.
 */
Dreprog*
dregcomp(char *buf)
{
	Reprog *r;
	Dreprog *d;

	if(debug)
		print(">>> '%s'\n", buf);

	r = regcomp(buf);
	if(r == nil)
		sysfatal("regcomp");
	d = dregcvt(r);
	if(d == nil)
		sysfatal("dregcomp");
	/* only the converted form is kept */
	free(r);
	return d;
}

/*
 * Copy the pattern s to d, rewriting every lowercase ASCII letter that
 * is outside a bracket expression as the two-letter class [xX], so the
 * copied pattern accepts either case of that letter.
 *
 * cc counts '[' minus ']' seen so far; letters are rewritten only while
 * it is zero.  Every other character is copied unchanged.
 *
 * Returns a pointer just past the last byte written.  No terminating
 * NUL is written and no bound on d is checked; a rewritten letter takes
 * four bytes, so d needs room for up to four times the length of s.
 *
 * NOTE(review): esc is updated on backslashes but its value is never
 * read in this function, so a letter following a backslash is rewritten
 * like any other -- confirm whether that is intended.
 */
char*
strcpycase(char *d, char *s)
{
	int cc, esc;

	cc = 0;
	esc = 0;
	while(*s){
		/* track whether we are inside a [...] class */
		if(*s == '[')
			cc++;
		if(*s == ']')
			cc--;
		if(!cc && 'a' <= *s && *s <= 'z'){
			/* emit [xX] for the lowercase letter x */
			*d++ = '[';
			*d++ = *s;
			*d++ = *s+'A'-'a';
			*d++ = ']';
		}else
			*d++ = *s;
		/* bookkeeping only; see the note above about esc */
		if(*s == '\\')
			esc++;
		else if(esc)
			esc--;
		s++;
	}
	return d;
}

/*
 * Report a regular expression error and exit via sysfatal().
 * Presumably this is the error hook invoked by the regexp library;
 * nothing in the visible code calls it directly -- confirm.
 */
void
regerror(char *msg)
{
	sysfatal("regerror: %s", msg);
}

/*
 * Fill in the compiled expressions re[0..2].
 *
 * re[0] is compiled from the fixed pattern "^From ".  buf is a static
 * 16384-byte scratch buffer and s (also static) is set to its start
 * before the loop below.
 */
void
buildre(Dreprog *re[3])
{
	int i;
	static char buf[16384], *s;

	re[0] = dregcomp("^From ");

	s = buf;
	for(i=0; i