#!/bin/env -a 1800 python
# name: pipeto
# last update: 2008/12/04
# -Kenar-
#
# mode of this file should be 755
# chmod 755 pipeto
# user=sys.argv[1] # local!arisawa
# mbox=sys.argv[2] # /mail/box/arisawa/mbox
# NOTE: pwd is /usr/none
# NOTE: sys.argv[0] is /mail/box/arisawa/pipeto

# Debugging message
# 	pipeto -D <message
# Don't forget "From" line in the message


import sys
import os
from string import *
import re
# Matches the "From " separator line of a mail in the mailbox.
sep = re.compile("^From ")

# whitelist/blacklist
# Rules built into this file; the rules read from the "white" and "black"
# files are appended to them at startup.
# The first character of a rule selects one of 4 rule types:
#   &       client IP address
#   *       HELO host
#   ^       mail header pattern, string in ( ) will be shown in subject header
#   others  mail body pattern

whitelist = []

blacklist = [
	# don't remove. this data is used in fqdn check
	r"* [^ ]*(dial|dsl|adsl|xdsl|dynamic)[.-]",
]

# classification of a mail
white, black, grey = 1, 2, 3

# maximum accepted body length for each classification
maxbody = {
	white: 10000000,	# 10MB
	black: 100000,	# 100KB
	grey: 1000000,	# 1MB
}

# maximum accepted header length
maxhead = 100000	# 100KB

# one-line pushback buffer used by getline/ungetline
line0 = None

def logit(s): # for debug
	"""Append the string s to the log file /sys/log/pipeto.

	Nothing is added to s; the caller supplies any newline.
	"""
	f = open("/sys/log/pipeto", "a")
	try:
		f.write(s)
	finally:
		# close the log even when the write fails, so the handle is not leaked
		f.close()

def dnsquery(q,name="/net/dns"):
	"""
	Write the query q to the DNS file `name` and collect the answers.

	q has the form "<name> <type>".  For type "ptr" the dotted-quad
	address is rewritten to "<d>.<c>.<b>.<a>.in-addr.arpa ptr" first.
	Each answer is read back from the same file; the third
	whitespace-separated field of every answer is collected.

	Returns the list of collected fields, or None when nothing was
	collected.  An error while writing or reading ends the collection
	quietly (best effort); answers gathered before the error are kept.
	A malformed q (fewer than two words, or a "ptr" address with fewer
	than four parts) raises IndexError.

	usage:
	print dnsquery("ar.aichi-u.ac.jp ip")
	print dnsquery("ar.aichi-u.ac.jp ip","/net/dns")
	print dnsquery("202.250.160.40 ptr")
	"""
	# NOTE(review): os.ORDWR and os.seek are not part of CPython's standard
	# os module; presumably they come from the Plan 9 port -- confirm.
	f = os.open(name,os.ORDWR)
	try:
		os.seek(f,0,0)
		t = q.split()
		if t[1] == "ptr":
			u = t[0].split(".")
			q = ".".join((u[3],u[2],u[1],u[0])) + ".in-addr.arpa ptr"
		r = []
		try:	# NOTE: we need timer but we don't have timer!
			os.write(f,q)
			os.seek(f,0,0)
			v = os.read(f,256) # "" if not answered
			while v:
				r.append(v.split()[2])
				v = os.read(f,256)
		except Exception:
			# best effort: a failed or malformed answer just ends the query
			pass
	finally:
		# always release the descriptor, even if parsing q raised
		os.close(f)
	if not r:
		return None
	return r

def getline(f):
	"""Return the next line of the mail.

	A non-empty line pushed back with ungetline() is returned first and
	the pushback buffer is cleared; otherwise the line comes from
	f.readline().
	"""
	global line0
	if not line0:
		return f.readline()
	pending = line0
	line0 = None
	return pending
def ungetline(line):
	"""Push line back so that the next getline() call returns it.

	Only one line is remembered; a second call overwrites the first.
	"""
	global line0
	line0 = line


def gethead(f):
	"""Read the mail header from f.

	Lines are collected up to, but not including, the first empty line
	or the end of input.  A header line that looks like a "From "
	separator is prefixed with "BadHeader: ".  The empty line that ends
	the header is pushed back for the next reader.

	Returns (header, None) normally.  When the accumulated length of the
	lines read exceeds maxhead, the rest of the input is discarded and
	(header read so far + "\\n", "truncated") is returned.
	"""
	kept = []
	line = getline(f)
	total = len(line)	# length of everything read, including the current line
	while line and line != "\n":
		if sep.match(line): # illegal mail that should not happen
			line = "BadHeader: " + line
		if total > maxhead:
			# discard to the end of the mail
			while line:
				line = getline(f)
			return ("".join(kept) + "\n", "truncated")
		kept.append(line)
		line = getline(f)
		total = total + len(line)
	if line: # then the line is "\n"
		ungetline(line)
	return ("".join(kept), None)

def getbody(f,htype): # starting from empty line
	"""Read the mail body from f and classify it while reading.

	htype is the classification decided from the header (white, black or
	grey).  While the mail is still grey, each line is tested against the
	body whitelist (w3c) and then the body blacklist (b3c); only the first
	hit counts.  The size limit maxbody[htype] follows the classification.
	A body line that looks like a "From " separator is prefixed with a
	space.

	Returns (body, htype, status):
	  body   - the lines read, always ending with "\\n"
	  htype  - the possibly updated classification
	  status - None, "truncated" when the size limit was exceeded (reading
	           stops there), or the value returned by xsearch1 for the
	           matching blacklist pattern
	"""
	status = None
	limit = maxbody[htype]
	parts = []
	line = getline(f)
	n = len(line)	# length of everything read, including the current line
	while line:
		if n > limit: # so big
			status = "truncated"
			# we should inform the client that the mail is truncated,
			# but writing "552 Too much mail data." to stdout or stderr
			# here does not work.
			# I guess the work must be done in smtpd.
			break
		if htype == grey and status is None: # we detect only first matching
			# Use xsearch, not xsearch1, for the whitelist: xsearch1 returns ""
			# for a match without a captured label, and "" is false, so a
			# whitelist pattern written without ( ) would never fire.
			if xsearch(w3c,line):
				htype = white
				limit = maxbody[htype]
			else:
				status = xsearch1(b3c,line)
				if status is not None:
					htype = black
					limit = maxbody[htype]
		if sep.match(line):
			# this should be unnecessary, but smtpd sometimes makes a mistake
			# I don't know the reason.
			line = " " + line
		parts.append(line)
		line = getline(f)
		n = n + len(line)
	mbody = "".join(parts)
	if not mbody.endswith("\n"):
		mbody = mbody + "\n"
	return (mbody,htype,status)

def getlist(file):
	"""Read a rule file and return its rules as a list of strings.

	Every line is stripped of surrounding whitespace; empty lines and
	lines whose first non-blank character is '#' are dropped.
	"""
	g = open(file)
	try:
		lines = g.readlines()
	finally:
		# close the file even if reading fails
		g.close()
	rules = []
	for s in lines:
		t = s.strip()
		if t and t[0] != '#':
			rules.append(t)
	return rules

def classfy(list):
	"""Split a list of rule strings by rule type.

	The first character of each rule selects its type:
	  "&"  client IP rule; the first word after "&" is kept
	  "*"  HELO host pattern; the text after "*" is kept, stripped
	  "^"  header pattern; kept whole (including "^"), stripped
	  anything else is a body pattern; kept whole, stripped

	Empty entries and "&" rules without an address are skipped.
	Returns the tuple (clip, fqdn, headers, others).
	"""
	clip = [] # client ip
	fqdn = []
	headers = []
	others = []
	for x in list:
		if not x:
			continue
		kind = x[0]
		if kind == "&":
			# accept both "& addr" and "&addr"
			t = x[1:].split()
			if t:
				clip.append(t[0])
		elif kind == "*":
			fqdn.append(x[1:].strip())
		elif kind == "^":
			headers.append(x.strip())
		else:
			others.append(x.strip())
	return (clip,fqdn,headers,others)

def xmatch(e,s):
	"""Return e.match(s), or None when the pattern e or the string s is None."""
	if e is None:
		return None
	if s is None:
		return None
	return e.match(s)

def xsearch(e,s):
	"""Return e.search(s), or None when no pattern e is given."""
	if e:
		return e.search(s)
	return None

def xsearch1(e,s):
	"""Search s with the pattern e and return the label of the match.

	Returns None when e is not set or nothing matches.  On a match the
	first non-empty ( ) group is returned; when the match captured
	nothing, "" is returned.  The result is therefore None only when
	there is no match, so callers can test it with "!= None".
	"""
	if not e: # no pattern of this kind configured
		return None
	m = e.search(s)
	if not m:
		return None
	# m.groups() looks like ("abc", None): None marks a group that did
	# not take part in the match.
	for label in m.groups():
		if label:
			return label
	# Matched, but without a label.  Returning the last group here would
	# yield None for e.g. "foo|(bar)" matched through "foo", and the match
	# would be mistaken for "no match".
	return ""

def dnwcheck(dn): # domain name white check
	"""Return True when a domain name matches the whitelist pattern w1c.

	dn may be a single domain name or a list/tuple of names; None means
	"no name" and yields False.  False is also returned when no whitelist
	host pattern is configured (w1c is None).
	"""
	if dn is None or w1c is None:
		return False
	if not isinstance(dn, (list, tuple)):
		# a bare string must be tested as one name, not character by character
		dn = [dn]
	for d in dn:
		if w1c.match(d):
			return True
	return False

def ddncheck(dn,ip): # dynamic domain name check
	"""Return True when the host name dn is suspected to be dynamic.

	That is the case when
	  - dn is None, or
	  - dn, with "-" and "." removed, contains 5 or more contiguous
	    digits, or
	  - dn, squeezed the same way and lowercased, contains the four
	    numbers of ip written as 8 hex digits, in either order.
	"""
	if dn == None:
		return True
	dprint("### dn: %s"%dn)
	# we remove "-" and "."
	squeezed = dn.replace("-", "").replace(".", "")
	if n5c.search(squeezed): # 5 or more contiguous numerics
		return True
	dprint("### checking by IP")
	squeezed = squeezed.lower()
	octets = ip.split(".")
	# try the address as written, then with its four parts reversed
	for order in (octets, octets[::-1]):
		key = "%02x%02x%02x%02x"%(int(order[0]),int(order[1]),int(order[2]),int(order[3]))
		if key in squeezed:
			return True
	return False

def _ip2int(ip):
	"""Return the dotted-quad address ip as one integer (first part highest)."""
	s = ip.split(".")
	return ((int(s[0])*256 + int(s[1]))*256 + int(s[2]))*256 + int(s[3])

def ipequiv(ip1,ip2,m): # m is 24,32 etc
	"""Return True when ip1 and ip2 agree in their leading m bits.

	ip1 and ip2 are dotted-quad strings; m is the prefix length
	(24, 32 etc).  With m == 0 every pair of addresses is equivalent.
	"""
	# xor leaves a 1 wherever the addresses differ; shifting out the low
	# 32-m host bits keeps only differences inside the prefix
	if (_ip2int(ip1) ^ _ip2int(ip2)) >> (32 - m):
		return False
	return True
	

def in_na(ip,ad): # check if the ip is in ad.
	"""Return True when the address ip lies in the network ad.

	"ad" is a network address in CIDR notation, i.e., aaa.bbb.ccc.ddd/mask
	example: in_na("202.250.160.40", "202.250.160.0/24") returns True
	An "ad" without "/mask" is taken as a single host, i.e. mask 32.
	"""
	t = ad.split("/")
	if len(t) > 1:
		m = int(t[1])	# mask
	else:
		m = 32	# bare address: exact match only
	return ipequiv(ip,t[0],m)

def in_nal(ip, nalist): # nalist: list of IP/M, network address in CIDR notation
	"""Return True when ip lies in at least one network of nalist.

	nalist may be None, which yields False.
	"""
	if nalist is None:
		return False
	for net in nalist:
		if in_na(ip,net):
			return True
	return False

def in_ipl(ip, iplist): # check if the ip is in iplist in a loose manner
	"""Return True when ip shares its leading 24 bits with an address of iplist.

	iplist may be None, which yields False.
	"""
	if iplist is None:
		return False
	mask = 24
	for addr in iplist:
		if ipequiv(ip, addr, mask):
			return True
	return False

def dprint(s):
	if debug:
		print s


# The mail is read from standard input.
f = sys.stdin
args = sys.argv

# Work in the directory that holds this program, e.g. "/mail/box/arisawa";
# "mbox", "white" and "black" are opened relative to it.
p = args[0] # path to this program
n = p.rfind("/")
os.chdir(p[:n])

# "pipeto -D" selects debug mode: the result goes to stdout instead of mbox.
debug = (len(args) > 1 and args[1] == "-D")
if debug:
	mb = sys.stdout
else:
	# retry until the mailbox can be opened for appending
	while True:
		try:
			mb = open("mbox","a")
			break
		except:
			os.sleep(5000) # wait 5 sec

# add the rules kept in the "white" and "black" files to the built-in ones
whitelist = whitelist + getlist("white")
blacklist = blacklist + getlist("black")

# The first line is a separator and does not have client IP info,
# so we use the first "Received: from" line of the mail, such as
# Received: from X14.D-IP06.lipetsk.ru ([195.34.253.14]) by ar
# group 1 is the HELO host, group 2 the client IP
rc2 = re.compile(r"^Received: from ([^ ]+) \(\[([0-9.]+)\]\)", re.M)

# 5 or more contiguous digits
n5c = re.compile(r"[0-9]{5,}")

# split the rules into (client ips, host patterns, header patterns, body patterns)
whitelist = classfy(whitelist)
blacklist = classfy(blacklist)

wipl = whitelist[0]	# white ip list
bipl = blacklist[0]	# black ip list

# The patterns of each rule type are joined with "|" into a single regex.
# A rule type without any pattern keeps None.
w1c = w2c = w3c = b1c = b2c = b3c = None

if whitelist[1]:
	w1c = re.compile("|".join(whitelist[1]))
if whitelist[2]:
	w2c = re.compile("|".join(whitelist[2]), re.M)
if whitelist[3]:
	w3c = re.compile("|".join(whitelist[3]), re.M)

if blacklist[1]:
	# Pattern following "* "
	# b1c is the pattern that should be tested for FQDN
	b1c = re.compile("|".join(blacklist[1]))
if blacklist[2]:
	b2c = re.compile("|".join(blacklist[2]), re.M)
if blacklist[3]:
	b3c = re.compile("|".join(blacklist[3]))



# ---- classify one mail read from f and write it to mb ----
# The mail is always written out; the classification only decides the
# "[...]" tag put into the Subject header and the body size limit.

sepline=getline(f)  # the "From " separator line

chk = grey	# classification so far: white, black or grey
status0 = None	# header status
(head, status0) = gethead(f)	# status0 is now None or "truncated"

n = head.find("\n")	# end of the first header line; bounds the "with ESMTPA" search below
if n < 0: n = len(head) # should not happen

m = rc2.search(head)  # search the first "Received: from " line
# m may be None; such mails come from local senders and stay grey here
if m:
	d = m.group(1)
	dprint("### d=%s"%d) # client HELO host
	ip = m.group(2)  # client ip
	# The tests below are tried in order; the first one that applies decides.
	if in_nal(ip, wipl):
		chk = white
	elif in_nal(ip, bipl):
		chk = black
		status0 = "ip"  # in black ip list
	elif head.find("with ESMTPA",0,n) >= 0:
		# "with ESMTPA" appears in the first header line
		# NOTE(review): presumably marks an authenticated sender -- confirm
		chk = white
	elif d[0] == "[":
		chk = black
		status0 = "noname" # no domain name
		dprint ("### black0")
	elif xmatch(b1c,d): # check if the domain name is in the blacklist
		chk = black
		status0 = "host" # the host is in the blacklist
	elif xsearch(w2c,head):	# a whitelist header pattern matches
		chk = white
	elif ddncheck(d,ip):
		chk = black
		status0 = "suspect"  # suspect the domain name is dynamic
		dprint ("### black1")
	else:
		ips = dnsquery(d + " ip") # ips is None or a list of ip
		dprint("### ip=%s"%ip)
		dprint("### dnsquery="); dprint(ips)
		if ips == None or in_ipl(ip,ips) == False: # dns failure or faked fqdn
			chk = black
			status0 = "fake"  # faked helo host
		elif dnwcheck(d):
			chk = white
			dprint ("### white1")
		else:
			# don't touch status0 for non black
			st = xsearch1(b2c,head)	# blacklist header patterns
			if st != None:
				chk = black
				status0 = st
				dprint ("### black1")

# make sure there is a Subject header to carry the tag
if head.find("\nSubject:") < 0:
	head = head + "Subject:\n"

# reading the body may still turn a grey mail into white or black
(body,chk,status) = getbody(f,chk)
if chk == black:
	if status0 != None:
		status = "spam:%s"%status0
	else:
		# NOTE(review): the original comment says this should not happen, but
		# it is reached when only getbody turned the mail black; the status
		# returned by getbody is then replaced by plain "spam".
		status = "spam"
if status != None:
	# tag the Subject header with the status
	head = replace(head,"\nSubject:","\nSubject: [%s]"%status)

mail=sepline + head + body + "\n"
if chk == black or body=="\n":
	dprint("--------------- spam ----------------")
else:
	dprint ("--------------- normal ----------------")
mb.write(mail)
mb.flush()