#!/usr/bin/python
"""Spam ranker

Reads the mbox-style mailbox given as the first command-line argument.
Prints a summary of the hosts/domains found in the Received: headers,
followed by per-domain counts of the http:// URLs found in the message bodies.

whitelist	regexp for excluding received-headers
rex_domain	regexp for matching a domain-name from received-headers
count_threshold	suppress printing anything with a lesser count
"""

import mailbox
import os
import re
import rfc822
import sre
import sys

# Received: headers matching this pattern are skipped when relays are
# counted.  It matches a header that starts with "from", mentions sci.fi,
# iki.fi or 212.16.100.1/212.16.100.2, and still has a whitespace-delimited
# "by" somewhere after that mention (re.S lets "." span folded header lines).
# Built with `re` rather than `sre`: `sre` is only the deprecated alias.
whitelist = re.compile(
    r"from.*((sci|iki)\.fi|212\.16\.100\.[12]).*(?=.*\sby\s)", re.S)

# Reduces a host name to its trailing domain: "name.tld" for the listed
# generic and country TLDs, or "name.xx.cc" for second-level registries such
# as co.uk / com.au / ac.uk.  Verbose mode: whitespace in the pattern is
# ignored.
rex_domain = re.compile(r"""
    (
    ([^.]*\.(com|net|org|int|mil|gov|edu|info|biz|name))
    |([^.]*\.(fi|se|no|nl|ch|fr|be|nu|dk|at|pl|de|ru|gr|cl|cz|ca))
    |[^.]*\.(com?|net?|ac|gov|sk)\.[^.]*
    )$""", re.X)

# Report rows with a count below this value are not printed.
count_threshold = 10

def main ():
    mailboxname_in = sys.argv[1]
    mailboxname_out = mailboxname_in + '.report'
    result, urldb =process_mailbox (mailboxname_in, mailboxname_out)
    ranks=result.items()
    urls=urldb.items()
    ranks.sort(by_count)
    urls.sort(by_count)
    print """# count \t Received from"""
    for address, cnt in ranks:
	if cnt<count_threshold: continue
    	print cnt, "\t", address

    print ""
    print """# count \t http://"""
    for domain, url_list in urls:
	if cnt<count_threshold: continue
    	print url_list, "\t", domain

def by_count(a, b):
    """Comparison function ordering (name, count) pairs by ascending count.

    Returns a negative number, zero or a positive number when a's second
    item is respectively smaller than, equal to or larger than b's.

    Written without the cmp() builtin, which only exists on Python 2; the
    subtraction of the two comparisons yields the same -1/0/1 result.
    """
    left, right = a[1], b[1]
    return (left > right) - (left < right)

def by_length(a, b):
    """Comparison function ordering (name, sequence) pairs by sequence length.

    Returns a negative number, zero or a positive number when the length of
    a's second item is respectively smaller than, equal to or larger than
    the length of b's second item.

    Written without the cmp() builtin, which only exists on Python 2; the
    subtraction of the two comparisons yields the same -1/0/1 result.
    """
    left, right = len(a[1]), len(b[1])
    return (left > right) - (left < right)

# The patterns below are compiled once at import time rather than on every
# message.

# Text between the leading "from " of a Received: value and the first " by"
# (or newline) that follows it.
_REX_RELAY = re.compile(r"^from\s(.*?)(?= by|\n)", re.S)

# A run of characters that are not whitespace, parentheses or brackets.
# The earlier pattern also had "(...)" and "[...]" alternatives, but its
# first alternative could match the empty string, so on the Python 2
# interpreters this script was written for those alternatives never fired
# and tokens always came out without their brackets.  Python 3.7+ handles
# empty matches differently and would return the bracketed form instead;
# this single character class yields the Python 2 tokens on every version.
_REX_TOKEN = re.compile(r"[^()\[\]\s]+")

# A bare dotted-quad IPv4 address.
_REX_IP = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# An http:// URL up to the next whitespace, double quote or ">".
_REX_URL = re.compile(r"""http://[^\s">]*""")

# The host part of such a URL (everything up to the first "/").
_REX_URL_HOST = re.compile(r"""http://([^/]*)""")


def filter_document(msg, document):
    """Extract the SMTP relays and the embedded URLs of one message.

    msg      -- message object; only msg.getheaders("Received") is used and
                it must return a list of header value strings
    document -- the message body as a string

    Returns a tuple (addressdb, urldb):
    addressdb -- dict mapping a relay name to the number of Received:
                 headers naming it.  Host names are reduced with rex_domain
                 when it matches; IPv4 addresses have their last octet
                 replaced by "xxx".  Headers matching whitelist are ignored.
    urldb     -- dict mapping a host name to the list of http:// URLs in the
                 body that point at it, in order of appearance.
    """
    addressdb = {}
    for line in msg.getheaders("Received"):
        if whitelist.match(line):
            continue
        relay = _REX_RELAY.match(line)
        if not relay:
            continue

        # Only the first token that looks like a host name or an address
        # (i.e. contains a dot) identifies the relay.
        address = None
        for token in _REX_TOKEN.findall(relay.group(1).lower()):
            if "." in token:
                address = token
                break
        if address is None:
            continue

        if _REX_IP.match(address):
            # Mask the last octet so hosts in the same /24 are counted
            # together.
            address = address[:address.rindex(".") + 1] + "xxx"
        else:
            found = rex_domain.search(address)
            if found:
                address = found.group()
        addressdb[address] = addressdb.get(address, 0) + 1

    urldb = {}
    for url in _REX_URL.findall(document):
        # Every url starts with "http://", so the host pattern always matches.
        domain = _REX_URL_HOST.match(url).group(1)
        urldb.setdefault(domain, []).append(url)

    return addressdb, urldb

def process_mailbox(mailboxname_in, mailboxname_out):
    """Accumulate relay and URL statistics over a whole mbox file.

    mailboxname_in  -- path of the mbox-style mailbox to read
    mailboxname_out -- accepted for interface compatibility; this function
                       does not use it

    Returns a tuple (addr_stats, url_stats):
    addr_stats -- dict mapping a relay name to the total number of Received:
                  headers naming it across all messages
    url_stats  -- dict mapping a URL host name to the total number of URLs
                  pointing at it across all messages
    """
    addr_stats = {}
    url_stats = {}

    mailbox_file = open(mailboxname_in, 'r')
    try:
        mb = mailbox.UnixMailbox(mailbox_file)
        msg = mb.next()
        while msg is not None:
            # Properties of msg cannot be modified, so the body is pulled
            # out to be handled separately; msg itself is kept for access
            # to the headers.
            document = msg.fp.read()

            relays, urls = filter_document(msg, document)
            for addr, cnt in relays.items():
                addr_stats[addr] = addr_stats.get(addr, 0) + cnt
            for domain, found in urls.items():
                # Only the number of URLs per host is kept, not the URLs.
                url_stats[domain] = url_stats.get(domain, 0) + len(found)

            msg = mb.next()
    finally:
        # Release the file handle even if a message fails to parse.
        mailbox_file.close()

    return addr_stats, url_stats

# Produce the report only when run as a script, not when imported.
if __name__ == "__main__":
    main()
