Source code for reverend.guessers.email

# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
# amir@divmod.org.  This is free software; you can redistribute it and/or
# modify it under the terms of version 2.1 of the GNU Lesser General Public
# License as published by the Free Software Foundation.
#
        
from rfc822 import AddressList

from reverend.thomas import Bayes


class EmailClassifier(Bayes):
    """Bayes guesser that tokenizes email messages.

    Tokens come from three sources: a handful of headers, the first
    ``text/plain`` body part, and "meta" tokens describing the structure
    of the message (selected header values, body sizes, recipient counts).

    Every ``msg`` argument must offer ``get(name, default)`` and ``walk()``,
    and the parts yielded by ``walk()`` must offer ``get_content_type()``
    and ``get_payload(decode=...)``.
    NOTE(review): this looks like the ``email.message.Message`` interface
    -- confirm against the callers.
    """

    # Headers whose values are emitted verbatim as '<name>:<value>' tokens.
    _META_HEADERS = ('Content-type', 'X-Priority', 'X-Mailer',
                     'content-transfer-encoding', 'X-MSMail-Priority')

    # Word-count thresholds for body parts, largest first.  The buckets are
    # cumulative: a long body yields a token for every threshold it exceeds.
    _WORD_COUNT_LIMITS = (10000, 1000, 100)

    # Recipient-count thresholds for the 'to' and 'cc' headers (cumulative).
    _RECIPIENT_LIMITS = (5, 10)

    def get_tokens(self, msg):
        """Return the list of string tokens used as keys into the counts.

        Overrides the parent implementation.  The result is the header
        tokens, followed by the body tokens, followed by the meta tokens
        generated from the headers and the message structure.
        """
        tokens = self.get_header_tokens(msg)
        tokens += self.get_body_tokens(msg)
        tokens += self.get_meta_tokens(msg)
        return tokens

    def get_body_tokens(self, msg):
        """Tokenize the first ``text/plain`` part of ``msg``.

        A message without such a part is tokenized as the empty string.
        """
        text = self.get_text_plain(msg)
        if text is None:
            text = ''
        return list(self.tokenizer.tokenize(text))

    def get_header_tokens(self, msg):
        """Tokenize the subject, from, to and cc headers of ``msg``.

        A missing header is replaced by a placeholder word so that its
        absence still produces a token.
        """
        fields = [
            msg.get('subject', 'nosubject'),
            msg.get('from', 'fromnoone'),
            msg.get('to', 'tonoone'),
            msg.get('cc', 'ccnoone'),
        ]
        # Each field is followed by a single space, the last one included.
        text = ' '.join(fields) + ' '
        return list(self.tokenizer.tokenize(text))

    def get_text_plain(self, msg):
        """Return the decoded payload of the first ``text/plain`` part.

        Returns None when ``msg`` has no such part.
        """
        return self._first_payload(msg, "text/plain", True)

    def get_text_html(self, msg):
        """Return the payload of the first ``text/html`` part.

        Returns None when ``msg`` has no such part.

        NOTE(review): unlike get_text_plain() the payload is requested with
        ``decode=False``.  That is kept as-is because callers may depend on
        it -- confirm whether the difference is intentional.
        """
        return self._first_payload(msg, "text/html", False)

    def get_meta_tokens(self, msg):
        """Return tokens describing the headers and structure of ``msg``.

        The list contains, in order:

        * one ``'<header>:<value>'`` token per entry of ``_META_HEADERS``
          (the value is ``'None'`` when the header is missing);
        * for the plain-text and then the HTML body, ``'<stem>_None'`` when
          the part is missing, otherwise ``'<stem>_True'`` followed by
          size tokens based on the number of whitespace-separated words;
        * ``'to_more_than_N'`` / ``'cc_more_than_N'`` tokens based on the
          number of addresses in the ``to`` and ``cc`` headers.
        """
        tokens = [name + ':' + msg.get(name, 'None')
                  for name in self._META_HEADERS]

        bodies = (
            ('text', self.get_text_plain(msg)),
            ('html', self.get_text_html(msg)),
        )
        for stem, part in bodies:
            if part is None:
                tokens.append(stem + '_None')
                continue
            tokens.append(stem + '_True')

            word_count = len(part.split())
            # Value comparison: 'is 0' tests object identity, not equality.
            if word_count == 0:
                tokens.append(stem + 'zero')
            for limit in self._WORD_COUNT_LIMITS:
                if word_count > limit:
                    tokens.append(stem + 'more_than_%d' % limit)

        for header in ('to', 'cc'):
            addresses = AddressList(msg.get(header, '')).addresslist
            # Compare the NUMBER of addresses.  Comparing the list itself
            # with an int is always true on Python 2 (and a TypeError on
            # Python 3), which made these tokens meaningless.
            recipient_count = len(addresses)
            for limit in self._RECIPIENT_LIMITS:
                if recipient_count > limit:
                    tokens.append('%s_more_than_%d' % (header, limit))

        return tokens

    def _first_payload(self, msg, content_type, decode):
        """Return the payload of the first part of ``msg`` of a given type.

        ``content_type`` is compared against the lower-cased content type
        of each part yielded by ``msg.walk()``; ``decode`` is passed through
        to ``get_payload()``.  Returns None when no part matches.
        """
        for part in msg.walk():
            typ = part.get_content_type()
            if typ and typ.lower() == content_type:
                return part.get_payload(decode=decode)
        return None