1+ from datetime import datetime
12from email .utils import getaddresses
23from mailbox import mbox
34
4- from emailnetwork .emails import EmailAddress , EmailMeta
5- # try:
6- # from .emails import EmailAddress, EmailMeta
7- # except:
8- # from emailnetwork.emails import EmailAddress, EmailMeta
5+ from mailbox import mboxMessage
6+
7+ from emailnetwork .utils import clean_subject , clean_body
8+ from emailnetwork .emails import EmailAddress , EmailMeta , EmailBody
9+ from emailnetwork .summary import DomainSummary
10+
11+ from emailnetwork .header import HeaderCounter
12+
913
def extract_meta(email):
    """Build an `EmailMeta` record from the headers of a single message.

    Args:
        email: a message object exposing `get_all(name, default)` and
            item access by header name (e.g. a `mailbox.mboxMessage`).

    Returns:
        EmailMeta: sender, recipients, cc, subject and date of the message.
    """
    # Resent-* headers are folded into the same lists as the primary ones.
    to_headers = email.get_all('To', []) + email.get_all('Resent-To', [])
    cc_headers = email.get_all('Cc', []) + email.get_all('Resent-Cc', [])

    # Only the first parsed From entry is used as the sender.
    # NOTE(review): no default is passed for 'From', so a message lacking
    # that header is not handled here -- confirm callers guard against it.
    from_pairs = getaddresses(email.get_all('From'))
    sender = EmailAddress(from_pairs[0])

    recipients = [EmailAddress(pair) for pair in getaddresses(to_headers)]
    copied = [EmailAddress(pair) for pair in getaddresses(cc_headers)]

    # A falsy cleaned subject is normalised to None.
    subject = clean_subject(email['Subject']) or None

    return EmailMeta(
        sender=sender,
        recipients=recipients,
        cc=copied,
        subject=subject,
        date=email['Date'],
    )
2226
27+
def extract_body(email):
    """Build an `EmailBody` from a single message.

    Args:
        email: a message object supporting item access by header name; it is
            also handed as-is to `clean_body`.

    Returns:
        EmailBody: the cleaned subject (None when the cleaned value is falsy)
        together with the result of `clean_body(email)`.
    """
    subject = clean_subject(email['Subject']) or None
    body = clean_body(email)
    return EmailBody(subject=subject, body=body)
34+
35+
2336class MBoxReader (object ):
2437 """ A class that extends python's `mailbox` module to provide additional
2538 functionalities such as length, date filtering and parsing. A key component of
@@ -31,7 +44,7 @@ class MBoxReader(object):
3144 Args:
3245 object ([type]): Instantiate this class by specifying a path to an `.mbox` object
3346 """
34-
47+
3548 def __init__ (self , path ) -> None :
3649 super ().__init__ ()
3750 self .path = path
@@ -49,7 +62,7 @@ def count(self):
4962 Count the number of emails in the mbox instance.
5063 Helper function to implement __len__
5164 """
52- return self .mbox .keys ()[- 1 ]+ 1
65+ return self .mbox .keys ()[- 1 ]+ 1
5366 # return len(self.mbox.keys())
5467
5568 def extract (self ):
@@ -66,38 +79,73 @@ def extract(self):
6679 print (e )
6780 continue
6881
69-
70- def filter_by_date (self , operator :str , datestring :str ):
71- if operator not in ['>=' , '==' , '<=' ]:
def filter_emails(self, emailaddress=None, datestring=None, dateoperator="=="):
    """Return the metadata of every email in the mbox that passes the filters.

    Both filters are optional; when both are given an email must satisfy
    both, and when neither is given every email is returned.

    Args:
        emailaddress: optional address string. An email matches when this
            value equals the `.email` of its sender or of any recipient
            (Cc entries are not consulted).
        datestring: optional date in `YYYY-MM-DD` format to compare against.
        dateoperator: one of '>=', '==', '<='. It is validated even when
            no `datestring` is supplied.

    Returns:
        list: the `extract_meta` result of each matching email, in mbox
        order. If `datestring` cannot be parsed, the parse error is printed
        and an explanatory message string is returned instead of a list
        (kept for backward compatibility with existing callers).

    Raises:
        ValueError: if `emailaddress` is given but is not a string, or if
            `dateoperator` is not one of the supported operators.
    """
    if emailaddress is not None and not isinstance(emailaddress, str):
        raise ValueError(
            "Please use a valid string representing an email address")

    if dateoperator not in ['>=', '==', '<=']:
        raise ValueError("Please use one of ['>=', '==', '<=']")

    targetdate = None
    if datestring is not None:
        try:
            targetdate = datetime.strptime(datestring, "%Y-%m-%d")
        except ValueError as err:
            # Show the actual parsing problem, not the exception class.
            print(err)
            return "Please use the ISO format for comparison: YYYY-MM-DD"

    def address_matches(meta):
        # No address filter requested: everything passes.
        if emailaddress is None:
            return True
        checkers = [meta.sender.email] + \
            [recipient.email for recipient in meta.recipients]
        return emailaddress in checkers

    def date_matches(meta):
        # No date filter requested: everything passes.
        if targetdate is None:
            return True
        # The comparison is delegated to the metadata object's own
        # rich-comparison methods against the parsed datetime.
        if dateoperator == '>=':
            return meta >= targetdate
        if dateoperator == '==':
            return meta == targetdate
        return meta <= targetdate

    val = []
    for email in self.mbox:
        emailmeta = extract_meta(email)
        # Address is checked first; the date comparison only runs for
        # emails that already matched the address filter.
        if address_matches(emailmeta) and date_matches(emailmeta):
            val.append(emailmeta)

    return val
88137
89138
if __name__ == '__main__':
    # Ad-hoc smoke run; the mailbox path below is specific to one machine.
    reader = MBoxReader('/Users/samuel/Footprints/samuel-supertype.mbox')
    # reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox')

    # Collect the header names whose name mentions "spam" (case-insensitive).
    headers = HeaderCounter(reader)
    k = headers.keys()
    spamheaders = [name for name in k if "spam" in name.lower()]

    summary = DomainSummary(reader)

    # Exercise the extractors on the second message of the mailbox.
    email = reader.mbox[1]
    emailmsg = extract_meta(email)
    emailbody = extract_body(email)

    # Emails compared with '==' against 2020-12-31.
    mails = reader.filter_emails(datestring='2020-12-31', dateoperator="==")
0 commit comments