Skip to content

Commit 314e6eb

Browse files
committed
pypi v0.0.2
1 parent 1f1a5ec commit 314e6eb

15 files changed

Lines changed: 461 additions & 79 deletions

File tree

README.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@ For the social scientists, creating social networks from your mailbox data and a
1414
* Study social actors (most emails from Marketing involve Peter and Andy) and their relative influence
1515
* Identify the key social groups (Sales team hangs out a lot, but the IT / product division less so)
1616
* Identify the key account managers of the company (despite having joined the company only recently, Margaretha is connected to more key clients than her peers)
17+
* Compare distributions and patterns of email behaviors and aggregated statistics between groups of employees
1718

1819

1920
If you're a graph theorist looking for something more statistical:
20-
* Support directed and undirected graphs (**implemented in version 0.0.2**, see below)
21+
* Support directed and undirected graphs (**already implemented in version 0.0.2**, see below)
2122
* Also output statistical measurements such as centrality distribution (**planned for version 0.0.3**)
2223
* Betweenness, closeness, hubness, distance histograms plotting (**planned for version 0.0.3**)
23-
* Exports to `.graphml` format for use in other graphing software (**implemented in version 0.0.2**)
24+
* Exports to `.graphml` format for use in other graphing software (**already implemented in version 0.0.2**, see below)
2425

2526
## Dependencies
2627
* Python 3.7+
@@ -155,9 +156,11 @@ All tests are located in the `/tests/` directory.
155156

156157
## Authors and Copyright
157158

158-
Samuel Chan, Supertype [Supertype](https://supertype.ai)
159+
Samuel Chan, [Supertype](https://supertype.ai)
160+
- GitHub: [onlyphantom](https://github.com/onlyphantom)
159161

160-
Vincentius Christopher Calvin, Supertype [https://supertype.ai](https://supertype.ai)
162+
Vincentius Christopher Calvin, [Supertype](https://supertype.ai)
163+
- GitHub: [vccalvin33](https://github.com/vccalvin33)
161164

162165
If you find the code useful in your project, please link to this repository in your citation.
163166

build/lib/emailnetwork/emails.py

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
21
from dataclasses import dataclass
32
from datetime import datetime
4-
from .utils import parse_date
3+
from emailnetwork.utils import parse_date
54

65
@dataclass
76
class EmailMeta:
@@ -22,30 +21,15 @@ def __post_init__(self):
2221
self.origin_domain = self.sender.domain
2322
self.date = parse_date(self.date)
2423

25-
def __eq__(self, datestring):
26-
try:
27-
targetdate = datetime.fromisoformat(datestring)
28-
except ValueError:
29-
print(ValueError)
30-
return "Please use the ISO format for comparison: YYYY-MM-DD"
24+
def __eq__(self, targetdate):
3125
if isinstance(targetdate, datetime):
3226
return self.date.date() == targetdate.date()
3327

34-
def __ge__(self, datestring):
35-
try:
36-
targetdate = datetime.fromisoformat(datestring)
37-
except ValueError:
38-
print(ValueError)
39-
return "Please use the ISO format for comparison: YYYY-MM-DD"
28+
def __ge__(self, targetdate):
4029
if isinstance(targetdate, datetime):
4130
return self.date.date() >= targetdate.date()
4231

43-
def __le__(self, datestring):
44-
try:
45-
targetdate = datetime.fromisoformat(datestring)
46-
except ValueError:
47-
print(ValueError)
48-
return "Please use the ISO format for comparison: YYYY-MM-DD"
32+
def __le__(self, targetdate):
4933
if isinstance(targetdate, datetime):
5034
return self.date.date() <= targetdate.date()
5135

@@ -65,3 +49,7 @@ def __getitem__(self):
6549
def domain(self):
6650
return self.email.split('@')[-1] or None
6751

52+
@dataclass
53+
class EmailBody:
54+
subject: str = None
55+
body: str = None

build/lib/emailnetwork/extract.py

Lines changed: 84 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,38 @@
1+
from datetime import datetime
12
from email.utils import getaddresses
23
from mailbox import mbox
34

4-
from emailnetwork.emails import EmailAddress, EmailMeta
5-
# try:
6-
# from .emails import EmailAddress, EmailMeta
7-
# except:
8-
# from emailnetwork.emails import EmailAddress, EmailMeta
5+
from mailbox import mboxMessage
6+
7+
from emailnetwork.utils import clean_subject, clean_body
8+
from emailnetwork.emails import EmailAddress, EmailMeta, EmailBody
9+
from emailnetwork.summary import DomainSummary
10+
11+
from emailnetwork.header import HeaderCounter
12+
913

1014
def extract_meta(email):
    """Build an `EmailMeta` from the headers of a single message.

    The `To` and `Resent-To` headers are combined into the recipients, the
    `Cc` and `Resent-Cc` headers into the cc list, and the first address
    parsed from `From` becomes the sender. The subject is passed through
    `clean_subject` and replaced with `None` when the result is falsy. The
    `Date` header is handed to `EmailMeta` as-is.

    Args:
        email: a message object supporting `get_all()` and item access.

    Returns:
        EmailMeta: the metadata record for the message.
    """
    to_headers = email.get_all('To', []) + email.get_all('Resent-To', [])
    cc_headers = email.get_all('Cc', []) + email.get_all('Resent-Cc', [])

    # Only the first parsed `From` address is used as the sender.
    sender = EmailAddress(getaddresses(email.get_all('From'))[0])
    recipients = [EmailAddress(pair) for pair in getaddresses(to_headers)]
    carbon_copies = [EmailAddress(pair) for pair in getaddresses(cc_headers)]
    subject = clean_subject(email['Subject']) or None

    return EmailMeta(
        sender=sender,
        recipients=recipients,
        cc=carbon_copies,
        subject=subject,
        date=email['Date'],
    )
2226

27+
28+
def extract_body(email):
    """Build an `EmailBody` from a single message.

    The subject is passed through `clean_subject` and replaced with `None`
    when the result is falsy; the body is whatever `clean_body` returns for
    the message.

    Args:
        email: a message object supporting item access by header name.

    Returns:
        EmailBody: the subject/body record for the message.
    """
    subject = clean_subject(email['Subject']) or None
    body = clean_body(email)
    return EmailBody(subject=subject, body=body)
34+
35+
2336
class MBoxReader(object):
2437
""" A class that extends python's `mailbox` module to provide additional
2538
functionalities such as length, date filtering and parsing. A key component of
@@ -31,7 +44,7 @@ class MBoxReader(object):
3144
Args:
3245
object ([type]): Instantiate this class by specifying a path to an `.mbox` object
3346
"""
34-
47+
3548
def __init__(self, path) -> None:
3649
super().__init__()
3750
self.path = path
@@ -49,7 +62,7 @@ def count(self):
4962
Count the number of emails in the mbox instance.
5063
Helper function to implement __len__
5164
"""
52-
return self.mbox.keys()[-1]+1
65+
return self.mbox.keys()[-1]+1
5366
# return len(self.mbox.keys())
5467

5568
def extract(self):
@@ -66,38 +79,73 @@ def extract(self):
6679
print(e)
6780
continue
6881

69-
70-
def filter_by_date(self, operator:str, datestring:str):
71-
if operator not in ['>=', '==', '<=']:
82+
def filter_emails(self, emailaddress=None, datestring=None, dateoperator="=="):
    """Return the `EmailMeta` of every message that passes the given filters.

    Both filters are optional and are combined with AND. With no filter
    given, the metadata of every message in `self.mbox` is returned.

    Args:
        emailaddress: optional address string. A message matches when the
            string equals the sender's `.email` or any recipient's `.email`
            (the cc list is not consulted).
        datestring: optional date in `YYYY-MM-DD` format to compare each
            message against.
        dateoperator: one of `'>='`, `'=='`, `'<='`; how each message is
            compared with `datestring`. Validated even when no `datestring`
            is given.

    Returns:
        list: the `EmailMeta` objects of the matching messages, in mailbox
        order.

    Raises:
        ValueError: if `emailaddress` is not a string, `dateoperator` is not
            one of the supported operators, or `datestring` is not in
            `YYYY-MM-DD` format. (An invalid `datestring` used to be
            reported by returning the message as a string, which callers
            expecting a list would silently iterate character by character.)
    """
    if emailaddress is not None and not isinstance(emailaddress, str):
        raise ValueError(
            "Please use a valid string representing an email address")

    if dateoperator not in ('>=', '==', '<='):
        raise ValueError("Please use one of ['>=', '==', '<=']")

    targetdate = None
    if datestring is not None:
        try:
            targetdate = datetime.strptime(datestring, "%Y-%m-%d")
        except ValueError as err:
            raise ValueError(
                "Please use the ISO format for comparison: YYYY-MM-DD"
            ) from err

    val = []
    for email in self.mbox:
        emailmeta = extract_meta(email)

        if emailaddress is not None:
            checkers = [emailmeta.sender.email] + [
                recipient.email for recipient in emailmeta.recipients]
            if emailaddress not in checkers:
                continue

        if targetdate is not None:
            # The comparisons are delegated to EmailMeta's own
            # __ge__/__eq__/__le__ implementations.
            if dateoperator == '>=':
                matched = emailmeta >= targetdate
            elif dateoperator == '==':
                matched = emailmeta == targetdate
            else:
                matched = emailmeta <= targetdate
            if not matched:
                continue

        val.append(emailmeta)

    return val
88137

89138

90139
if __name__ == '__main__':
91-
# reader = MBoxReader('/Users/samuel/Footprints/samuel-supertype.mbox')
92-
reader = MBoxReader('/Users/samuel/Footprints/emailnetwork/emailnetwork/tests/test.mbox')
93-
print(f'{len(reader)} emails in the sample mbox.')
94-
# email = reader.mbox[646]
95-
email = reader.mbox[0]
96-
emailmsg = extract_meta(email)
140+
reader = MBoxReader('/Users/samuel/Footprints/samuel-supertype.mbox')
141+
# reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox')
142+
headers = HeaderCounter(reader)
143+
k = headers.keys()
144+
spamheaders = list(filter(lambda v: "spam" in v.lower(), k))
145+
146+
summary = DomainSummary(reader)
97147

98-
thisyearmails = reader.filter_by_date(">=", "2021-01-05")
99-
# print(emailmsg.recipients)
100-
# print(emailmsg.recipients[0].domain)
101-
emails = reader.extract()
102-
#[email.origin_domain for email in emails]
103-
148+
email = reader.mbox[1]
149+
emailmsg = extract_meta(email)
150+
emailbody = extract_body(email)
151+
mails = reader.filter_emails(datestring='2020-12-31', dateoperator="==")

build/lib/emailnetwork/graph.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,8 @@ def plot_undirected(reader:MBoxReader, layout:str='shell', graphml:bool=False):
210210
if __name__ == '__main__':
211211
MBOX_PATH = f'{os.path.dirname(__file__)}/tests/test.mbox'
212212

213-
reader = MBoxReader('/Users/samuel/Footprints/emailnetwork/emailnetwork/tests/test.mbox')
213+
# reader = MBoxReader('/Users/samuel/Footprints/emailnetwork/emailnetwork/tests/test.mbox')
214+
reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox')
214215
# reader = MBoxReader('/Users/samuel/Footprints/samuel-supertype.mbox')
215216
# plot_single_directed(reader,300)
216217
# plot_single_directed(reader, 1, True)

build/lib/emailnetwork/header.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from collections import Counter
2+
from email.header import decode_header
3+
from emailnetwork.utils import clean_subject
4+
5+
6+
class HeaderCounter(Counter):
    """Count how often each header name occurs across a reader's messages.

    Keys are the header names returned by each message's `keys()`; values
    are the number of times that name was seen over all messages.

    Args:
        reader: an iterable of message-like objects exposing `keys()`.
    """

    def __init__(self, reader):
        super().__init__()
        self.build_from(reader)

    def __str__(self):
        return f'{self.most_common()}'

    def build_from(self, reader):
        """Add the header names of every message in `reader` to the counts.

        Args:
            reader: an iterable of message-like objects exposing `keys()`.

        Returns:
            HeaderCounter: this same instance, to allow chaining.
        """
        for email in reader:
            for k in email.keys():
                self[k] += 1

        return self

    def histogram(self, n=25):
        """Show a horizontal bar chart of the first `n` counted headers.

        Headers are taken in the counter's iteration order, not sorted by
        frequency. When fewer than `n` headers have been counted, only the
        available ones are drawn.

        Args:
            n: maximum number of headers to plot.
        """
        # Imported lazily so matplotlib is only needed when plotting.
        from matplotlib import pyplot as plt
        plt.style.use('fivethirtyeight')
        k = list(self.keys())[:n]
        v = list(self.values())[:n]
        fig = plt.figure(figsize=(7, 10))
        ax = fig.add_subplot(111)
        # One bar position per header actually available; using range(n)
        # here would give positions and values of different lengths
        # whenever the counter holds fewer than n headers.
        y_pos = list(range(len(k)))
        ax.barh(y_pos, v, color='plum')
        ax.set_yticks(y_pos)
        ax.set_yticklabels(k)
        ax.invert_yaxis()
        ax.set_xlabel('Frequency')
        ax.set_title('Email Header Analysis')
        plt.tight_layout()
        plt.show()
42+
43+
44+
if __name__ == '__main__':
    # Ad-hoc demo: count the headers of a local mailbox, then print the
    # decoded value of every header whose name mentions "spam".
    from emailnetwork.extract import MBoxReader
    # reader = MBoxReader('/Users/samuel/Footprints/samuel-supertype.mbox')
    reader = MBoxReader('/Users/vincentiuscalvin/Documents/Supertype/mbox-dataset/Ori_Sample_01.mbox')
    headers = HeaderCounter(reader)

    k = headers.keys()

    # Header names containing "spam", matched case-insensitively.
    containspam = [name for name in k if "spam" in name.lower()]

    for email in reader:
        spam_keys = (key for key in email.keys() if key in containspam)
        for key in spam_keys:
            print({key: decode_header(email[key])})

0 commit comments

Comments
 (0)