-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbechdel_utils.py
More file actions
74 lines (52 loc) · 2.13 KB
/
bechdel_utils.py
File metadata and controls
74 lines (52 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from collections import Counter, defaultdict
from nltk import wordpunct_tokenize
def prettify(name):
    """Return *name* with every whitespace-separated word capitalized.

    The first character of each word is upper-cased and the remainder is
    lower-cased. Words are re-joined with single spaces, so runs of
    whitespace in the input collapse to one space.

    >>> prettify("edgar ATHELING")
    'Edgar Atheling'
    """
    # str.split() never yields empty strings, so each part has a first char.
    return " ".join(part[:1].upper() + part[1:].lower() for part in name.split())
def resolve(speakers, coreferents, who):
    """Replace, in place, each speaker found in *coreferents* with *who*.

    Args:
        speakers: Mutable sequence of speaker labels; modified in place.
        coreferents: Container of labels that should all map to *who*.
        who: The label written over every matching entry.

    Returns:
        None. The result is visible through the mutated *speakers*.
    """
    for position, label in enumerate(speakers):
        if label not in coreferents:
            continue
        speakers[position] = who
# Lower-case tokens that about_man() counts as a reference to a male person.
MALE_INDICATORS = {
    "he",
    "him",
    "his",
    "man",
    "boy",
}

# Lower-case tokens that about_man() counts as a reference to a female person.
FEMALE_INDICATORS = {
    "she",
    "her",
    "hers",
    "woman",
    "girl",
}
def about_man(s, gender_dict):
    """Return the relative frequency of references to each gender in *s*.

    The text is tokenized with ``wordpunct_tokenize`` and lower-cased. Every
    token found in ``MALE_INDICATORS`` counts as one ``"MALE"`` reference and
    every token found in ``FEMALE_INDICATORS`` as one ``"FEMALE"`` reference.
    In addition, each occurrence of a name from *gender_dict* in the raw
    text counts as a reference to that name's gender.

    Args:
        s: The text to analyse.
        gender_dict: Mapping of name -> gender label. Names are matched
            against the raw (not lower-cased) text with ``str.count``, i.e.
            as case-sensitive substrings. Empty names contribute nothing.
            Entries whose gender label is ``""`` are dropped from the result.

    Returns:
        A dict mapping each gender label to its reference count divided by
        the number of tokens in *s*. ``"MALE"`` and ``"FEMALE"`` are always
        present. If *s* contains no tokens, every frequency is ``0.0``.
    """
    words = [w.lower() for w in wordpunct_tokenize(s)]
    word_freq = Counter(words)
    total_word_count = len(words)

    total_ref_count = defaultdict(int)
    # Counter returns 0 for tokens that never occurred, so no guards needed.
    total_ref_count["MALE"] += sum(word_freq[w] for w in MALE_INDICATORS)
    total_ref_count["FEMALE"] += sum(word_freq[w] for w in FEMALE_INDICATORS)

    for name, gender in gender_dict.items():
        # str.count("") returns len(s) + 1, which would swamp the real
        # counts; an empty name still registers its gender key with 0.
        total_ref_count[gender] += s.count(name) if name else 0

    # Names with an unknown ("") gender are not reported.
    total_ref_count.pop("", None)

    if total_word_count == 0:
        # Empty or whitespace-only text: avoid dividing by zero.
        return {gender: 0.0 for gender in total_ref_count}

    return {
        gender: count / total_word_count
        for gender, count in total_ref_count.items()
    }
def compress(speakers, talks):
    """Merge consecutive lines of dialogue by the same speaker.

    *speakers* and *talks* are walked in lockstep. Whenever the speaker is
    the same as in the previous record, the text is appended (separated by
    a single space) to that record instead of starting a new one. Double
    quote characters are removed from the merged text.

    Args:
        speakers: Iterable of speaker labels, one per utterance.
        talks: Iterable of utterance strings, parallel to *speakers*.

    Returns:
        A ``(compressed_speakers, compressed_talks)`` tuple of two lists of
        equal length. Empty input yields two empty lists; no placeholder
        record is emitted before the first real speaker.
    """
    compressed_speakers = []
    # One list of text fragments per record; joined once at the end.
    grouped_fragments = []

    for speaker, text in zip(speakers, talks):
        if compressed_speakers and compressed_speakers[-1] == speaker:
            grouped_fragments[-1].append(text)
        else:
            compressed_speakers.append(speaker)
            grouped_fragments.append([text])

    compressed_talks = [
        " ".join(fragments).replace('"', '') for fragments in grouped_fragments
    ]
    return compressed_speakers, compressed_talks