-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
121 lines (107 loc) · 3.8 KB
/
utils.py
File metadata and controls
121 lines (107 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import csv
import re
from datetime import date
from typing import Any, Dict, Iterator, Optional
def iter_people_from_file(path: str) -> Iterator[Dict[str, Any]]:
    """Yield one dict per data row of the people CSV file at ``path``.

    The first row is treated as a header and skipped. Columns are read by
    position and returned as the raw strings produced by ``csv.reader``
    under the keys ``person_id``, ``company_name``, ``company_li_name``,
    ``last_title``, ``group_start_date`` and ``group_end_date``.

    Completely empty rows (for example a trailing blank line) are skipped.

    Raises:
        IndexError: if a non-empty row has fewer than six columns.
    """
    fields = (
        'person_id',
        'company_name',
        'company_li_name',
        'last_title',
        'group_start_date',
        'group_end_date',
    )
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded inside quoted fields are read back unchanged.
    with open(path, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        next(csv_reader, None)  # skip the headers (no-op on an empty file)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to map.
                continue
            # Index explicitly so that a short row still fails loudly.
            yield {key: row[pos] for pos, key in enumerate(fields)}
def iter_companies_from_file(path: str) -> Iterator[Dict[str, Any]]:
    """Yield one dict per data row of the companies CSV file at ``path``.

    The first row is treated as a header and skipped. Each yielded dict has
    an ``id`` key holding a 0-based running index over the yielded rows,
    followed by the first nine columns as raw strings under the keys
    ``company_name``, ``company_li_names``, ``description``, ``headcount``,
    ``founding_date``, ``most_recent_raise``, ``most_recent_valuation``,
    ``investors`` and ``known_total_funding``.

    Completely empty rows (for example a trailing blank line) are skipped
    and do not consume an id.

    Raises:
        IndexError: if a non-empty row has fewer than nine columns.
    """
    fields = (
        'company_name',
        'company_li_names',
        'description',
        'headcount',
        'founding_date',
        'most_recent_raise',
        'most_recent_valuation',
        'investors',
        'known_total_funding',
    )
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded inside quoted fields (e.g. descriptions) are read back unchanged.
    with open(path, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        next(csv_reader, None)  # skip the headers (no-op on an empty file)
        # csv.reader yields [] for blank lines; drop them before numbering.
        data_rows = (row for row in csv_reader if row)
        for idx, row in enumerate(data_rows):
            company = {'id': idx}
            # Index explicitly so that a short row still fails loudly.
            for pos, key in enumerate(fields):
                company[key] = row[pos]
            yield company
def iter_company_li_from_file(path: str) -> Iterator[Dict[str, Any]]:
    """Yield one dict per (company, LI name) pair from the CSV at ``path``.

    The first row is treated as a header and skipped. The second column is
    expected to hold a list-like string; newlines, double quotes, spaces and
    square brackets are removed from it and the remainder is split on
    commas. One record is yielded per resulting name, with keys ``id``
    (0-based running index over all yielded records), ``company_name``
    (first column) and ``company_li_name``.

    Rows whose cleaned name list is empty yield nothing. Completely empty
    rows (for example a trailing blank line) are skipped.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
    """
    # One translation pass instead of five chained replace() calls.
    strip_table = str.maketrans('', '', '\n" []')
    idx = 0
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded inside quoted fields are read back unchanged.
    with open(path, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        next(csv_reader, None)  # skip the headers (no-op on an empty file)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            company_li_names = row[1].translate(strip_table).split(',')
            if company_li_names == ['']:
                # ''.split(',') gives [''], which means "no names at all".
                company_li_names = []
            for li_name in company_li_names:
                yield {
                    'id': idx,
                    'company_name': row[0],
                    'company_li_name': li_name.strip(),
                }
                idx += 1
def parse_li_name(text: str) -> Optional[str]:
    """Return ``text`` unchanged, or ``None`` when it contains ``'n/a'``.

    The check is a case-sensitive substring match, so ``'n/a'`` anywhere in
    the value marks it as missing, while ``'N/A'`` is passed through.
    """
    # clean up data when n/a provided by person
    return None if 'n/a' in text else text
def parse_currently_at(date: str):
    """Return the parsed end date, or today's date when none is provided.

    ``date`` is passed to ``parse_date``; when that yields no date (the
    person did not provide an end date), the person is treated as still
    working at the company and today's date is returned instead.
    """
    # Local import: the file-level import only brings in the ``date`` class,
    # and the ``date`` parameter shadows that class inside this function, so
    # the module has to be reached through its own name here.
    import datetime

    # set currently works at company when person does not provide end date
    end_date = parse_date(date)
    if end_date:
        return end_date
    return datetime.date.today()
def parse_date(text: str) -> Optional[date]:
    """Parse a ``YYYY-MM-DD`` style string into a ``date``.

    Returns:
        ``None`` for the empty string, otherwise the ``date`` built from the
        three dash-separated integer parts (year, month, day).

    Raises:
        AssertionError: if ``text`` does not split into exactly three parts
            on ``'-'``.
        ValueError: if a part is not an integer or the parts do not form a
            valid calendar date.
    """
    if text == '':
        return None  # replace empty string with None
    parts = text.split('-')
    if len(parts) != 3:
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is kept as
        # AssertionError for callers that already handle it.
        raise AssertionError('Unknown date format: {!r}'.format(text))
    year, month, day = (int(part) for part in parts)
    return date(year, month, day)
def parse_end_date(text: str):
    """Return the end date parsed from ``text``, defaulting to today.

    When ``parse_date`` yields no date (the person did not provide an end
    date), the person is treated as currently working at the company and
    today's date is returned.
    """
    parsed = parse_date(text)
    # A date instance is always truthy, so only a missing result falls
    # through to today's date.
    return parsed or date.today()
def parse_title_for_founder(text: str) -> bool:
    """Report whether a title contains the word "founder", ignoring case."""
    lowered = text.lower()
    return 'founder' in lowered
def parse_lists(text: str) -> Optional[str]:
    """Normalise a list-like string value.

    Returns:
        ``None`` when ``text`` is the empty string or the literal ``'[]'``
        (an empty list); otherwise ``text`` lower-cased. The value stays a
        string; it is not parsed into a Python list.
    """
    if text in ('', '[]'):
        return None
    return text.lower()
def parse_strings(text: str) -> str:
    """Return ``text`` lower-cased with surrounding whitespace removed."""
    lowered = text.lower()
    return lowered.strip()
def parse_numeric_value(value: str) -> Optional[int]:
    """Convert a numeric string to ``int``, mapping placeholders to ``None``.

    Returns:
        ``None`` when ``value`` is empty or consists only of alphabetic
        characters; otherwise ``int(value)``.

    Raises:
        ValueError: if ``value`` is neither empty, purely alphabetic, nor a
            valid integer literal (for example ``'1,000'`` or ``'3.5'``).
    """
    # treat empty or purely alphabetic values as missing
    if value == '' or value.isalpha():
        return None
    return int(value)