-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_dataset_with_posts_until_2015.py
More file actions
59 lines (34 loc) · 2.47 KB
/
Copy pathget_dataset_with_posts_until_2015.py
File metadata and controls
59 lines (34 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import argparse, json, os, pandas as pd
from datetime import datetime
DATASET_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
ARGS_DATE_FORMAT = '%Y-%m-%d'
def remove_bots_posts(dataset):
    """Return the records of *dataset* that were not written by a known bot.

    Each record is expected to be a mapping. A record is kept when it has no
    'author' key, when its 'author' value is None, or when the author's
    'name' is not one of the known bot account names listed below.

    Args:
        dataset: iterable of post records (mappings).

    Returns:
        A new list with the surviving records, in their original order.
    """
    bots = {"AutoModerator", "RemindMeBot", "WikiTextBot", "youtubefactsbot", "RedditNiobioBot", "NemLiNemLereiBot"}

    def is_human_post(data):
        # Records are indexed by key, so the presence check must be a key
        # lookup; hasattr() on a dict never sees its keys, which made the
        # previous filter keep every record, bots included.
        author = data.get('author')
        if author is None:
            return True
        return author['name'] not in bots

    return [data for data in dataset if is_human_post(data)]
# Command-line interface. All four options are required:
#   --dataset      path of the input JSON file
#   --datasetName  name used as the prefix of every output file
#   --outputPath   directory that receives the output files
#   --years        one or more cut-off dates, parsed later with ARGS_DATE_FORMAT
parser = argparse.ArgumentParser(description='Splits a dataset into others using years as delimiter.')
parser.add_argument('--dataset', type=str, help='dataset path. A JSON file', required=True)
parser.add_argument('--datasetName', type=str, help='dataset name, used as the prefix of each output file name', required=True)
parser.add_argument('--outputPath', type=str, help='path to put the resulting split datasets', required=True)
# NOTE: argparse expands '%' in help strings, so the expected format is
# spelled out as YYYY-MM-DD rather than with strptime directives.
parser.add_argument('--years', nargs='+', help='cut-off dates (YYYY-MM-DD) to use as delimiters while splitting', required=True)
args = parser.parse_args()
# Load the raw dataset; the context manager closes the input file as soon
# as it has been parsed instead of leaving the handle open.
with open(args.dataset, 'r') as dataset_file:
    original_dataset = json.load(dataset_file)
print("Original row count: ", len(original_dataset))

# Drop the posts written by known bot accounts.
original_dataset = remove_bots_posts(original_dataset)
print("Row count after bots' posts removal: ", len(original_dataset))

# The remaining clean-up steps filter on the 'body' column of a DataFrame.
original_data_frame = pd.DataFrame.from_dict(original_dataset)

# Keep only the first occurrence of each distinct body text.
df_without_duplicates = original_data_frame.drop_duplicates(subset=['body'], keep='first')
print("Row count after duplicates removal: ", len(df_without_duplicates))

# Drop rows whose body is one of the "[deleted]" / "[removed]" placeholders.
df_deleted_posts_removed = df_without_duplicates[df_without_duplicates.body != "[deleted]"]
df_removed_posts_removed = df_deleted_posts_removed[df_deleted_posts_removed.body != "[removed]"]
print("Row count after deleted/removed posts removal: ", len(df_removed_posts_removed))

# Drop rows whose body is the empty string.
df_empty_posts_removed = df_removed_posts_removed[df_removed_posts_removed.body != ""]
print("Row count after empty posts removal: ", len(df_empty_posts_removed))

# Back to a list of per-row dicts for the date-based split.
original_dataset = df_empty_posts_removed.to_dict(orient='records')
# Parse every record's timestamp once, up front, instead of re-parsing the
# whole dataset for each cut-off date.
dated_records = [
    (datetime.strptime(record['date'], DATASET_DATE_FORMAT), record)
    for record in original_dataset
]

# Write one JSON file per cut-off date, holding every record dated at or
# before that cut-off.
for year_string in args.years:
    # strptime with a date-only format yields midnight of that day.
    # NOTE(review): records later on the cut-off day itself are therefore
    # excluded by the <= comparison - confirm this is intended.
    year = datetime.strptime(year_string, ARGS_DATE_FORMAT)
    year_dataset = [record for posted_at, record in dated_records if posted_at <= year]
    print(f'{year} dataset length: {len(year_dataset)}')

    path = os.path.join(args.outputPath, f'{args.datasetName}_[until_{year_string}_dataset].json')
    output_dir = os.path.dirname(path)
    # dirname is '' when the file lands in the current directory, and
    # os.makedirs('') raises, so only create a directory when one is named.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # The context manager flushes and closes the file before the next
    # iteration (and before the final message is printed).
    with open(path, 'w') as output_file:
        json.dump(year_dataset, output_file)

print(f'Datasets saved to "{args.outputPath}" folder.')