-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathetl.py
More file actions
94 lines (78 loc) · 4.15 KB
/
etl.py
File metadata and controls
94 lines (78 loc) · 4.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import numpy as np
import pandas as pd
import os
from decouple import config
import json
COLS_TO_USE = json.loads(config('COLS_TO_USE'))
FILE_NAME = config('TRACE_DATA_FILE_NAME')
COURSE_NAME = config('NAME_OF_COURSE')
LIST_IDS_TO_DELETE = json.loads(config('USERS_IDS_TO_DELETE'))
nan_value = float("NaN")
df = pd.read_excel(FILE_NAME, usecols= COLS_TO_USE)
# Get user id as case id
df['case_id'] = df['Description'].str.split("'").str[1]
# Fix timestamp format
df['timestamp'] = df['Time'].str.replace(r',', '')
# extract the activity name from the event context
df['activity'] = df['Event context'].str.split(':').str[1]
# Conditions to delete not important rows
system_logs = df[(df['Component'] == 'System') & (df['Event name'] == 'Course viewed')].index
system_course_reports = df[(df['Component'] == 'System') & (df['Event name'] == 'Course user report viewed')].index
system_grades_reports = df[(df['Event name'] == 'Grade user report viewed')].index
system_tours = df[(df['Component'] == 'User tours')].index
system_rewards = df[(df['Component'] == 'Stash')].index
system_grades = df[(df['Component'] == 'System') & (df['Event name'] == 'User graded')].index
system_evidences = df[(df['Component'] == 'System') & (df['Event name'] == 'Evidence created.')].index
system_rewards2 = df[(df['Component'] == 'Page') & (df['Event context'] == 'Page: Reward')].index
system_profile = df[(df['Component'] == 'System') & (df['Event name'] == 'User profile viewed')].index
system_profile2 = df[(df['Component'] == 'System') & (df['Event name'] == 'User list viewed')].index
component_formun = df[(df['Component'] == 'Forum')].index
component_choise = df[(df['Component'] == 'Choice')].index
# Delete rows by conditions.
df.drop(system_logs , inplace=True)
df.drop(system_tours , inplace=True)
df.drop(system_rewards , inplace=True)
df.drop(system_grades , inplace=True)
df.drop(system_evidences , inplace=True)
df.drop(system_rewards2 , inplace=True)
df.drop(system_profile , inplace=True)
df.drop(system_profile2 , inplace=True)
df.drop(system_course_reports , inplace=True)
df.drop(system_grades_reports , inplace=True)
df.drop(component_formun , inplace=True)
df.drop(component_choise , inplace=True)
# Extract status of the event name.
df['Event name'] = df['Event name'].str.replace('.', '', regex=True)
df['Event name'] = df['Event name'].str.replace(' ', ':', regex=True)
df['Event name'] = df['Event name'].str[-10:]
df['Event name'] = df['Event name'].str.split(':').str[1]
# Rename columns to match event logs creteria
df.rename(columns={'Component': 'resource', 'Event name': 'status'}, inplace=True)
# delete status 'uploaded' and 'created'
status_uploaded = df[(df['status'] == 'uploaded') | (df['status'] == 'created') | (df['status'] == 'posted')].index
df.drop(status_uploaded , inplace=True)
#delete empty rows
df.replace("", nan_value, inplace=True)
df.dropna(subset = ["activity", "status", "case_id", "timestamp"], inplace=True)
empty_rows= df[(df['activity'] == '') | (df['status'] == '') | (df['case_id'] == '') | (df['timestamp'] == '')].index
df.drop(empty_rows , inplace=True)
# change status
df['status'].loc[(df['resource'] == 'System') & (df['status'] == 'updated')] = 'completed'
df['status'].loc[(df['status'] == 'submitted')] = 'compleated'
# delete other status
status_not_view_or_compleated = df[(df['status'] != 'completed') & (df['status'] != 'viewed')].index
df.drop(status_not_view_or_compleated , inplace=True)
# delete case ids of teachers and test users
case_ids_teachers_serie = df[df['case_id'].isin(LIST_IDS_TO_DELETE)].index
df.drop(case_ids_teachers_serie, inplace=True)
# Delete innecesary columns
df.drop(columns=['Description', 'Event context', 'Time'], inplace=True)
df.replace("", nan_value, inplace=True)
df.dropna(how='all', axis=1, inplace=True)
df.drop_duplicates(['resource', 'status', 'case_id', 'timestamp', 'activity'], keep='first', inplace=True)
activity_course_load = df[(df['activity'] == COURSE_NAME)].index
df.drop(activity_course_load , inplace=True)
# output file
compression_opts = dict(method='zip', archive_name= config('OUTPUT_FILE_NAME') +'.csv')
df.to_csv(config('OUTPUT_FILE_NAME')+'.zip' , index=False,compression=compression_opts)
df.head()