-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathData Cleaning and Preprocessing.sql
More file actions
177 lines (149 loc) · 5.25 KB
/
Data Cleaning and Preprocessing.sql
File metadata and controls
177 lines (149 loc) · 5.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
-- 1. Data Cleaning and Preprocessing
USE parch_posey;
-- Set primary keys
ALTER TABLE accounts
ADD CONSTRAINT PK_Accounts PRIMARY KEY (id);
ALTER TABLE orders
ADD CONSTRAINT PK_Orders PRIMARY KEY (id);
ALTER TABLE region
ADD CONSTRAINT PK_Region PRIMARY KEY (id);
ALTER TABLE sales_reps
ADD CONSTRAINT PK_SalesRep PRIMARY KEY (id);
ALTER TABLE web_events
ADD CONSTRAINT PK_WebEvents PRIMARY KEY (id);
-- Foreign Keys setup
-- Link orders to accounts
ALTER TABLE orders
ADD CONSTRAINT FK_Orders_Accounts FOREIGN KEY (account_id)
REFERENCES accounts (id);
-- Link sales_rep to region
ALTER TABLE sales_reps
ADD CONSTRAINT FK_SalesRep_Region FOREIGN KEY (region_id)
REFERENCES region (id);
-- Link accounts to sales_rep
ALTER TABLE accounts
ADD CONSTRAINT FK_Accounts_SalesRep FOREIGN KEY (sales_rep_id)
REFERENCES sales_reps (id);
-- Link web_events to accounts
ALTER TABLE web_events
ADD CONSTRAINT FK_WebEvents_Accounts FOREIGN KEY (account_id)
REFERENCES accounts (id);
-- Are there any missing values in the dataset?
-- Accounts table
SELECT
'id' AS ColumnName, COUNT(*) AS TotalRows, SUM(CASE WHEN id IS NULL THEN 1 ELSE 0 END) AS NullCount
FROM accounts
UNION ALL
SELECT 'name', COUNT(*), SUM(CASE WHEN name IS NULL THEN 1 ELSE 0 END)
FROM accounts
UNION ALL
SELECT 'website', COUNT(*), SUM(CASE WHEN website IS NULL THEN 1 ELSE 0 END)
FROM accounts
UNION ALL
SELECT 'lat', COUNT(*), SUM(CASE WHEN lat IS NULL THEN 1 ELSE 0 END)
FROM accounts
UNION ALL
SELECT 'long', COUNT(*), SUM(CASE WHEN long IS NULL THEN 1 ELSE 0 END)
FROM accounts
UNION ALL
SELECT 'primary_poc', COUNT(*), SUM(CASE WHEN primary_poc IS NULL THEN 1 ELSE 0 END)
FROM accounts
UNION ALL
SELECT 'sales_rep_id', COUNT(*), SUM(CASE WHEN sales_rep_id IS NULL THEN 1 ELSE 0 END)
FROM accounts;
-- Zero NUll count in all columns
-- Orders table
SELECT
'id' AS ColumnName, COUNT(*) AS TotalRows, SUM(CASE WHEN id IS NULL THEN 1 ELSE 0 END) AS NullCount
FROM orders
UNION ALL
SELECT 'account_id', COUNT(*), SUM(CASE WHEN account_id IS NULL THEN 1 ELSE 0 END)
FROM orders
UNION ALL
SELECT 'occurred_at', COUNT(*), SUM(CASE WHEN occurred_at IS NULL THEN 1 ELSE 0 END)
FROM orders
UNION ALL
SELECT 'standard_qty', COUNT(*), SUM(CASE WHEN standard_qty IS NULL THEN 1 ELSE 0 END)
FROM orders
UNION ALL
SELECT 'gloss_qty', COUNT(*), SUM(CASE WHEN gloss_qty IS NULL THEN 1 ELSE 0 END)
FROM orders
UNION ALL
SELECT 'poster_qty', COUNT(*), SUM(CASE WHEN poster_qty IS NULL THEN 1 ELSE 0 END)
FROM orders
UNION ALL
SELECT 'total_amt_usd', COUNT(*), SUM(CASE WHEN total_amt_usd IS NULL THEN 1 ELSE 0 END)
FROM orders;
SELECT * FROM orders WHERE standard_qty IS NULL;
-- standard_qty -> 825 Null values
-- gloss_qty -> 1018
-- poster_qty -> 1149
-- Region table
SELECT * FROM region;
-- No missing values in region
-- Sales reps table
SELECT * FROM sales_reps;
-- NO missing values in sales_reps
-- Web events table
SELECT
'id' AS ColumnName, COUNT(*) AS TotalRows, SUM(CASE WHEN id IS NULL THEN 1 ELSE 0 END) AS NullCount
FROM web_events
UNION ALL
SELECT 'account_id', COUNT(*), SUM(CASE WHEN account_id IS NULL THEN 1 ELSE 0 END)
FROM web_events
UNION ALL
SELECT 'occurred_at', COUNT(*), SUM(CASE WHEN occurred_at IS NULL THEN 1 ELSE 0 END)
FROM web_events
UNION ALL
SELECT 'channel', COUNT(*), SUM(CASE WHEN channel IS NULL THEN 1 ELSE 0 END)
FROM web_events;
-- No missing values
-- In orders table if all the three columns standard_qty, gloss_qty, poster_qty are zero then it is
-- considered as there is no order made so we can replace them with zero
UPDATE orders
SET standard_qty = 0,
gloss_qty = 0,
poster_qty = 0
WHERE standard_qty IS NULL
AND gloss_qty IS NULL
AND poster_qty IS NULL;
-- only 20 rows affected
SELECT * FROM orders WHERE standard_qty IS NULL;
SELECT * FROM orders WHERE gloss_qty IS NULL;
SELECT * FROM orders WHERE poster_qty IS NULL;
-- There is no pattern associated with null values
UPDATE orders
SET standard_qty = (SELECT AVG(CAST(standard_qty AS FLOAT)) FROM orders WHERE standard_qty IS NOT NULL)
WHERE standard_qty IS NULL
UPDATE orders
SET gloss_qty = (SELECT AVG(CAST(gloss_qty AS FLOAT)) FROM orders WHERE gloss_qty IS NOT NULL)
WHERE gloss_qty IS NULL
UPDATE orders
SET poster_qty = (SELECT AVG(CAST(poster_qty AS FLOAT)) FROM orders WHERE poster_qty IS NOT NULL)
WHERE poster_qty IS NULL;
-- Are there any duplicates in the dataset?
SELECT COUNT(*) as duplicated
FROM orders
GROUP BY id, account_id, occurred_at, standard_qty, gloss_qty, poster_qty, total_amt_usd
HAVING COUNT(*) > 1;
-- None
SELECT id, account_id, occurred_at, standard_qty, gloss_qty, poster_qty, total_amt_usd, COUNT(*) AS duplicate_count
FROM orders
GROUP BY id, account_id, occurred_at, standard_qty, gloss_qty, poster_qty, total_amt_usd
HAVING COUNT(*) > 1;
-- None
SELECT id, account_id, occurred_at, channel, COUNT(*) AS duplicate_count
FROM web_events
GROUP BY id, account_id, occurred_at, channel
HAVING COUNT(*) > 1;
-- None
-- There are no duplicated records
-- Are there any inconsistencies in categorical variables (e.g., spelling errors, mixed case)?
SELECT DISTINCT name FROM accounts;
-- All ok
-- All tables categorical values are correct
-- Are the data types for each column appropriate (e.g., date columns formatted as dates)?
SELECT occurred_at
FROM web_events
WHERE ISDATE(occurred_at) = 0;
-- Dates are in correct format