-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleaning_fifa.sql
More file actions
265 lines (224 loc) · 9.8 KB
/
data_cleaning_fifa.sql
File metadata and controls
265 lines (224 loc) · 9.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/* =============================================================================
FIFA DATASET: DATA CLEANING SCRIPT
=============================================================================
Description: This script cleans and standardizes the raw FIFA players dataset.
Tools: PostgreSQL
=============================================================================
*/
-- -----------------------------------------------------------------------------
-- 1. DATABASE SETUP & EXTENSIONS
-- -----------------------------------------------------------------------------
-- Create a working copy of the raw data to protect the original dataset
CREATE TABLE fifa_rt AS
SELECT * FROM fifa_ratings;
-- Enable the fuzzystrmatch extension for typo detection (used in EDA phase)
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
-- -----------------------------------------------------------------------------
-- 2. TEXT NORMALIZATION (Names, Nationalities, Clubs)
-- -----------------------------------------------------------------------------
-- Trim whitespaces from names
UPDATE fifa_rt
SET long_name = TRIM(long_name);
-- Standardize nationalities to uppercase
UPDATE fifa_rt
SET nationality = UPPER(nationality);
-- Strip hidden characters (newlines, tabs) from club names using Regex
UPDATE fifa_rt
SET club = REGEXP_REPLACE(club, '[\n\r\t]+', '', 'g');
-- -----------------------------------------------------------------------------
-- 3. CONTRACT PARSING & FEATURE ENGINEERING
-- -----------------------------------------------------------------------------
-- Add new columns for the split contract data (assuming they don't exist yet)
ALTER TABLE fifa_rt
ADD COLUMN contract_start text,
ADD COLUMN contract_end text,
ADD COLUMN contract_type text,
ADD COLUMN loan_end_date text;
-- Extract contract start year
UPDATE fifa_rt
SET contract_start = CASE
WHEN contract LIKE '%~%' THEN LEFT(contract, 4)
WHEN contract LIKE '%Loan%' THEN NULL
ELSE NULL
END;
-- Extract contract end year
UPDATE fifa_rt
SET contract_end = CASE
WHEN contract LIKE '%~%' THEN RIGHT(contract, 4)
WHEN contract LIKE '%Loan%' THEN NULL
ELSE NULL
END;
-- Define the contract type
UPDATE fifa_rt
SET contract_type = CASE
WHEN contract LIKE '%Loan%' THEN 'On Loan'
WHEN contract LIKE '%Free%' THEN 'Free'
ELSE 'Regular'
END;
-- Extract loan end date
UPDATE fifa_rt
SET loan_end_date = CASE
WHEN contract LIKE '%Loan%' THEN LEFT(contract, 12)
ELSE NULL
END;
-- Convert extracted text dates to integer/date types
UPDATE fifa_rt SET loan_end_date = TO_DATE(loan_end_date, 'Mon DD, YYYY');
ALTER TABLE fifa_rt
ALTER COLUMN contract_start TYPE int USING contract_start::integer,
ALTER COLUMN contract_end TYPE int USING contract_end::integer,
ALTER COLUMN loan_end_date TYPE date USING loan_end_date::date;
-- Drop the original messy contract column and the redundant loan_date column
ALTER TABLE fifa_rt DROP COLUMN contract;
ALTER TABLE fifa_rt DROP COLUMN loan_date;
-- Extract primary position from the comma-separated positions list
ALTER TABLE fifa_rt ADD COLUMN primary_position text;
UPDATE fifa_rt
SET primary_position = SPLIT_PART(positions, ',', 1);
-- -----------------------------------------------------------------------------
-- 4. DATE CONVERSIONS
-- -----------------------------------------------------------------------------
-- Standardize the 'joined' column to proper DATE format
UPDATE fifa_rt
SET joined = TO_DATE(joined, 'DD-Mon-YY');
ALTER TABLE fifa_rt
ALTER COLUMN joined TYPE date USING joined::date;
-- -----------------------------------------------------------------------------
-- 5. UNIT STANDARDIZATION (Height & Weight)
-- -----------------------------------------------------------------------------
-- Clean Height: Separate cm from feet/inches and standardise notation
UPDATE fifa_rt
SET height = CASE
WHEN height LIKE '%cm%' THEN LEFT(height, 3)
ELSE REPLACE(height, '''', '.')
END;
ALTER TABLE fifa_rt ALTER COLUMN height TYPE numeric(5,2) USING height::numeric(5,2);
-- Convert imperial height (feet.inches) to metric (cm)
UPDATE fifa_rt
SET height = ROUND(
(TRUNC(height) * 30.48) +
CASE
WHEN TRUNC(height) = 5 AND (height - TRUNC(height)) < 0.2 THEN (height - TRUNC(height)) * 100 * 2.54
ELSE (height - TRUNC(height)) * 10 * 2.54
END
)
WHERE height < 7;
ALTER TABLE fifa_rt ALTER COLUMN height TYPE int USING height::int;
-- Clean Weight: Convert lbs to kg
UPDATE fifa_rt
SET weight = CASE
WHEN weight LIKE '%kg%' THEN REPLACE(weight, 'kg', '')::numeric
ELSE REPLACE(weight, 'lbs', '')::numeric * 0.45359
END;
ALTER TABLE fifa_rt ALTER COLUMN weight TYPE int USING ROUND(weight::numeric)::int;
-- -----------------------------------------------------------------------------
-- 6. FINANCIAL DATA CLEANING (Value, Wage, Release Clause)
-- -----------------------------------------------------------------------------
-- Standardize Value to millions (M)
UPDATE fifa_rt
SET value = CASE
WHEN value LIKE '%M%' THEN REPLACE(REPLACE(value, '€', ''), 'M', '')::numeric
WHEN value LIKE '%K%' THEN REPLACE(REPLACE(value, '€', ''), 'K', '')::numeric / 1000
ELSE REPLACE(value, '€', '')::numeric
END;
ALTER TABLE fifa_rt ALTER COLUMN value TYPE numeric(8,5) USING value::numeric(8,5);
-- Standardize Wage to thousands (K)
UPDATE fifa_rt
SET wage = CASE
WHEN wage LIKE '%K%' THEN REPLACE(REPLACE(wage, '€', ''), 'K', '')::numeric
ELSE REPLACE(wage, '€', '')::numeric / 1000
END;
ALTER TABLE fifa_rt ALTER COLUMN wage TYPE numeric(8,5) USING wage::numeric(8,5);
-- Standardize Release Clause to millions (M)
UPDATE fifa_rt
SET release_clause = CASE
WHEN release_clause LIKE '%M%' THEN REPLACE(REPLACE(release_clause, '€', ''), 'M', '')::numeric
WHEN release_clause LIKE '%K%' THEN REPLACE(REPLACE(release_clause, '€', ''), 'K', '')::numeric / 1000
ELSE REPLACE(release_clause, '€', '')::numeric
END;
ALTER TABLE fifa_rt ALTER COLUMN release_clause TYPE numeric(8,5) USING release_clause::numeric(8,5);
-- -----------------------------------------------------------------------------
-- 7. MISSING TEXT NORMALIZATION (EDA findings)
-- -----------------------------------------------------------------------------
-- Trim whitespaces from nationalities
UPDATE fifa_rt
SET nationality = TRIM(nationality);
-- -----------------------------------------------------------------------------
-- 8. DATA QUALITY CHECKS
-- -----------------------------------------------------------------------------
-- Check if 'id' is unique for every player
WITH mik AS (
SELECT id, row_number() over(partition by id) as row_num
FROM fifa_rt
)
SELECT * FROM mik WHERE row_num > 1;
-- Check for duplicate players under different IDs
WITH mik AS (
SELECT name, long_name, nationality,
row_number() over(partition by name, long_name, nationality, age) as row_num
FROM fifa_rt
)
SELECT * FROM mik WHERE row_num > 1;
-- -----------------------------------------------------------------------------
-- 9. FUZZY MATCHING: TYPO DETECTION IN NATIONALITIES & CLUBS
-- -----------------------------------------------------------------------------
-- Detect spelling mistakes in nationalities using Levenshtein distance
WITH unique_nationalities AS (
SELECT DISTINCT nationality
FROM fifa_rt
WHERE nationality IS NOT NULL
)
SELECT a.nationality AS word1, b.nationality AS word2, levenshtein(a.nationality, b.nationality) AS distance
FROM unique_nationalities a
JOIN unique_nationalities b ON a.nationality < b.nationality
WHERE levenshtein(a.nationality, b.nationality) BETWEEN 1 AND 2
ORDER BY distance, a.nationality;
-- Detect spelling mistakes in nationalities using Similarity score
WITH unique_nationalities AS (
SELECT DISTINCT nationality
FROM fifa_rt
WHERE nationality IS NOT NULL
)
SELECT a.nationality AS word1, b.nationality AS word2, similarity(a.nationality, b.nationality) AS sim_score
FROM unique_nationalities a
JOIN unique_nationalities b ON a.nationality < b.nationality
WHERE similarity(a.nationality, b.nationality) > 0.3
ORDER BY sim_score DESC;
-- Detect spelling mistakes in clubs using Levenshtein distance
WITH unique_clubs AS (
SELECT DISTINCT club FROM fifa_rt
)
SELECT a.club AS word1, b.club AS word2, levenshtein(a.club, b.club) AS distance
FROM unique_clubs a
JOIN unique_clubs b ON a.club < b.club
WHERE levenshtein(a.club, b.club) BETWEEN 1 AND 3
ORDER BY distance, a.club;
-- Detect spelling mistakes in clubs using Similarity score
WITH unique_clubs AS (
SELECT DISTINCT club FROM fifa_rt
)
SELECT a.club AS word1, b.club AS word2, similarity(a.club, b.club) AS sim_score
FROM unique_clubs a
JOIN unique_clubs b ON a.club < b.club
WHERE similarity(a.club, b.club) > 0.5
ORDER BY sim_score DESC;
-- -----------------------------------------------------------------------------
-- 10. COLUMN VALIDATION (POSITIONS)
-- -----------------------------------------------------------------------------
-- Validate values in the existing "best_position " column
SELECT "best_position "
FROM fifa_rt
WHERE "best_position " NOT IN ('GK','CB','LB','RB','LWB','RWB','LM','CM','CDM','RM','CAM','LW','RW','LF','RF','ST','CF');
-- Check maximum length of the best_position to ensure no multiple positions exist
SELECT max(length(trim("best_position "))) FROM fifa_rt;
-- Validate values in the complex 'positions' column
SELECT positions
FROM fifa_rt
WHERE positions NOT LIKE '%GK%' AND positions NOT LIKE '%CB%'
AND positions NOT LIKE '%LB%' AND positions NOT LIKE '%RB%'
AND positions NOT LIKE '%LWB%' AND positions NOT LIKE '%RWB%'
AND positions NOT LIKE '%LM%' AND positions NOT LIKE '%CM%'
AND positions NOT LIKE '%CDM%' AND positions NOT LIKE '%RM%'
AND positions NOT LIKE '%CAM%' AND positions NOT LIKE '%LW%'
AND positions NOT LIKE '%RW%' AND positions NOT LIKE '%LF%'
AND positions NOT LIKE '%RF%' AND positions NOT LIKE '%ST%'
AND positions NOT LIKE '%CF%';