-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processor.py
More file actions
135 lines (111 loc) · 4.94 KB
/
Copy pathdata_processor.py
File metadata and controls
135 lines (111 loc) · 4.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import pandas as pd
import numpy as np
class DataProcessor:
    """Clean, enrich, filter and summarise an HR attrition dataset.

    The processor keeps a private copy of the frame it is given, so none of
    its methods mutate the caller's data.
    """

    # Bin edges are shared by ``process_data`` (labelling) and
    # ``apply_filters`` (age filtering) so the two can never disagree.
    # Bins are half-open ``(lower, upper]``; the last edge is infinite
    # because the final label of each scale is open-ended ("55+", "10+ years").
    _AGE_BINS = (0, 25, 35, 45, 55, float('inf'))
    _AGE_LABELS = ('18-25', '26-35', '36-45', '46-55', '55+')

    # (new column, source column, bin edges, labels), in output order.
    _BINNED_COLUMNS = (
        ('AgeGroup', 'Age', _AGE_BINS, _AGE_LABELS),
        ('TenureGroup', 'YearsAtCompany',
         (0, 2, 5, 10, float('inf')),
         ('0-2 years', '3-5 years', '6-10 years', '10+ years')),
        ('IncomeGroup', 'MonthlyIncome',
         (0, 3000, 6000, 10000, float('inf')),
         ('Low', 'Medium', 'High', 'Very High')),
    )

    _EDUCATION_LABELS = {
        1: 'Below College',
        2: 'College',
        3: 'Bachelor',
        4: 'Master',
        5: 'Doctor',
    }
    # Satisfaction and involvement ratings use the same four-point scale.
    _FOUR_POINT_LABELS = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
    _WORK_LIFE_LABELS = {1: 'Bad', 2: 'Good', 3: 'Better', 4: 'Best'}
    _PERFORMANCE_LABELS = {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'}

    # (new column, source column, code -> label mapping), in output order.
    _MAPPED_COLUMNS = (
        ('EducationLevel', 'Education', _EDUCATION_LABELS),
        ('JobSatisfactionLevel', 'JobSatisfaction', _FOUR_POINT_LABELS),
        ('EnvironmentSatisfactionLevel', 'EnvironmentSatisfaction', _FOUR_POINT_LABELS),
        ('RelationshipSatisfactionLevel', 'RelationshipSatisfaction', _FOUR_POINT_LABELS),
        ('WorkLifeBalanceLevel', 'WorkLifeBalance', _WORK_LIFE_LABELS),
        ('JobInvolvementLevel', 'JobInvolvement', _FOUR_POINT_LABELS),
        ('PerformanceRatingLevel', 'PerformanceRating', _PERFORMANCE_LABELS),
    )

    def __init__(self, df):
        """Store a defensive copy of ``df`` as ``self.df``."""
        self.df = df.copy()

    def process_data(self):
        """Return a copy of the stored frame with derived label columns added.

        Adds the binned columns ``AgeGroup``, ``TenureGroup`` and
        ``IncomeGroup`` plus a ``*Level`` text column for each coded rating
        column. ``self.df`` itself is left untouched.

        Returns:
            A new DataFrame containing the original and the derived columns.
            Codes that are absent from a mapping become NaN.

        Raises:
            KeyError: if a required source column is missing.
        """
        df = self.df.copy()

        for target, source, edges, labels in self._BINNED_COLUMNS:
            # include_lowest keeps a value equal to the first edge (e.g. zero
            # years at the company) in the first bin instead of turning it
            # into NaN.
            df[target] = pd.cut(
                df[source],
                bins=list(edges),
                labels=list(labels),
                include_lowest=True,
            )

        for target, source, mapping in self._MAPPED_COLUMNS:
            df[target] = df[source].map(mapping)

        return df

    def apply_filters(self, df, department, job_role, age_group, education, gender):
        """Return the rows of ``df`` that satisfy every active filter.

        Each filter argument is either the sentinel ``'All'`` (filter
        disabled) or the value to keep.

        Args:
            df: Frame to filter; it is not modified.
            department: Value matched against the ``Department`` column.
            job_role: Value matched against the ``JobRole`` column.
            age_group: One of the ``AgeGroup`` labels ('18-25' ... '55+').
                An unrecognised label leaves the rows unfiltered.
            education: Numeric ``Education`` code, or an ``EducationLevel``
                label when ``df`` has that column.
            gender: Value matched against the ``Gender`` column.

        Returns:
            A new DataFrame holding the matching rows (original index kept).
        """
        filtered_df = df.copy()

        equality_filters = (
            ('Department', department),
            ('JobRole', job_role),
        )
        for column, wanted in equality_filters:
            if wanted != 'All':
                filtered_df = filtered_df[filtered_df[column] == wanted]

        if age_group != 'All' and age_group in self._AGE_LABELS:
            position = self._AGE_LABELS.index(age_group)
            ages = filtered_df['Age']
            # Same (lower, upper] intervals that process_data uses for AgeGroup.
            in_group = ages <= self._AGE_BINS[position + 1]
            if position > 0:
                in_group &= ages > self._AGE_BINS[position]
            filtered_df = filtered_df[in_group]

        if education != 'All':
            matches = filtered_df['Education'] == education
            if 'EducationLevel' in filtered_df.columns:
                # Accept the human-readable label as well as the numeric code.
                matches = matches | (filtered_df['EducationLevel'] == education)
            filtered_df = filtered_df[matches]

        if gender != 'All':
            filtered_df = filtered_df[filtered_df['Gender'] == gender]

        return filtered_df

    def get_attrition_rate(self, df, group_by_column):
        """Compute head-count, leavers and attrition rate per group.

        Args:
            df: Frame with an ``Attrition`` column holding 'Yes' for leavers.
            group_by_column: Name of the column to group by.

        Returns:
            A DataFrame with the columns ``group_by_column``, ``Total``
            (rows with a non-null ``Attrition``), ``Attrition_Count`` and
            ``Attrition_Rate`` (percentage, two decimals). An empty DataFrame
            is returned when ``group_by_column`` is not in ``df``.
        """
        if group_by_column not in df.columns:
            return pd.DataFrame()

        keys = df[group_by_column]
        attrition = df['Attrition']

        # observed=True: for categorical keys (e.g. AgeGroup) only report the
        # categories that actually occur, instead of empty groups whose rate
        # would be 0/0.
        total = attrition.groupby(keys, observed=True).count()
        leavers = attrition.eq('Yes').groupby(keys, observed=True).sum().astype(int)

        result = pd.DataFrame({'Total': total, 'Attrition_Count': leavers})
        rate = result['Attrition_Count'] / result['Total'] * 100
        # A group can still have Total == 0 when all its Attrition values are
        # null; report 0 rather than NaN, as get_summary_statistics does.
        result['Attrition_Rate'] = rate.where(result['Total'] > 0, 0.0).round(2)
        return result.reset_index()

    def get_summary_statistics(self, df):
        """Return headline statistics for ``df`` as a plain dict.

        Keys: ``total_employees``, ``attrition_count``, ``attrition_rate``
        (percentage; 0 for an empty frame), ``avg_age``,
        ``avg_monthly_income``, ``avg_years_at_company``, ``departments`` and
        ``job_roles`` (numbers of distinct values).
        """
        total = len(df)
        leavers = int((df['Attrition'] == 'Yes').sum())

        return {
            'total_employees': total,
            'attrition_count': leavers,
            # Guard the division so an empty selection reports 0 instead of failing.
            'attrition_rate': leavers / total * 100 if total > 0 else 0,
            'avg_age': df['Age'].mean(),
            'avg_monthly_income': df['MonthlyIncome'].mean(),
            'avg_years_at_company': df['YearsAtCompany'].mean(),
            'departments': df['Department'].nunique(),
            'job_roles': df['JobRole'].nunique(),
        }