|
18 | 18 | from streamline.utils.job import Job |
19 | 19 | from streamline.utils.dataset import Dataset |
20 | 20 | from streamline.dataprep.kfold_partitioning import KFoldPartitioner |
21 | | -from scipy.stats import chi2_contingency, mannwhitneyu, skew, kurtosis, f_oneway, spearmanr |
| 21 | +from scipy.stats import chi2_contingency, mannwhitneyu, skew, kurtosis, f_oneway, kruskal, spearmanr |
22 | 22 | import seaborn as sns |
23 | 23 | import warnings |
24 | 24 |
|
@@ -1266,69 +1266,82 @@ def univariate_analysis(self, top_features=20): |
1266 | 1266 |
|
1267 | 1267 | def test_selector(self, feature_name): |
1268 | 1268 | """ |
1269 | | - Selects and applies appropriate univariate association test for a given feature. Returns resulting p-value |
| 1269 | + Selects and applies appropriate univariate association test for a given feature. |
| 1270 | + Returns resulting p-value, test statistic, and test name. |
1270 | 1271 |
|
1271 | 1272 | Args: |
1272 | 1273 | feature_name: name of feature column operation is running on |
1273 | 1274 | """ |
1274 | 1275 | outcome_label = self.dataset.outcome_label |
1275 | 1276 | p_val, test_stat, test_name = None, None, None |
| 1277 | + |
| 1278 | + x = self.dataset.data[feature_name] |
| 1279 | + y = self.dataset.data[outcome_label] |
| 1280 | + |
1276 | 1281 | try: |
1277 | | - if self.outcome_type == "Binary" or self.outcome_type == "Multiclass": |
1278 | | - # test_name, test_stat = None, None |
1279 | | - # Feature and outcome both are discrete/categorical/binary |
| 1282 | + if self.outcome_type in ["Binary", "Multiclass"]: |
| 1283 | + # Outcome is discrete/categorical/binary |
1280 | 1284 | if feature_name in self.dataset.categorical_variables: |
1281 | | - # Calculate Contingency Table - Counts |
1282 | | - table_temp = pd.crosstab(self.dataset.data[feature_name], self.dataset.data[outcome_label]) |
| 1285 | + # Feature and outcome both categorical -> Chi-square test of independence (non-parametric) |
| 1286 | + table_temp = pd.crosstab(x, y) |
1283 | 1287 | # Univariate association test (Chi Square Test of Independence - Non-parametric) |
1284 | 1288 | c, p, dof, expected = chi2_contingency(table_temp) |
1285 | 1289 | p_val = p |
1286 | 1290 | test_stat = c |
1287 | 1291 | test_name = "Chi Square Test" |
1288 | | - # Feature is continuous and Outcome is discrete/categorical/binary |
| 1292 | + |
1289 | 1293 | else: |
1290 | | - if len(self.dataset.data[outcome_label].unique()) == 2: |
1291 | | - # Univariate association test (Mann-Whitney Test - Non-parametric) |
1292 | | - c, p = mannwhitneyu( |
1293 | | - x=self.dataset.data[feature_name].loc[self.dataset.data[outcome_label] == 0], |
1294 | | - y=self.dataset.data[feature_name].loc[self.dataset.data[outcome_label] == 1], |
1295 | | - nan_policy='omit') |
| 1294 | + # Feature continuous, outcome categorical/binary |
| 1295 | + unique_outcomes = y.dropna().unique() |
| 1296 | + |
| 1297 | + if len(unique_outcomes) == 2: |
| 1298 | + # Mann-Whitney U Test (non-parametric) |
| 1299 | + group0 = x[y == unique_outcomes[0]].dropna() |
| 1300 | + group1 = x[y == unique_outcomes[1]].dropna() |
| 1301 | + |
| 1302 | + c, p = mannwhitneyu(group0, group1, alternative="two-sided") |
1296 | 1303 | p_val = p |
1297 | 1304 | test_stat = c |
1298 | 1305 | test_name = "Mann-Whitney U Test" |
1299 | 1306 | else: |
1300 | | - categories = list(self.dataset.data[outcome_label].unique()) |
1301 | | - samples = [self.dataset.data[feature_name].loc[self.dataset.data[outcome_label] == cat] |
1302 | | - for cat in categories] |
1303 | | - c, p = f_oneway(*samples) |
| 1307 | + # >2 outcome categories: Kruskal-Wallis H Test (non-parametric alternative to one-way ANOVA) |
| 1308 | + samples = [ |
| 1309 | + x[y == cat].dropna() |
| 1310 | + for cat in unique_outcomes |
| 1311 | + ] |
| 1312 | + c, p = kruskal(*samples) |
1304 | 1313 | p_val = p |
1305 | 1314 | test_stat = c |
1306 | | - test_name = "Analysis of Variance" |
| 1315 | + test_name = "Kruskal-Wallis H Test" |
| 1316 | + |
1307 | 1317 | elif self.outcome_type == "Continuous": |
1308 | | - # Feature is discrete/categorical/binary and Outcome is continuous |
| 1318 | + # Outcome continuous |
1309 | 1319 | if feature_name in self.dataset.categorical_variables: |
1310 | | - categories = list(self.dataset.data[feature_name].unique()) |
1311 | | - samples = [self.dataset.data[outcome_label].loc[self.dataset.data[feature_name] == cat] |
1312 | | - for cat in categories] |
1313 | | - c, p = f_oneway(*samples) |
| 1320 | + # Categorical feature, continuous outcome -> Kruskal-Wallis (non-parametric) |
| 1321 | + categories = x.dropna().unique() |
| 1322 | + samples = [ |
| 1323 | + y[x == cat].dropna() |
| 1324 | + for cat in categories |
| 1325 | + ] |
| 1326 | + c, p = kruskal(*samples) |
1314 | 1327 | p_val = p |
1315 | 1328 | test_stat = c |
1316 | | - test_name = "Analysis of Variance" |
1317 | | - # Feature is continuous and Outcome is continuous |
| 1329 | + test_name = "Kruskal-Wallis H Test" |
1318 | 1330 | else: |
1319 | | - # Univariate association test (Mann-Whitney Test - Non-parametric) |
1320 | | - res = spearmanr( |
1321 | | - a=self.dataset.data[feature_name], |
1322 | | - b=self.dataset.data[outcome_label], nan_policy='omit') |
| 1331 | + # Both continuous -> Spearman correlation (non-parametric) |
| 1332 | + res = spearmanr(x, y, nan_policy="omit") |
1323 | 1333 | c, p = res.statistic, res.pvalue |
1324 | 1334 | p_val = p |
1325 | 1335 | test_stat = c |
1326 | 1336 | test_name = "Spearman Correlation" |
| 1337 | + |
1327 | 1338 | except Exception as e: |
1328 | 1339 | logging.error(e) |
1329 | | - raise Exception("scipy error, check if you've install correct version of scipy") |
| 1340 | + raise Exception("scipy error, check if you've installed correct version of scipy") from e |
| 1341 | + |
1330 | 1342 | return p_val, test_stat, test_name |
1331 | 1343 |
|
| 1344 | + |
1332 | 1345 | def univariate_plots(self, sorted_p_list=None, top_features=20): |
1333 | 1346 | """ |
1334 | 1347 | Checks whether p-value of each feature is less than or equal to significance cutoff. |
|
0 commit comments