Skip to content

Commit 2a6f56d

Browse files
committed
changed anova to kruskal wallis
1 parent 3d8f108 commit 2a6f56d

2 files changed

Lines changed: 44 additions & 32 deletions

File tree

streamline/dataprep/data_process.py

Lines changed: 44 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from streamline.utils.job import Job
1919
from streamline.utils.dataset import Dataset
2020
from streamline.dataprep.kfold_partitioning import KFoldPartitioner
21-
from scipy.stats import chi2_contingency, mannwhitneyu, skew, kurtosis, f_oneway, spearmanr
21+
from scipy.stats import chi2_contingency, mannwhitneyu, skew, kurtosis, f_oneway, kruskal, spearmanr
2222
import seaborn as sns
2323
import warnings
2424

@@ -1266,69 +1266,82 @@ def univariate_analysis(self, top_features=20):
12661266

12671267
def test_selector(self, feature_name):
12681268
"""
1269-
Selects and applies appropriate univariate association test for a given feature. Returns resulting p-value
1269+
Selects and applies appropriate univariate association test for a given feature.
1270+
Returns resulting p-value, test statistic, and test name.
12701271
12711272
Args:
12721273
feature_name: name of feature column operation is running on
12731274
"""
12741275
outcome_label = self.dataset.outcome_label
12751276
p_val, test_stat, test_name = None, None, None
1277+
1278+
x = self.dataset.data[feature_name]
1279+
y = self.dataset.data[outcome_label]
1280+
12761281
try:
1277-
if self.outcome_type == "Binary" or self.outcome_type == "Multiclass":
1278-
# test_name, test_stat = None, None
1279-
# Feature and outcome both are discrete/categorical/binary
1282+
if self.outcome_type in ["Binary", "Multiclass"]:
1283+
# Outcome is discrete/categorical/binary
12801284
if feature_name in self.dataset.categorical_variables:
1281-
# Calculate Contingency Table - Counts
1282-
table_temp = pd.crosstab(self.dataset.data[feature_name], self.dataset.data[outcome_label])
1285+
# Feature and outcome both categorical -> Chi-square test of independence (non-parametric)
1286+
table_temp = pd.crosstab(x, y)
12831287
# Univariate association test (Chi Square Test of Independence - Non-parametric)
12841288
c, p, dof, expected = chi2_contingency(table_temp)
12851289
p_val = p
12861290
test_stat = c
12871291
test_name = "Chi Square Test"
1288-
# Feature is continuous and Outcome is discrete/categorical/binary
1292+
12891293
else:
1290-
if len(self.dataset.data[outcome_label].unique()) == 2:
1291-
# Univariate association test (Mann-Whitney Test - Non-parametric)
1292-
c, p = mannwhitneyu(
1293-
x=self.dataset.data[feature_name].loc[self.dataset.data[outcome_label] == 0],
1294-
y=self.dataset.data[feature_name].loc[self.dataset.data[outcome_label] == 1],
1295-
nan_policy='omit')
1294+
# Feature continuous, outcome categorical/binary
1295+
unique_outcomes = y.dropna().unique()
1296+
1297+
if len(unique_outcomes) == 2:
1298+
# Mann-Whitney U Test (non-parametric)
1299+
group0 = x[y == unique_outcomes[0]].dropna()
1300+
group1 = x[y == unique_outcomes[1]].dropna()
1301+
1302+
c, p = mannwhitneyu(group0, group1, alternative="two-sided")
12961303
p_val = p
12971304
test_stat = c
12981305
test_name = "Mann-Whitney U Test"
12991306
else:
1300-
categories = list(self.dataset.data[outcome_label].unique())
1301-
samples = [self.dataset.data[feature_name].loc[self.dataset.data[outcome_label] == cat]
1302-
for cat in categories]
1303-
c, p = f_oneway(*samples)
1307+
# >2 outcome categories: Kruskal-Wallis H Test (non-parametric alternative to one-way ANOVA)
1308+
samples = [
1309+
x[y == cat].dropna()
1310+
for cat in unique_outcomes
1311+
]
1312+
c, p = kruskal(*samples)
13041313
p_val = p
13051314
test_stat = c
1306-
test_name = "Analysis of Variance"
1315+
test_name = "Kruskal-Wallis H Test"
1316+
13071317
elif self.outcome_type == "Continuous":
1308-
# Feature is discrete/categorical/binary and Outcome is continuous
1318+
# Outcome continuous
13091319
if feature_name in self.dataset.categorical_variables:
1310-
categories = list(self.dataset.data[feature_name].unique())
1311-
samples = [self.dataset.data[outcome_label].loc[self.dataset.data[feature_name] == cat]
1312-
for cat in categories]
1313-
c, p = f_oneway(*samples)
1320+
# Categorical feature, continuous outcome -> Kruskal-Wallis (non-parametric)
1321+
categories = x.dropna().unique()
1322+
samples = [
1323+
y[x == cat].dropna()
1324+
for cat in categories
1325+
]
1326+
c, p = kruskal(*samples)
13141327
p_val = p
13151328
test_stat = c
1316-
test_name = "Analysis of Variance"
1317-
# Feature is continuous and Outcome is continuous
1329+
test_name = "Kruskal-Wallis H Test"
13181330
else:
1319-
# Univariate association test (Mann-Whitney Test - Non-parametric)
1320-
res = spearmanr(
1321-
a=self.dataset.data[feature_name],
1322-
b=self.dataset.data[outcome_label], nan_policy='omit')
1331+
# Both continuous -> Spearman correlation (non-parametric)
1332+
res = spearmanr(x, y, nan_policy="omit")
13231333
c, p = res.statistic, res.pvalue
13241334
p_val = p
13251335
test_stat = c
13261336
test_name = "Spearman Correlation"
1337+
13271338
except Exception as e:
13281339
logging.error(e)
1329-
raise Exception("scipy error, check if you've install correct version of scipy")
1340+
raise Exception("scipy error, check if you've installed correct version of scipy") from e
1341+
13301342
return p_val, test_stat, test_name
13311343

1344+
13321345
def univariate_plots(self, sorted_p_list=None, top_features=20):
13331346
"""
13341347
Checks whether p-value of each feature is less than or equal to significance cutoff.

streamline/tests/test_classification.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ def test_classification():
2727
os.mkdir(output_path)
2828

2929
eda = DataProcessRunner(dataset_path, output_path, experiment_name,
30-
exclude_eda_output=['correlation'],
3130
outcome_label="Class", instance_label="InstanceID", n_splits=3, ignore_features=None,
3231
categorical_features=['Gender', 'Symptoms ', 'Alcohol', 'Hepatitis B Surface Antigen',
3332
'Hepatitis B e Antigen', 'Hepatitis B Core Antibody',

0 commit comments

Comments
 (0)