Skip to content

Commit 02a6614

Browse files
committed
Further DataFrame-compatibility fixes to work with Pingouin
1 parent 46653aa commit 02a6614

5 files changed

Lines changed: 289 additions & 4 deletions

File tree

datamatrix/_datamatrix/_dataframe_compat.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -219,6 +219,7 @@ class DataFrameCompatMixin:
219219
''')
220220
result = compare_classes(DataMatrix, pd.DataFrame)
221221
result['iterrows'] = 'function'
222+
result['_get_numeric_data'] = 'function'
222223
result['index'] = 'property'
223224
result['__dataframe__'] = 'function'
224225
for attr, attr_type in result.items():
@@ -228,5 +229,6 @@ class SeriesCompatMixin:
228229
''')
229230
result = compare_classes(BaseColumn, pd.Series)
230231
result['index'] = 'property'
232+
result['_get_numeric_data'] = 'function'
231233
for attr, attr_type in result.items():
232234
file.write(f" {attr} = df_compat_{attr_type}('{attr}')\n")

datamatrix/_datamatrix/_dataframe_compat_mixin.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -202,6 +202,7 @@ class DataFrameCompatMixin:
202202
where = df_compat_function('where')
203203
xs = df_compat_function('xs')
204204
iterrows = df_compat_function('iterrows')
205+
_get_numeric_data = df_compat_function('_get_numeric_data')
205206
index = df_compat_property('index')
206207
__dataframe__ = df_compat_function('__dataframe__')
207208

@@ -404,3 +405,4 @@ class SeriesCompatMixin:
404405
where = df_compat_function('where')
405406
xs = df_compat_function('xs')
406407
index = df_compat_property('index')
408+
_get_numeric_data = df_compat_function('_get_numeric_data')

datamatrix/_datamatrix/_datamatrix.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
from ._dataframe_compat import df_compat_function
2424
from ._basecolumn import BaseColumn
2525
from ._mixedcolumn import MixedColumn
26+
from ._where_index import WhereIndex
2627
from ._index import Index
2728
from ._uninstantiatedcolumn import UninstantiatedColumn
2829
try:
@@ -708,10 +709,11 @@ def _where(self, other_dm):
708709
raise ValueError(
709710
'Can only slice a DataMatrix with a subset of itself')
710711
try:
711-
return [self._rowid.index(rowid) for rowid in other_dm._rowid]
712+
indices = [self._rowid.index(rowid) for rowid in other_dm._rowid]
712713
except KeyError:
713714
raise ValueError(
714715
'Can only slice a DataMatrix with a subset of itself')
716+
return WhereIndex(self, indices)
715717

716718
# Implemented syntax
717719

datamatrix/_datamatrix/_numericcolumn.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ class NumericColumn(BaseColumn):
4646
directly.
4747
"""
4848

49-
dtype = float
49+
dtype = np.dtype(float)
5050
invalid = nan
5151

5252
def __init__(self, datamatrix, **kwargs):
@@ -220,7 +220,7 @@ def _getdatamatrixkey(self, key):
220220

221221
def _getintkey(self, key):
222222

223-
return self.dtype(self._seq[key])
223+
return self.dtype.type(self._seq[key])
224224

225225
def _getrowidkey(self, key, dm=None):
226226

@@ -307,7 +307,7 @@ class IntColumn(NumericColumn):
307307
A column of numeric int values. Does not support invalid values.
308308
"""
309309

310-
dtype = int
310+
dtype = np.dtype(int)
311311
invalid = 0
312312

313313
def _tosequence(self, value, length=None):
Lines changed: 279 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,279 @@
1+
import numpy as np
2+
from matplotlib import pyplot as plt
3+
import pandas as pd
4+
from datamatrix import DataMatrix
5+
import pingouin as pg
6+
7+
# Set the backend to Agg for headless operation
8+
# Non-interactive backend: the tests never need to open a window.
plt.switch_backend('Agg')
9+
10+
# ------------------------------------------------------------------
11+
# Fixtures / helpers
12+
# ------------------------------------------------------------------
13+
14+
# Container classes under test; every test function loops over both so the
# same Pingouin call is exercised with a pandas DataFrame and a DataMatrix.
TEST_CLASSES = pd.DataFrame, DataMatrix
15+
16+
def _make_sample(cls, design='between', n_subjects=32, seed=42):
    """Build a reproducible sample dataset and wrap it in ``cls``.

    Parameters
    ----------
    cls : callable
        Constructor called with a dict that maps column names to
        equal-length arrays (the tests pass ``pd.DataFrame`` and
        ``DataMatrix``).
    design : str
        One of ``'between'``, ``'within'``, ``'mixed'`` or
        ``'correlation'``; selects which columns are generated.
    n_subjects : int
        Controls the number of rows. ``'between'`` needs an even value and
        ``'mixed'`` a multiple of 4, otherwise the generated columns would
        differ in length.
    seed : int
        Seed passed to ``np.random.seed`` so that repeated calls return
        the same numbers.

    Returns
    -------
    object
        ``cls(data)``.

    Raises
    ------
    ValueError
        If ``design`` is not recognised, or ``n_subjects`` cannot be split
        evenly for the requested design.
    """
    # Validate up front: previously an unknown design fell through every
    # branch and failed with an UnboundLocalError on ``data``.
    if design not in ('between', 'within', 'mixed', 'correlation'):
        raise ValueError(
            f"Unknown design {design!r}; expected 'between', 'within', "
            "'mixed' or 'correlation'")
    if design == 'between' and n_subjects % 2:
        raise ValueError("'between' design requires an even n_subjects")
    if design == 'mixed' and n_subjects % 4:
        raise ValueError(
            "'mixed' design requires n_subjects to be a multiple of 4")

    np.random.seed(seed)

    # NOTE: the order of the np.random calls below is significant; it
    # determines which values each column receives for a given seed.
    if design == 'between':
        # Between-subjects design: two groups of equal size.
        half = n_subjects // 2
        data = {
            'subject': np.arange(n_subjects),
            'group': np.repeat(['A', 'B'], half),
            'score': np.concatenate([
                np.random.normal(100, 15, half),
                np.random.normal(110, 15, half),
            ]),
            'age': np.random.normal(25, 5, n_subjects),
            'treatment': np.tile(['placebo', 'drug'], half),
        }
    elif design == 'within':
        # Within-subjects design: three time points per subject.
        n_obs = n_subjects // 3
        data = {
            'subject': np.repeat(np.arange(n_obs), 3),
            'time': np.tile(['T1', 'T2', 'T3'], n_obs),
            'score': np.concatenate([
                np.random.normal(100, 10, n_obs),
                np.random.normal(105, 10, n_obs),
                np.random.normal(110, 10, n_obs),
            ]) + np.random.normal(0, 5, n_obs * 3),
        }
    elif design == 'mixed':
        # Mixed design: every subject measured pre and post, split in two
        # groups.
        n_obs = n_subjects // 2
        groups = np.repeat(['control', 'treatment'], n_obs // 2)
        data = {
            'subject': np.tile(np.arange(n_obs), 2),
            'time': np.repeat(['pre', 'post'], n_obs),
            'group': np.concatenate([groups] * 2),
            'score': np.concatenate([
                np.random.normal(100, 10, n_obs),
                np.random.normal(110, 10, n_obs),
            ]),
        }
    else:
        # Correlation design. The initial draws for 'y' and 'z' are
        # overwritten below but are kept so the random stream (and hence
        # the final values) stays the same as before.
        data = {
            'x': np.random.normal(100, 15, n_subjects),
            'y': np.random.normal(100, 15, n_subjects),
            'z': np.random.normal(100, 15, n_subjects),
        }
        # Make y depend on x, and z depend on both x and y.
        data['y'] = data['x'] * 0.7 + np.random.normal(0, 10, n_subjects)
        data['z'] = (data['x'] * 0.3 + data['y'] * 0.4
                     + np.random.normal(0, 10, n_subjects))

    return cls(data)
68+
69+
# ------------------------------------------------------------------
70+
# T-test Tests
71+
# ------------------------------------------------------------------
72+
73+
def test_ttest_one_sample():
    """One-sample t-test of the 'score' column against 100.

    Runs for every class in TEST_CLASSES and checks that the result
    contains 'T' and 'p_val'.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='between')
        stats = pg.ttest(sample['score'], 100)
        for expected in ('T', 'p_val'):
            assert expected in stats
80+
81+
def test_ttest_independent():
    """Unpaired t-test between the scores of group 'A' and group 'B'.

    Runs for every class in TEST_CLASSES and checks that the result
    contains 'T' and 'p_val'.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='between')
        scores_a = sample[sample['group'] == 'A']['score']
        scores_b = sample[sample['group'] == 'B']['score']
        stats = pg.ttest(scores_a, scores_b, paired=False)
        for expected in ('T', 'p_val'):
            assert expected in stats
90+
91+
def test_ttest_paired():
    """Paired t-test between the 'T1' and 'T2' scores.

    Runs for every class in TEST_CLASSES and checks that the result
    contains 'T' and 'p_val'.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='within')
        first = sample[sample['time'] == 'T1']['score']
        second = sample[sample['time'] == 'T2']['score']
        # A paired test needs equally long inputs, so trim to the shorter.
        n_pairs = min(len(first), len(second))
        stats = pg.ttest(first[:n_pairs], second[:n_pairs], paired=True)
        for expected in ('T', 'p_val'):
            assert expected in stats
102+
103+
# ------------------------------------------------------------------
104+
# ANOVA Tests
105+
# ------------------------------------------------------------------
106+
107+
def test_anova_oneway():
    """One-way ANOVA of 'score' with 'group' as between-subjects factor.

    Runs for every class in TEST_CLASSES and checks that the result has
    'F' and 'p_unc' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='between')
        table = pg.anova(data=sample, dv='score', between='group')
        for expected in ('F', 'p_unc'):
            assert expected in table.columns
114+
115+
def test_rm_anova():
    """Repeated-measures ANOVA of 'score' with 'time' as within factor.

    Runs for every class in TEST_CLASSES and checks that the result has
    'F' and 'p_unc' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='within')
        table = pg.rm_anova(
            data=sample, dv='score', within='time', subject='subject')
        for expected in ('F', 'p_unc'):
            assert expected in table.columns
122+
123+
def test_mixed_anova():
    """Mixed ANOVA of 'score' with 'time' (within) and 'group' (between).

    Runs for every class in TEST_CLASSES and checks that the result has
    'F' and 'p_unc' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='mixed')
        anova_kwargs = dict(dv='score', within='time', between='group',
                            subject='subject')
        table = pg.mixed_anova(data=sample, **anova_kwargs)
        for expected in ('F', 'p_unc'):
            assert expected in table.columns
131+
132+
# ------------------------------------------------------------------
133+
# Correlation Tests
134+
# ------------------------------------------------------------------
135+
136+
def test_correlation():
    """Pearson and Spearman correlation between columns 'x' and 'y'.

    Runs for every class in TEST_CLASSES. The first result row is unpacked
    into a fixed number of fields (six for Pearson, five for Spearman), and
    the coefficient and p-value are checked to lie in their valid ranges.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='correlation')

        # Pearson: the row must unpack into exactly six values.
        pearson_row = pg.corr(sample['x'], sample['y'], method='pearson').values[0]
        _n, coef, _ci, p_value, _bf, _power = pearson_row
        assert -1 <= coef <= 1
        assert 0 <= p_value <= 1

        # Spearman: the row must unpack into exactly five values.
        spearman_row = pg.corr(sample['x'], sample['y'], method='spearman').values[0]
        _n, coef, _ci, p_value, _power = spearman_row
        assert -1 <= coef <= 1
        assert 0 <= p_value <= 1
150+
151+
def test_pairwise_correlation():
    """Pairwise correlations among the 'x', 'y' and 'z' columns.

    Runs for every class in TEST_CLASSES and checks that the result is
    non-empty and has 'r' and 'p_unc' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='correlation')
        table = pg.pairwise_corr(sample, columns=['x', 'y', 'z'])
        assert len(table) > 0
        for expected in ('r', 'p_unc'):
            assert expected in table.columns
159+
160+
def test_partial_correlation():
    """Partial correlation of 'x' and 'y' with 'z' as covariate.

    Runs for every class in TEST_CLASSES and checks that the result has
    'r' and 'p_val' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='correlation')
        table = pg.partial_corr(data=sample, x='x', y='y', covar='z')
        for expected in ('r', 'p_val'):
            assert expected in table.columns
167+
168+
# ------------------------------------------------------------------
169+
# Non-parametric Tests
170+
# ------------------------------------------------------------------
171+
172+
def test_wilcoxon():
    """Wilcoxon signed-rank test between the 'T1' and 'T2' scores.

    Runs for every class in TEST_CLASSES and checks that the result
    contains 'W_val' and 'p_val'.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='within')
        first = sample[sample['time'] == 'T1']['score']
        second = sample[sample['time'] == 'T2']['score']
        # Trim both inputs to the same length before comparing them.
        n_pairs = min(len(first), len(second))
        stats = pg.wilcoxon(first[:n_pairs], second[:n_pairs])
        for expected in ('W_val', 'p_val'):
            assert expected in stats
182+
183+
def test_mann_whitney():
    """Mann-Whitney U test between the scores of group 'A' and group 'B'.

    Runs for every class in TEST_CLASSES and checks that the result
    contains 'U_val' and 'p_val'.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='between')
        scores_a = sample[sample['group'] == 'A']['score']
        scores_b = sample[sample['group'] == 'B']['score']
        stats = pg.mwu(scores_a, scores_b)
        for expected in ('U_val', 'p_val'):
            assert expected in stats
192+
193+
def test_kruskal():
    """Kruskal-Wallis test of 'score' across the levels of 'group'.

    Runs for every class in TEST_CLASSES and checks that the result has
    'H' and 'p_unc' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='between')
        table = pg.kruskal(data=sample, dv='score', between='group')
        for expected in ('H', 'p_unc'):
            assert expected in table.columns
200+
201+
# ------------------------------------------------------------------
202+
# Regression Tests
203+
# ------------------------------------------------------------------
204+
205+
def test_linear_regression():
    """Linear regression of 'y' on the predictors 'x' and 'z'.

    Runs for every class in TEST_CLASSES and checks that the result has
    'coef' and 'pval' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='correlation')
        predictors = sample[['x', 'z']]
        outcome = sample['y']
        table = pg.linear_regression(predictors, outcome)
        for expected in ('coef', 'pval'):
            assert expected in table.columns
212+
213+
def test_logistic_regression():
    """Logistic regression of a median-split outcome on 'age'.

    Runs for every class in TEST_CLASSES. A binary 'outcome' column is
    derived from 'score' (1 when the score is above the median, else 0) and
    regressed on 'age'; the result must have 'coef' and 'pval' columns.
    """
    for cls in TEST_CLASSES:
        df = _make_sample(cls, design='between')
        # Compute the threshold once instead of once per row.
        median_score = df['score'].median()
        # Build the whole column in a single assignment. The previous
        # row-by-row ``df['outcome'][i] = 1`` is chained assignment, which
        # does not write back to the frame under pandas Copy-on-Write.
        df['outcome'] = [int(score > median_score) for score in df['score']]
        result = pg.logistic_regression(df[['age']], df['outcome'])
        assert 'coef' in result.columns
        assert 'pval' in result.columns
226+
227+
# ------------------------------------------------------------------
228+
# Effect Size Tests
229+
# ------------------------------------------------------------------
230+
231+
def test_effect_size():
    """Cohen's d and Hedges' g between group 'A' and group 'B' scores.

    Runs for every class in TEST_CLASSES and checks that each effect size
    is returned as an int or float.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='between')
        scores_a = sample[sample['group'] == 'A']['score']
        scores_b = sample[sample['group'] == 'B']['score']

        # Cohen's d first, then Hedges' g.
        for effect_type in ('cohen', 'hedges'):
            effect = pg.compute_effsize(scores_a, scores_b, eftype=effect_type)
            assert isinstance(effect, (int, float))
245+
246+
# ------------------------------------------------------------------
247+
# Normality Tests
248+
# ------------------------------------------------------------------
249+
250+
def test_normality():
    """Normality test on the 'score' column.

    Runs for every class in TEST_CLASSES and checks that the result has
    'W' and 'pval' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='between')
        table = pg.normality(sample['score'])
        for expected in ('W', 'pval'):
            assert expected in table.columns
257+
258+
def test_normality_grouped():
    """Normality test on 'score', computed separately per 'group'.

    Runs for every class in TEST_CLASSES and checks that the result has
    one row per group (two) and 'W' and 'pval' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='between')
        table = pg.normality(data=sample, dv='score', group='group')
        # The 'between' sample holds exactly two groups.
        assert len(table) == 2
        for expected in ('W', 'pval'):
            assert expected in table.columns
266+
267+
# ------------------------------------------------------------------
268+
# Post-hoc Tests
269+
# ------------------------------------------------------------------
270+
271+
def test_pairwise_ttests():
    """Bonferroni-adjusted pairwise t-tests of 'score' across 'time'.

    Runs for every class in TEST_CLASSES and checks that the result has
    'T', 'p_unc' and 'p_corr' columns.
    """
    for frame_cls in TEST_CLASSES:
        sample = _make_sample(frame_cls, design='within')
        test_kwargs = dict(dv='score', within='time', subject='subject',
                           padjust='bonf')
        table = pg.pairwise_ttests(data=sample, **test_kwargs)
        for expected in ('T', 'p_unc', 'p_corr'):
            assert expected in table.columns

0 commit comments

Comments
 (0)