Skip to content

Commit 482819f

Browse files
author
miranov25
committed
test(AliasDataFrame): Add Phase 8-9 test coverage
- test_arrow_expression.py: ArrowComputeMapper unit tests (Phase 9a)
- test_arrow_scatter.py: Arrow scatter via pc.take() (Phase 9b)
- test_join_caching.py: Join index caching (Phase 8c)

These tests validate the optimization infrastructure even when the Arrow compute path is disabled.
1 parent 5bb9d66 commit 482819f

3 files changed

Lines changed: 1206 additions & 0 deletions

File tree

Lines changed: 383 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,383 @@
1+
"""
2+
Tests for PyArrow Expression Evaluation (Phase 9c).
3+
4+
This test suite validates the Arrow-accelerated expression evaluation
5+
integrated into AliasDataFrame._eval_in_namespace().
6+
7+
Author: Claude (Coder)
8+
Date: 2025-12-01
9+
"""
10+
11+
import pytest
12+
import numpy as np
13+
import pandas as pd
14+
import time
15+
16+
# Optional PyArrow import
17+
try:
18+
import pyarrow as pa
19+
import pyarrow.compute as pc
20+
PYARROW_AVAILABLE = True
21+
except ImportError:
22+
PYARROW_AVAILABLE = False
23+
pa = None
24+
pc = None
25+
26+
# Skip all tests if PyArrow not available
27+
# Module-wide marker: pytest applies it to every test in this file, so the
# whole suite is skipped when the optional PyArrow import above failed.
pytestmark = pytest.mark.skipif(not PYARROW_AVAILABLE, reason="PyArrow not available")
31+
32+
33+
class TestArrowExpressionEvaluation:
    """Test Arrow-accelerated expression evaluation.

    Each test registers one or more aliases on an AliasDataFrame,
    materializes them, and compares the resulting column against the same
    expression computed directly with NumPy/pandas.
    """

    # Fixed seed so that a failing run can be reproduced exactly.
    _SEED = 20251201

    @staticmethod
    def _alias_dataframe_cls():
        """Return the AliasDataFrame class, trying both import layouts.

        The import is done lazily (inside the tests rather than at module
        import time) to handle path issues.
        """
        try:
            from AliasDataFrame import AliasDataFrame
        except ImportError:
            from AliasDataFrame.AliasDataFrame import AliasDataFrame
        return AliasDataFrame

    @pytest.fixture
    def large_adf(self):
        """Create AliasDataFrame with enough rows to trigger Arrow path."""
        n = 50_000  # Above NUMBA_MIN_ROWS threshold
        rng = np.random.default_rng(self._SEED)
        df = pd.DataFrame({
            # standard_normal() already yields float64 columns.
            'x': rng.standard_normal(n),
            'y': rng.standard_normal(n),
            'z': rng.standard_normal(n),
            'a': rng.integers(0, 100, n).astype(np.int32),
            # 'b' starts at 1 so that 'a / b' never divides by zero.
            'b': rng.integers(1, 10, n).astype(np.int32),
        })
        return self._alias_dataframe_cls()(df)

    @pytest.fixture
    def small_adf(self):
        """Create small AliasDataFrame that won't trigger Arrow path."""
        n = 100  # Below threshold
        rng = np.random.default_rng(self._SEED)
        df = pd.DataFrame({
            'x': rng.standard_normal(n),
            'y': rng.standard_normal(n),
        })
        return self._alias_dataframe_cls()(df)

    def test_arrow_info_includes_compute(self, large_adf):
        """Test that arrow_info includes compute_available flag."""
        info = large_adf.arrow_info
        assert 'compute_available' in info
        assert info['compute_available']
        assert info['available']
        assert info['enabled']

    def test_simple_arithmetic(self, large_adf):
        """Test simple arithmetic expression with Arrow."""
        large_adf.add_alias('sum_xy', 'x + y')
        large_adf.materialize_alias('sum_xy')

        # Verify result
        expected = large_adf.df['x'] + large_adf.df['y']
        np.testing.assert_allclose(
            large_adf.df['sum_xy'].values,
            expected.values,
            rtol=1e-10,
        )

    def test_complex_arithmetic(self, large_adf):
        """Test complex arithmetic expression."""
        large_adf.add_alias('complex', 'x * y + z / 2.0')
        large_adf.materialize_alias('complex')

        expected = large_adf.df['x'] * large_adf.df['y'] + large_adf.df['z'] / 2.0
        np.testing.assert_allclose(
            large_adf.df['complex'].values,
            expected.values,
            rtol=1e-10,
        )

    def test_sqrt_expression(self, large_adf):
        """Test sqrt function."""
        # Use absolute values to avoid sqrt of negative
        large_adf.df['x_abs'] = np.abs(large_adf.df['x'])
        large_adf.add_alias('sqrt_x', 'sqrt(x_abs)')
        large_adf.materialize_alias('sqrt_x')

        # Compare against the very column the alias reads.
        expected = np.sqrt(large_adf.df['x_abs'])
        np.testing.assert_allclose(
            large_adf.df['sqrt_x'].values,
            expected.values,
            rtol=1e-6,
        )

    def test_trig_expression(self, large_adf):
        """Test trigonometric functions."""
        large_adf.add_alias('sin_x', 'sin(x)')
        large_adf.add_alias('cos_y', 'cos(y)')
        large_adf.materialize_alias('sin_x')
        large_adf.materialize_alias('cos_y')

        np.testing.assert_allclose(
            large_adf.df['sin_x'].values,
            np.sin(large_adf.df['x']).values,
            rtol=1e-10,
        )
        np.testing.assert_allclose(
            large_adf.df['cos_y'].values,
            np.cos(large_adf.df['y']).values,
            rtol=1e-10,
        )

    def test_exp_log_expression(self, large_adf):
        """Test exp and log functions."""
        # Use small values to avoid overflow
        large_adf.df['x_small'] = large_adf.df['x'] * 0.1
        # Strictly positive input so that log() is defined everywhere.
        large_adf.df['x_pos'] = np.abs(large_adf.df['x']) + 0.1

        large_adf.add_alias('exp_x', 'exp(x_small)')
        large_adf.add_alias('log_x', 'log(x_pos)')
        large_adf.materialize_alias('exp_x')
        large_adf.materialize_alias('log_x')

        np.testing.assert_allclose(
            large_adf.df['exp_x'].values,
            np.exp(large_adf.df['x_small']).values,
            rtol=1e-6,
        )
        np.testing.assert_allclose(
            large_adf.df['log_x'].values,
            np.log(large_adf.df['x_pos']).values,
            rtol=1e-6,
        )

    def test_power_expression(self, large_adf):
        """Test power operator."""
        large_adf.add_alias('x_squared', 'x ** 2')
        large_adf.add_alias('x_cubed', 'x ** 3')
        large_adf.materialize_alias('x_squared')
        large_adf.materialize_alias('x_cubed')

        np.testing.assert_allclose(
            large_adf.df['x_squared'].values,
            (large_adf.df['x'] ** 2).values,
            rtol=1e-10,
        )
        np.testing.assert_allclose(
            large_adf.df['x_cubed'].values,
            (large_adf.df['x'] ** 3).values,
            rtol=1e-10,
        )

    def test_comparison_expression(self, large_adf):
        """Test comparison operators."""
        large_adf.add_alias('x_gt_y', 'x > y')
        large_adf.materialize_alias('x_gt_y')

        # Cast both sides to bool so the storage dtype of the result column
        # does not matter, only its truth values.
        expected = (large_adf.df['x'] > large_adf.df['y']).astype(bool)
        result = large_adf.df['x_gt_y'].astype(bool)
        np.testing.assert_array_equal(result.values, expected.values)

    def test_division_type_promotion(self, large_adf):
        """Test that integer division produces float results."""
        large_adf.add_alias('a_div_b', 'a / b')
        large_adf.materialize_alias('a_div_b')

        # Result should be float
        assert np.issubdtype(large_adf.df['a_div_b'].dtype, np.floating)

        # Values should be correct
        expected = large_adf.df['a'].astype(float) / large_adf.df['b'].astype(float)
        np.testing.assert_allclose(
            large_adf.df['a_div_b'].values,
            expected.values,
            rtol=1e-10,
        )

    def test_nested_expression(self, large_adf):
        """Test nested function calls."""
        large_adf.df['x_pos'] = np.abs(large_adf.df['x']) + 0.1
        large_adf.df['y_pos'] = np.abs(large_adf.df['y']) + 0.1

        large_adf.add_alias('nested', 'sqrt(x_pos**2 + y_pos**2)')
        large_adf.materialize_alias('nested')

        expected = np.sqrt(large_adf.df['x_pos']**2 + large_adf.df['y_pos']**2)
        np.testing.assert_allclose(
            large_adf.df['nested'].values,
            expected.values,
            rtol=1e-6,
        )

    def test_fallback_for_unsupported(self, large_adf):
        """Test that unsupported expressions fall back to eval()."""
        # 'int' is a type cast function not in ArrowComputeMapper
        large_adf.add_alias('int_x', 'int(x * 10)')

        # Should still work via fallback
        large_adf.materialize_alias('int_x')
        assert 'int_x' in large_adf.df.columns

    def test_small_array_uses_eval(self, small_adf):
        """Test that small arrays use eval() path (below threshold)."""
        small_adf.add_alias('sum_xy', 'x + y')
        small_adf.materialize_alias('sum_xy')

        expected = small_adf.df['x'] + small_adf.df['y']
        np.testing.assert_allclose(
            small_adf.df['sum_xy'].values,
            expected.values,
            rtol=1e-10,
        )

    def test_disabled_arrow(self, large_adf):
        """Test that Arrow can be disabled."""
        alias_dataframe_cls = self._alias_dataframe_cls()

        # Create new ADF with Arrow disabled; copy the frame so the fixture's
        # own data is left untouched.
        adf_no_arrow = alias_dataframe_cls(large_adf.df.copy(), use_arrow=False)
        assert not adf_no_arrow.arrow_info['enabled']

        # Should still work via eval()
        adf_no_arrow.add_alias('sum_xy', 'x + y')
        adf_no_arrow.materialize_alias('sum_xy')

        expected = adf_no_arrow.df['x'] + adf_no_arrow.df['y']
        np.testing.assert_allclose(
            adf_no_arrow.df['sum_xy'].values,
            expected.values,
            rtol=1e-10,
        )

    def test_hyperbolic_functions(self, large_adf):
        """Test hyperbolic functions (require PyArrow >= 14)."""
        # Use small values to avoid overflow
        large_adf.df['x_small'] = large_adf.df['x'] * 0.1

        large_adf.add_alias('sinh_x', 'sinh(x_small)')
        large_adf.add_alias('cosh_x', 'cosh(x_small)')
        large_adf.materialize_alias('sinh_x')
        large_adf.materialize_alias('cosh_x')

        np.testing.assert_allclose(
            large_adf.df['sinh_x'].values,
            np.sinh(large_adf.df['x_small']).values,
            rtol=1e-6,
        )
        np.testing.assert_allclose(
            large_adf.df['cosh_x'].values,
            np.cosh(large_adf.df['x_small']).values,
            rtol=1e-6,
        )
278+
279+
280+
class TestArrowExpressionPerformance:
    """Performance tests for Arrow expression evaluation.

    Each test times only the materialize_alias() calls against a wall-clock
    budget. The materialized columns are then checked against NumPy outside
    the timed region, so a fast but wrong evaluation cannot pass.
    """

    @pytest.fixture
    def perf_adf(self):
        """Create large AliasDataFrame for performance testing."""
        try:
            from AliasDataFrame import AliasDataFrame
        except ImportError:
            from AliasDataFrame.AliasDataFrame import AliasDataFrame

        n = 2_000_000
        # Seeded generator: identical input data on every run.
        rng = np.random.default_rng(20251201)
        df = pd.DataFrame({
            'x': rng.standard_normal(n),
            'y': rng.standard_normal(n),
            'z': rng.standard_normal(n),
        })
        return AliasDataFrame(df)

    def test_expression_speed(self, perf_adf):
        """Test that expression evaluation completes in reasonable time."""
        perf_adf.add_alias('result', 'sqrt(x**2 + y**2 + z**2)')

        t0 = time.perf_counter()
        perf_adf.materialize_alias('result')
        elapsed = time.perf_counter() - t0

        # Should be reasonably fast (< 1 second for 2M rows)
        assert elapsed < 1.0, f"Expression eval took {elapsed:.3f}s, expected < 1.0s"
        print(f"\nExpression eval: 2M rows in {elapsed*1000:.1f}ms")

        # Correctness check, deliberately outside the timed region.
        x = perf_adf.df['x'].values
        y = perf_adf.df['y'].values
        z = perf_adf.df['z'].values
        np.testing.assert_allclose(
            perf_adf.df['result'].values,
            np.sqrt(x**2 + y**2 + z**2),
            rtol=1e-6,
        )

    def test_multiple_expressions_speed(self, perf_adf):
        """Test multiple expression evaluation speed."""
        perf_adf.add_alias('r', 'sqrt(x**2 + y**2)')
        perf_adf.add_alias('theta', 'arctan2(y, x)')
        # 'phi' depends on the alias 'r', which is materialized first below.
        perf_adf.add_alias('phi', 'arctan2(z, r)')

        t0 = time.perf_counter()
        perf_adf.materialize_alias('r')
        perf_adf.materialize_alias('theta')
        perf_adf.materialize_alias('phi')
        elapsed = time.perf_counter() - t0

        # 3 expressions should still be fast
        assert elapsed < 2.0, f"3 expressions took {elapsed:.3f}s, expected < 2.0s"
        print(f"\n3 expressions: 2M rows in {elapsed*1000:.1f}ms")

        # Correctness checks, deliberately outside the timed region.
        x = perf_adf.df['x'].values
        y = perf_adf.df['y'].values
        z = perf_adf.df['z'].values
        expected_r = np.sqrt(x**2 + y**2)
        np.testing.assert_allclose(perf_adf.df['r'].values, expected_r, rtol=1e-6)
        # A small absolute tolerance covers angles that are very close to zero.
        np.testing.assert_allclose(
            perf_adf.df['theta'].values,
            np.arctan2(y, x),
            rtol=1e-6,
            atol=1e-12,
        )
        np.testing.assert_allclose(
            perf_adf.df['phi'].values,
            np.arctan2(z, expected_r),
            rtol=1e-6,
            atol=1e-12,
        )
326+
327+
328+
class TestArrowExpressionEdgeCases:
    """Edge case tests for Arrow expression evaluation.

    The fixture puts special floating-point values in the first four rows of
    'x' (0.0, +inf, -inf, NaN, in that order); the tests address those rows
    by position.
    """

    @pytest.fixture
    def edge_adf(self):
        """Create AliasDataFrame with edge case data."""
        try:
            from AliasDataFrame import AliasDataFrame
        except ImportError:
            from AliasDataFrame.AliasDataFrame import AliasDataFrame

        n = 50_000
        # Seeded generator: identical filler data on every run.
        rng = np.random.default_rng(20251201)
        # Rows 0..3 carry the special values; the remainder is random filler.
        special_x = np.array([0.0, np.inf, -np.inf, np.nan])
        special_y = np.array([1.0, 2.0, 3.0, 4.0])
        df = pd.DataFrame({
            'x': np.concatenate([special_x, rng.standard_normal(n - special_x.size)]),
            'y': np.concatenate([special_y, rng.standard_normal(n - special_y.size)]),
        })
        return AliasDataFrame(df)

    def test_inf_handling(self, edge_adf):
        """Test that inf values are handled correctly."""
        edge_adf.add_alias('x_plus_one', 'x + 1')
        edge_adf.materialize_alias('x_plus_one')

        result = edge_adf.df['x_plus_one']
        # Compare against signed infinities: np.isinf() alone would also
        # accept a result with the wrong sign.
        assert result.iloc[1] == np.inf   # inf + 1 = inf
        assert result.iloc[2] == -np.inf  # -inf + 1 = -inf

    def test_nan_propagation(self, edge_adf):
        """Test that NaN propagates correctly."""
        edge_adf.add_alias('x_times_y', 'x * y')
        edge_adf.materialize_alias('x_times_y')

        result = edge_adf.df['x_times_y']
        assert np.isnan(result.iloc[3])  # nan * anything = nan

    def test_zero_handling(self, edge_adf):
        """Test zero in expressions."""
        edge_adf.add_alias('y_div_x', 'y / x')
        edge_adf.materialize_alias('y_div_x')

        result = edge_adf.df['y_div_x']
        # 1.0 / 0.0 = inf (positive, since both operands are non-negative)
        assert result.iloc[0] == np.inf
377+
378+
379+
# =============================================================================
380+
# If running standalone
381+
# =============================================================================
382+
if __name__ == '__main__':
    # pytest.main() returns the exit code of the test session; propagate it
    # so that running this file directly fails (non-zero status) when a test
    # fails, instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, '-v', '--tb=short']))

0 commit comments

Comments
 (0)