Skip to content

Commit ff8439c

Browse files
test: Benchmark slowdown of element-wise list comparison (#25)
1 parent cccfad4 commit ff8439c

File tree

1 file changed

+57
-0
lines changed

1 file changed

+57
-0
lines changed

tests/test_performance.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@
77
import polars as pl
88

99
from diffly import compare_frames
10+
from diffly._conditions import condition_equal_columns
11+
from diffly._utils import (
12+
ABS_TOL_DEFAULT,
13+
ABS_TOL_TEMPORAL_DEFAULT,
14+
REL_TOL_DEFAULT,
15+
Side,
16+
)
1017

1118

1219
def test_summary_lazyframe_not_slower_than_dataframe() -> None:
@@ -74,3 +81,53 @@ def expensive_computation(col: pl.Expr) -> pl.Expr:
7481
f"({mean_time_lf:.3f}s vs {mean_time_df:.3f}s). "
7582
f"This suggests unnecessary re-collection of LazyFrames."
7683
)
84+
85+
86+
def test_element_wise_comparison_slower_than_eq_missing_for_list_columns() -> None:
    """Benchmark ``condition_equal_columns()`` against a plain ``eq_missing()``.

    A frame with two identical integer-list columns is compared in two ways:
    directly via ``eq_missing()`` and via the expression returned by
    ``condition_equal_columns()`` (called with the package's default tolerance
    constants). Both variants are timed back to back in every iteration; the
    warm-up iterations are discarded and the mean timings are compared. The
    test passes only if the ``condition_equal_columns()`` variant is on
    average more than twice as slow, which is the justification for
    preferring ``eq_missing()`` on list columns with non-tolerance inner types.
    """
    row_count = 500_000
    items_per_list = 20
    measured_runs = 10
    warmup_runs = 2

    left_name = f"val_{Side.LEFT}"
    right_name = f"val_{Side.RIGHT}"
    frame = pl.DataFrame(
        {
            left_name: [list(range(items_per_list)) for _ in range(row_count)],
            right_name: [list(range(items_per_list)) for _ in range(row_count)],
        }
    )

    def time_eq_missing() -> float:
        # Wall-clock duration of one direct eq_missing() comparison.
        started = time.perf_counter()
        frame.select(pl.col(left_name).eq_missing(pl.col(right_name))).to_series()
        return time.perf_counter() - started

    def time_condition() -> float:
        # Wall-clock duration of one condition_equal_columns() comparison;
        # building the expression is deliberately part of the timed region.
        started = time.perf_counter()
        frame.select(
            condition_equal_columns(
                column="val",
                dtype_left=frame.schema[left_name],
                dtype_right=frame.schema[right_name],
                max_list_length=items_per_list,
                abs_tol=ABS_TOL_DEFAULT,
                rel_tol=REL_TOL_DEFAULT,
                abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
            )
        ).to_series()
        return time.perf_counter() - started

    eq_durations = []
    cond_durations = []
    # Interleave both variants so each iteration measures them under
    # comparable system conditions.
    for _ in range(warmup_runs + measured_runs):
        eq_durations.append(time_eq_missing())
        cond_durations.append(time_condition())

    # Drop the warm-up iterations before averaging.
    mean_time_eq = statistics.mean(eq_durations[warmup_runs:])
    mean_time_cond = statistics.mean(cond_durations[warmup_runs:])

    ratio = mean_time_cond / mean_time_eq
    assert ratio > 2.0, (
        f"Element-wise comparison was only {ratio:.1f}x slower than eq_missing "
        f"({mean_time_cond:.3f}s vs {mean_time_eq:.3f}s). "
        f"Expected at least 2x slowdown to justify the optimization."
    )

0 commit comments

Comments
 (0)