|
7 | 7 | import polars as pl |
8 | 8 |
|
9 | 9 | from diffly import compare_frames |
| 10 | +from diffly._conditions import condition_equal_columns |
| 11 | +from diffly._utils import ( |
| 12 | + ABS_TOL_DEFAULT, |
| 13 | + ABS_TOL_TEMPORAL_DEFAULT, |
| 14 | + REL_TOL_DEFAULT, |
| 15 | + Side, |
| 16 | +) |
10 | 17 |
|
11 | 18 |
|
12 | 19 | def test_summary_lazyframe_not_slower_than_dataframe() -> None: |
@@ -74,3 +81,53 @@ def expensive_computation(col: pl.Expr) -> pl.Expr: |
74 | 81 | f"({mean_time_lf:.3f}s vs {mean_time_df:.3f}s). " |
75 | 82 | f"This suggests unnecessary re-collection of LazyFrames." |
76 | 83 | ) |
| 84 | + |
| 85 | + |
def test_element_wise_comparison_slower_than_eq_missing_for_list_columns() -> None:
    """Benchmark plain ``eq_missing()`` against ``condition_equal_columns()``.

    Two identical columns holding lists of Python ints (an inner type that
    needs no tolerance handling) are compared both ways. The test passes only
    when the element-wise ``_compare_sequence_columns()`` path reached through
    ``condition_equal_columns()`` is, on average over the measured runs, more
    than twice as slow as ``eq_missing()``.
    """
    n_rows = 500_000
    list_len = 20
    num_runs_warmup = 2
    num_runs_measured = 10
    total_runs = num_runs_warmup + num_runs_measured

    left_name = f"val_{Side.LEFT}"
    right_name = f"val_{Side.RIGHT}"
    df = pl.DataFrame(
        {
            name: [list(range(list_len)) for _ in range(n_rows)]
            for name in (left_name, right_name)
        }
    )

    def run_eq_missing() -> None:
        # Baseline: a single null-aware equality over the two list columns.
        expr = pl.col(left_name).eq_missing(pl.col(right_name))
        df.select(expr).to_series()

    def run_condition() -> None:
        # Candidate: the condition expression built with the default tolerances.
        expr = condition_equal_columns(
            column="val",
            dtype_left=df.schema[left_name],
            dtype_right=df.schema[right_name],
            max_list_length=list_len,
            abs_tol=ABS_TOL_DEFAULT,
            rel_tol=REL_TOL_DEFAULT,
            abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
        )
        df.select(expr).to_series()

    def elapsed(action) -> float:
        # Wall-clock seconds spent inside a single call of ``action``.
        began = time.perf_counter()
        action()
        return time.perf_counter() - began

    # The two candidates are timed alternately within each run.
    eq_durations = []
    cond_durations = []
    for _ in range(total_runs):
        eq_durations.append(elapsed(run_eq_missing))
        cond_durations.append(elapsed(run_condition))

    # The first ``num_runs_warmup`` samples are discarded before averaging.
    mean_time_eq = statistics.mean(eq_durations[num_runs_warmup:])
    mean_time_cond = statistics.mean(cond_durations[num_runs_warmup:])

    ratio = mean_time_cond / mean_time_eq
    assert ratio > 2.0, (
        f"Element-wise comparison was only {ratio:.1f}x slower than eq_missing "
        f"({mean_time_cond:.3f}s vs {mean_time_eq:.3f}s). "
        f"Expected at least 2x slowdown to justify the optimization."
    )
0 commit comments