Skip to content

Commit 12372b4

Browse files
author
miranov25
committed
feat(benchmarks): Add roofline analysis for efficiency metrics
Add theoretical limit measurements to quantify optimization potential:
- measure_memory_bandwidth(): Raw numpy.copy() speed (absolute floor)
- measure_numpy_indexing(): NumPy advanced indexing (ideal join target)
- Efficiency calculation: theoretical_time / actual_time

Results show ~1% efficiency for subframe scenarios, indicating ~100x overhead vs. the theoretical NumPy indexing limit. This quantifies the optimization opportunity for join caching improvements.

New output sections:
- "Measuring Theoretical Limits" in verbose output
- "EFFICIENCY" table in summary
- theoretical_limits and efficiency in JSON output

Part of Phase 3 benchmark infrastructure.
1 parent d6f983a commit 12372b4

3 files changed

Lines changed: 239 additions & 1 deletion

File tree

UTILS/dfextensions/AliasDataFrame/benchmarks/README.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,44 @@ SPEEDUP METRICS
366366
| `safe_vs_simple_ratio` | Subframe join overhead | ~40-50x |
367367
| `missing_pct` | Percentage of missing join keys | 15.2% |
368368

369+
### Roofline Analysis (Efficiency Metrics)
370+
371+
The benchmark measures theoretical performance limits and calculates efficiency:
372+
373+
| Metric | Description |
374+
|--------|-------------|
375+
| `memory_bandwidth` | Raw numpy.copy() speed (absolute floor) |
376+
| `numpy_indexing_join` | NumPy advanced indexing (ideal join target) |
377+
| `efficiency` | `theoretical_time / actual_time` (higher = better, max = 100%) |
378+
379+
**Interpreting efficiency:**
380+
- **>50%**: Near optimal, limited optimization potential
381+
- **10-50%**: Room for optimization
382+
- **<10%**: Significant framework overhead, investigate
383+
384+
Example output:
385+
```
386+
============================================================
387+
EFFICIENCY (vs Theoretical Limits)
388+
============================================================
389+
Memory bandwidth: 15.2 GB/s
390+
NumPy indexing: 0.0080s (8 cols × 1,000,000 rows)
391+
392+
Scenario Time Limit Efficiency
393+
----------------------------------------------
394+
simple 0.019s 0.0020s 10.5%
395+
safe 0.767s 0.0080s 1.0%
396+
direct 0.706s 0.0080s 1.1%
397+
----------------------------------------------
398+
399+
Interpretation:
400+
>50% : Near optimal
401+
10-50%: Room for optimization
402+
<10% : Significant overhead (investigate)
403+
```
404+
405+
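The efficiency values show how close each scenario is to its theoretical limit: `simple` is compared against the memory-bandwidth time, while `safe` and `direct` are compared against the NumPy indexing join time. Low efficiency in safe/direct modes indicates that join overhead dominates, which is the target for Phase 3 optimization.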
406+
369407
### Interpreting Results
370408

371409
**Subframe Overhead (safe_vs_simple):**

UTILS/dfextensions/AliasDataFrame/benchmarks/bench_out/.gitignore

Lines changed: 0 additions & 1 deletion
This file was deleted.

UTILS/dfextensions/AliasDataFrame/benchmarks/benchmark_materialize_aliases.py

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,95 @@
6262
ROW_TOTAL = 190
6363

6464

65+
# =============================================================================
66+
# Roofline Analysis - Theoretical Limits
67+
# =============================================================================
68+
69+
def measure_memory_bandwidth(n_rows, n_iterations=10, dtype=np.float32):
    """
    Measure theoretical memory bandwidth limit.

    This is the absolute floor for any operation that reads and writes data.
    Uses numpy.copy() on a 1-D array as the baseline memory operation.

    Parameters
    ----------
    n_rows : int
        Number of rows (should match benchmark scenario)
    n_iterations : int
        Number of timed copies; the reported time is their mean. Must be >= 1.
    dtype : numpy dtype
        Data type (should match benchmark)

    Returns
    -------
    dict
        time_s : mean seconds per copy, clamped to a minimum of 1e-9
        bandwidth_gbps : bytes_processed / time_s / 1e9, i.e. gigabytes per
            second (10**9 bytes) despite the ``gbps`` suffix
        bytes_processed : bytes moved per copy, counted as read + write
            (2 * array size in bytes)

    Raises
    ------
    ValueError
        If n_iterations is smaller than 1.
    """
    # Zero iterations would divide by zero below; a negative count would give
    # a negative mean that the clamp silently turns into a bogus bandwidth.
    if n_iterations < 1:
        raise ValueError(f"n_iterations must be >= 1, got {n_iterations}")

    arr = np.random.randn(n_rows).astype(dtype)

    # Warm up cache
    _ = arr.copy()

    # Collect garbage before timing so a collection is less likely to land
    # inside the measured loop.
    gc.collect()
    t0 = time.perf_counter()
    for _ in range(n_iterations):
        out = arr.copy()  # result is intentionally unused; only the copy cost matters
    elapsed = (time.perf_counter() - t0) / n_iterations

    # Guard against a zero reading from the timer on very small inputs.
    time_s = max(elapsed, 1e-9)
    bytes_processed = arr.nbytes * 2  # read + write

    return {
        'time_s': time_s,
        'bandwidth_gbps': bytes_processed / time_s / 1e9,
        'bytes_processed': bytes_processed,
    }
108+
109+
110+
def measure_numpy_indexing(n_rows, n_cols, n_iterations=10, dtype=np.float32,
                           subframe_size=1000):
    """
    Measure theoretical join limit using NumPy advanced indexing.

    This simulates the best-case join: pre-computed index lookup.
    A random (subframe_size, n_cols) table is gathered with n_rows random
    row indices, and the mean time of that gather is reported.
    This is the target for optimized join caching.

    Parameters
    ----------
    n_rows : int
        Number of rows in main DataFrame (number of index lookups)
    n_cols : int
        Number of columns to fetch (MUST match scenario column count)
    n_iterations : int
        Number of timed gathers; the reported time is their mean. Must be >= 1.
    dtype : numpy dtype
        Data type
    subframe_size : int
        Number of rows (unique keys) in the simulated subframe. Defaults to
        1000, intended to resemble a real calibration table. Must be >= 1.

    Returns
    -------
    dict
        time_s : mean seconds per gather, clamped to a minimum of 1e-9
        rows : n_rows as passed in
        cols : n_cols as passed in
        rows_per_sec : n_rows / time_s

    Raises
    ------
    ValueError
        If n_iterations or subframe_size is smaller than 1.
    """
    # Zero iterations would divide by zero below; a negative count would give
    # a negative mean that the clamp silently turns into a bogus throughput.
    if n_iterations < 1:
        raise ValueError(f"n_iterations must be >= 1, got {n_iterations}")
    if subframe_size < 1:
        raise ValueError(f"subframe_size must be >= 1, got {subframe_size}")

    # Simulated join: every main-frame row points at a random subframe row.
    indices = np.random.randint(0, subframe_size, size=n_rows)
    subframe_data = np.random.randn(subframe_size, n_cols).astype(dtype)

    # Warm up
    _ = subframe_data[indices]

    # Collect garbage before timing so a collection is less likely to land
    # inside the measured loop.
    gc.collect()
    t0 = time.perf_counter()
    for _ in range(n_iterations):
        result = subframe_data[indices]  # result is intentionally unused
    elapsed = (time.perf_counter() - t0) / n_iterations

    # Guard against zero so the value is safe to divide by.
    time_s = max(elapsed, 1e-9)

    return {
        'time_s': time_s,
        'rows': n_rows,
        'cols': n_cols,
        'rows_per_sec': n_rows / time_s,
    }
152+
153+
65154
# =============================================================================
66155
# Synthetic Data Generation
67156
# =============================================================================
@@ -556,6 +645,30 @@ def run_all_benchmarks(n_rows, verbose=True, profile=False, results_dir=None):
556645
pct_missing = 100.0 * n_missing / len(df_main)
557646
print(f" Expected missing: {pct_missing:.1f}% (row > {ROW_MAX_WITH_CALIBRATION})")
558647

648+
# =========================================================================
649+
# Measure Theoretical Limits (Roofline Analysis)
650+
# =========================================================================
651+
if verbose:
652+
print("\n--- Measuring Theoretical Limits ---")
653+
654+
# Memory bandwidth (single column baseline)
655+
bandwidth_result = measure_memory_bandwidth(n_rows)
656+
if verbose:
657+
print(f" Memory bandwidth: {bandwidth_result['bandwidth_gbps']:.1f} GB/s")
658+
659+
# NumPy indexing for simple scenario (1 output column)
660+
simple_cols = 1
661+
numpy_simple = measure_numpy_indexing(n_rows, n_cols=simple_cols)
662+
if verbose:
663+
print(f" NumPy indexing ({simple_cols} col): {numpy_simple['time_s']:.4f}s")
664+
665+
# NumPy indexing for safe/direct scenarios
666+
# Match the number of subframe columns fetched (8 calibration coefficients)
667+
subframe_cols = 8
668+
numpy_join = measure_numpy_indexing(n_rows, n_cols=subframe_cols)
669+
if verbose:
670+
print(f" NumPy indexing ({subframe_cols} cols): {numpy_join['time_s']:.4f}s")
671+
559672
results = {}
560673

561674
# Scenario 1: Simple (no subframe)
@@ -596,6 +709,38 @@ def run_all_benchmarks(n_rows, verbose=True, profile=False, results_dir=None):
596709
results['direct']['time_s'] / results['simple']['time_s']
597710
)
598711

712+
# =========================================================================
713+
# Store Theoretical Limits and Calculate Efficiency
714+
# =========================================================================
715+
results['theoretical_limits'] = {
716+
'memory_bandwidth': bandwidth_result,
717+
'numpy_indexing_simple': numpy_simple,
718+
'numpy_indexing_join': numpy_join,
719+
}
720+
721+
# Calculate efficiency: theoretical_time / actual_time
722+
# Higher is better, max is 1.0 (100%)
723+
efficiency = {}
724+
725+
# Simple scenario vs memory bandwidth
726+
if results['simple']['time_s'] > 0:
727+
efficiency['simple_vs_bandwidth'] = (
728+
bandwidth_result['time_s'] / results['simple']['time_s']
729+
)
730+
731+
# Safe/direct scenarios vs NumPy join (the achievable target)
732+
if results['safe']['time_s'] > 0:
733+
efficiency['safe_vs_numpy_join'] = (
734+
numpy_join['time_s'] / results['safe']['time_s']
735+
)
736+
737+
if results['direct']['time_s'] > 0:
738+
efficiency['direct_vs_numpy_join'] = (
739+
numpy_join['time_s'] / results['direct']['time_s']
740+
)
741+
742+
results['efficiency'] = efficiency
743+
599744
return results
600745

601746

@@ -640,6 +785,53 @@ def print_summary(results, n_rows):
640785
if 'missing_keys_pct' in results['safe']:
641786
print(f"\n Missing keys: {results['safe']['missing_keys_pct']:.1f}%")
642787

788+
# =========================================================================
789+
# Efficiency (Roofline Analysis)
790+
# =========================================================================
791+
limits = results.get('theoretical_limits', {})
792+
efficiency = results.get('efficiency', {})
793+
794+
if limits and efficiency:
795+
print("\n" + "=" * 60)
796+
print("EFFICIENCY (vs Theoretical Limits)")
797+
print("=" * 60)
798+
799+
if limits.get('memory_bandwidth'):
800+
bw = limits['memory_bandwidth']
801+
print(f"Memory bandwidth: {bw['bandwidth_gbps']:.1f} GB/s")
802+
803+
if limits.get('numpy_indexing_join'):
804+
nj = limits['numpy_indexing_join']
805+
print(f"NumPy indexing: {nj['time_s']:.4f}s ({nj['cols']} cols × {nj['rows']:,} rows)")
806+
807+
print()
808+
print(f"{'Scenario':<12} {'Time':>10} {'Limit':>10} {'Efficiency':>12}")
809+
print("-" * 46)
810+
811+
# Simple vs bandwidth
812+
simple_time = results['simple']['time_s']
813+
bw_time = limits.get('memory_bandwidth', {}).get('time_s', 0)
814+
simple_eff = efficiency.get('simple_vs_bandwidth', 0) * 100
815+
print(f"{'simple':<12} {simple_time:>9.3f}s {bw_time:>9.4f}s {simple_eff:>11.1f}%")
816+
817+
# Safe vs numpy join
818+
safe_time = results['safe']['time_s']
819+
join_time = limits.get('numpy_indexing_join', {}).get('time_s', 0)
820+
safe_eff = efficiency.get('safe_vs_numpy_join', 0) * 100
821+
print(f"{'safe':<12} {safe_time:>9.3f}s {join_time:>9.4f}s {safe_eff:>11.1f}%")
822+
823+
# Direct vs numpy join
824+
direct_time = results['direct']['time_s']
825+
direct_eff = efficiency.get('direct_vs_numpy_join', 0) * 100
826+
print(f"{'direct':<12} {direct_time:>9.3f}s {join_time:>9.4f}s {direct_eff:>11.1f}%")
827+
828+
print("-" * 46)
829+
print()
830+
print("Interpretation:")
831+
print(" >50% : Near optimal")
832+
print(" 10-50%: Room for optimization")
833+
print(" <10% : Significant overhead (investigate)")
834+
643835
print("=" * 60)
644836

645837

@@ -680,6 +872,13 @@ def export_json(results, filepath, n_rows, mode):
680872
'direct_vs_simple_ratio': results.get('direct_vs_simple_ratio'),
681873
}
682874

875+
# Add efficiency metrics if available
876+
efficiency = results.get('efficiency', {})
877+
if efficiency:
878+
metrics['simple_efficiency'] = efficiency.get('simple_vs_bandwidth')
879+
metrics['safe_efficiency'] = efficiency.get('safe_vs_numpy_join')
880+
metrics['direct_efficiency'] = efficiency.get('direct_vs_numpy_join')
881+
683882
output = {
684883
'benchmark': 'benchmark_materialize_aliases.py',
685884
'timestamp': datetime.now().isoformat(),
@@ -695,6 +894,8 @@ def export_json(results, filepath, n_rows, mode):
695894
'safe': results['safe'],
696895
'direct': results['direct'],
697896
},
897+
'theoretical_limits': results.get('theoretical_limits', {}),
898+
'efficiency': results.get('efficiency', {}),
698899
}
699900

700901
with open(filepath, 'w') as f:

0 commit comments

Comments
 (0)