6262ROW_TOTAL = 190
6363
6464
65+ # =============================================================================
66+ # Roofline Analysis - Theoretical Limits
67+ # =============================================================================
68+
def measure_memory_bandwidth(n_rows, n_iterations=10, dtype=np.float32):
    """
    Measure the achieved memory-copy bandwidth for a 1-D array.

    Times ``ndarray.copy()`` on an array of ``n_rows`` elements and reports
    the average time per copy. This is an empirical baseline for an
    operation that reads and writes every element once; each timed copy
    also allocates its destination array, so allocation cost is included.

    Parameters
    ----------
    n_rows : int
        Number of elements in the array (should match benchmark scenario)
    n_iterations : int
        Number of timed copies to average over; must be >= 1
    dtype : numpy dtype
        Data type (should match benchmark)

    Returns
    -------
    dict : {time_s, bandwidth_gbps, bytes_processed}
        ``time_s`` is the mean seconds per copy (floored at 1e-9),
        ``bytes_processed`` counts the array once for the read and once for
        the write, and ``bandwidth_gbps`` is bytes_processed / time_s / 1e9.

    Raises
    ------
    ValueError
        If ``n_iterations`` is smaller than 1.
    """
    # Without this guard, 0 iterations divides by zero and a negative count
    # silently produces a clamped, meaningless timing.
    if n_iterations < 1:
        raise ValueError(f"n_iterations must be >= 1, got {n_iterations}")

    arr = np.random.randn(n_rows).astype(dtype)

    # Untimed warm-up copy so first-touch effects stay out of the timed loop
    arr.copy()

    gc.collect()
    t0 = time.perf_counter()
    for _ in range(n_iterations):
        # Keep the result bound so the previous copy is released only after
        # the next one is allocated, as in the original measurement.
        out = arr.copy()
    elapsed = (time.perf_counter() - t0) / n_iterations

    # Floor the time once so 'time_s' and 'bandwidth_gbps' always agree
    time_s = max(elapsed, 1e-9)
    bytes_processed = arr.nbytes * 2  # read + write

    return {
        'time_s': time_s,
        'bandwidth_gbps': bytes_processed / time_s / 1e9,
        'bytes_processed': bytes_processed,
    }
108+
109+
def measure_numpy_indexing(n_rows, n_cols, n_iterations=10, dtype=np.float32,
                           subframe_size=1000):
    """
    Measure a best-case join time using NumPy advanced indexing.

    Builds a random ``(subframe_size, n_cols)`` lookup table and a random
    integer index of length ``n_rows``, then times ``table[index]``. This
    simulates a join whose key-to-row mapping is already computed, and is
    the target for optimized join caching.

    Parameters
    ----------
    n_rows : int
        Number of rows in main DataFrame (length of the index array)
    n_cols : int
        Number of columns to fetch (MUST match scenario column count)
    n_iterations : int
        Number of timed lookups to average over; must be >= 1
    dtype : numpy dtype
        Data type of the lookup table
    subframe_size : int
        Number of rows in the simulated lookup table; must be >= 1.
        Defaults to 1000 (similar to real calibration table).

    Returns
    -------
    dict : {time_s, rows, cols, rows_per_sec}
        ``time_s`` is the mean seconds per lookup (floored at 1e-9) and
        ``rows_per_sec`` is n_rows / time_s.

    Raises
    ------
    ValueError
        If ``n_iterations`` or ``subframe_size`` is smaller than 1.
    """
    # Without this guard, 0 iterations divides by zero and a negative count
    # silently produces a clamped, meaningless timing.
    if n_iterations < 1:
        raise ValueError(f"n_iterations must be >= 1, got {n_iterations}")
    if subframe_size < 1:
        raise ValueError(f"subframe_size must be >= 1, got {subframe_size}")

    # Random keys into the table, one per main-frame row
    indices = np.random.randint(0, subframe_size, size=n_rows)
    subframe_data = np.random.randn(subframe_size, n_cols).astype(dtype)

    # Untimed warm-up lookup
    subframe_data[indices]

    gc.collect()
    t0 = time.perf_counter()
    for _ in range(n_iterations):
        # Keep the result bound so the previous output is released only
        # after the next one is allocated, as in the original measurement.
        result = subframe_data[indices]
    elapsed = (time.perf_counter() - t0) / n_iterations

    # Floor the time once so 'time_s' and 'rows_per_sec' always agree
    time_s = max(elapsed, 1e-9)

    return {
        'time_s': time_s,
        'rows': n_rows,
        'cols': n_cols,
        'rows_per_sec': n_rows / time_s,
    }
152+
153+
65154# =============================================================================
66155# Synthetic Data Generation
67156# =============================================================================
@@ -556,6 +645,30 @@ def run_all_benchmarks(n_rows, verbose=True, profile=False, results_dir=None):
556645 pct_missing = 100.0 * n_missing / len (df_main )
557646 print (f" Expected missing: { pct_missing :.1f} % (row > { ROW_MAX_WITH_CALIBRATION } )" )
558647
648+ # =========================================================================
649+ # Measure Theoretical Limits (Roofline Analysis)
650+ # =========================================================================
651+ if verbose :
652+ print ("\n --- Measuring Theoretical Limits ---" )
653+
654+ # Memory bandwidth (single column baseline)
655+ bandwidth_result = measure_memory_bandwidth (n_rows )
656+ if verbose :
657+ print (f" Memory bandwidth: { bandwidth_result ['bandwidth_gbps' ]:.1f} GB/s" )
658+
659+ # NumPy indexing for simple scenario (1 output column)
660+ simple_cols = 1
661+ numpy_simple = measure_numpy_indexing (n_rows , n_cols = simple_cols )
662+ if verbose :
663+ print (f" NumPy indexing ({ simple_cols } col): { numpy_simple ['time_s' ]:.4f} s" )
664+
665+ # NumPy indexing for safe/direct scenarios
666+ # Match the number of subframe columns fetched (8 calibration coefficients)
667+ subframe_cols = 8
668+ numpy_join = measure_numpy_indexing (n_rows , n_cols = subframe_cols )
669+ if verbose :
670+ print (f" NumPy indexing ({ subframe_cols } cols): { numpy_join ['time_s' ]:.4f} s" )
671+
559672 results = {}
560673
561674 # Scenario 1: Simple (no subframe)
@@ -596,6 +709,38 @@ def run_all_benchmarks(n_rows, verbose=True, profile=False, results_dir=None):
596709 results ['direct' ]['time_s' ] / results ['simple' ]['time_s' ]
597710 )
598711
712+ # =========================================================================
713+ # Store Theoretical Limits and Calculate Efficiency
714+ # =========================================================================
715+ results ['theoretical_limits' ] = {
716+ 'memory_bandwidth' : bandwidth_result ,
717+ 'numpy_indexing_simple' : numpy_simple ,
718+ 'numpy_indexing_join' : numpy_join ,
719+ }
720+
721+ # Calculate efficiency: theoretical_time / actual_time
722+ # Higher is better, max is 1.0 (100%)
723+ efficiency = {}
724+
725+ # Simple scenario vs memory bandwidth
726+ if results ['simple' ]['time_s' ] > 0 :
727+ efficiency ['simple_vs_bandwidth' ] = (
728+ bandwidth_result ['time_s' ] / results ['simple' ]['time_s' ]
729+ )
730+
731+ # Safe/direct scenarios vs NumPy join (the achievable target)
732+ if results ['safe' ]['time_s' ] > 0 :
733+ efficiency ['safe_vs_numpy_join' ] = (
734+ numpy_join ['time_s' ] / results ['safe' ]['time_s' ]
735+ )
736+
737+ if results ['direct' ]['time_s' ] > 0 :
738+ efficiency ['direct_vs_numpy_join' ] = (
739+ numpy_join ['time_s' ] / results ['direct' ]['time_s' ]
740+ )
741+
742+ results ['efficiency' ] = efficiency
743+
599744 return results
600745
601746
@@ -640,6 +785,53 @@ def print_summary(results, n_rows):
640785 if 'missing_keys_pct' in results ['safe' ]:
641786 print (f"\n Missing keys: { results ['safe' ]['missing_keys_pct' ]:.1f} %" )
642787
788+ # =========================================================================
789+ # Efficiency (Roofline Analysis)
790+ # =========================================================================
791+ limits = results .get ('theoretical_limits' , {})
792+ efficiency = results .get ('efficiency' , {})
793+
794+ if limits and efficiency :
795+ print ("\n " + "=" * 60 )
796+ print ("EFFICIENCY (vs Theoretical Limits)" )
797+ print ("=" * 60 )
798+
799+ if limits .get ('memory_bandwidth' ):
800+ bw = limits ['memory_bandwidth' ]
801+ print (f"Memory bandwidth: { bw ['bandwidth_gbps' ]:.1f} GB/s" )
802+
803+ if limits .get ('numpy_indexing_join' ):
804+ nj = limits ['numpy_indexing_join' ]
805+ print (f"NumPy indexing: { nj ['time_s' ]:.4f} s ({ nj ['cols' ]} cols × { nj ['rows' ]:,} rows)" )
806+
807+ print ()
808+ print (f"{ 'Scenario' :<12} { 'Time' :>10} { 'Limit' :>10} { 'Efficiency' :>12} " )
809+ print ("-" * 46 )
810+
811+ # Simple vs bandwidth
812+ simple_time = results ['simple' ]['time_s' ]
813+ bw_time = limits .get ('memory_bandwidth' , {}).get ('time_s' , 0 )
814+ simple_eff = efficiency .get ('simple_vs_bandwidth' , 0 ) * 100
815+ print (f"{ 'simple' :<12} { simple_time :>9.3f} s { bw_time :>9.4f} s { simple_eff :>11.1f} %" )
816+
817+ # Safe vs numpy join
818+ safe_time = results ['safe' ]['time_s' ]
819+ join_time = limits .get ('numpy_indexing_join' , {}).get ('time_s' , 0 )
820+ safe_eff = efficiency .get ('safe_vs_numpy_join' , 0 ) * 100
821+ print (f"{ 'safe' :<12} { safe_time :>9.3f} s { join_time :>9.4f} s { safe_eff :>11.1f} %" )
822+
823+ # Direct vs numpy join
824+ direct_time = results ['direct' ]['time_s' ]
825+ direct_eff = efficiency .get ('direct_vs_numpy_join' , 0 ) * 100
826+ print (f"{ 'direct' :<12} { direct_time :>9.3f} s { join_time :>9.4f} s { direct_eff :>11.1f} %" )
827+
828+ print ("-" * 46 )
829+ print ()
830+ print ("Interpretation:" )
831+ print (" >50% : Near optimal" )
832+ print (" 10-50%: Room for optimization" )
833+ print (" <10% : Significant overhead (investigate)" )
834+
643835 print ("=" * 60 )
644836
645837
@@ -680,6 +872,13 @@ def export_json(results, filepath, n_rows, mode):
680872 'direct_vs_simple_ratio' : results .get ('direct_vs_simple_ratio' ),
681873 }
682874
875+ # Add efficiency metrics if available
876+ efficiency = results .get ('efficiency' , {})
877+ if efficiency :
878+ metrics ['simple_efficiency' ] = efficiency .get ('simple_vs_bandwidth' )
879+ metrics ['safe_efficiency' ] = efficiency .get ('safe_vs_numpy_join' )
880+ metrics ['direct_efficiency' ] = efficiency .get ('direct_vs_numpy_join' )
881+
683882 output = {
684883 'benchmark' : 'benchmark_materialize_aliases.py' ,
685884 'timestamp' : datetime .now ().isoformat (),
@@ -695,6 +894,8 @@ def export_json(results, filepath, n_rows, mode):
695894 'safe' : results ['safe' ],
696895 'direct' : results ['direct' ],
697896 },
897+ 'theoretical_limits' : results .get ('theoretical_limits' , {}),
898+ 'efficiency' : results .get ('efficiency' , {}),
698899 }
699900
700901 with open (filepath , 'w' ) as f :
0 commit comments