@@ -91,46 +91,104 @@ def _measure_heuristic(func, input_size, iterations):
9191 return None
9292
9393
94+ def _compute_residuals (normalized_times , theoretical ):
95+ """
96+ Compute residuals using least-squares linear regression with intercept.
97+
98+ For constant models (all values equal), uses mean as fit.
99+ For other models, fits t = a * f(n) + b and requires positive slope.
100+
101+ Returns:
102+ list of residuals, or None if model is not applicable.
103+ """
104+ if len (set (theoretical )) == 1 :
105+ # Constant model: best fit is mean of normalized times
106+ mean_time = statistics .fmean (normalized_times )
107+ return [t - mean_time for t in normalized_times ]
108+
109+ # Linear regression with intercept: t = a * f(n) + b
110+ n = len (theoretical )
111+ sum_x = sum (theoretical )
112+ sum_y = sum (normalized_times )
113+ sum_xx = sum (x * x for x in theoretical )
114+ sum_xy = sum (x * y for x , y in zip (theoretical , normalized_times ))
115+
116+ denom = n * sum_xx - sum_x * sum_x
117+ if abs (denom ) < 1e-12 :
118+ return None
119+
120+ a = (n * sum_xy - sum_x * sum_y ) / denom
121+ b = (sum_y - a * sum_x ) / n
122+
123+ # Require positive slope; negative/zero means model doesn't explain growth
124+ if a <= 1e-12 :
125+ return None
126+
127+ return [t - (a * x + b ) for t , x in zip (normalized_times , theoretical )]
128+
129+
94130def detect_complexity (n_values , times ):
95131 """
96- Estimate complexity by comparing RSquared values for different models.
97- Simplified approach: Normalize data and check correlation with theoretical curves.
132+ Estimate complexity by fitting theoretical curves to measured times.
133+
134+ Uses least-squares linear regression (with intercept) to fit each model
135+ curve to the timing data, then selects the model with lowest RMSE.
136+ Prefers simpler models when RMSE values are within 5% of each other.
137+
138+ Returns:
139+ tuple: (complexity_name, rmse) or (None, None) if insufficient data.
98140 """
99141 if len (times ) < 3 :
100- return "Insufficient Data"
142+ return ( None , None )
101143
102- # Normalize times
144+ # Normalize times to reduce numerical effects across models
103145 min_time = min (times )
104- if min_time = = 0 :
146+ if min_time < = 0 :
105147 min_time = 1e-9
106148 normalized_times = [t / min_time for t in times ]
107149
108- models = {
109- "O(1) (Constant)" : [1 for _ in n_values ],
110- "O(log n) (Logarithmic)" : [math .log (n ) if n > 0 else 0 for n in n_values ],
111- "O(n) (Linear)" : list (n_values ),
112- "O(n log n) (Linearithmic)" : [n * math .log (n ) if n > 0 else 0 for n in n_values ],
113- "O(n^2) (Quadratic)" : [n ** 2 for n in n_values ],
150+ models = [
151+ ("O(1) (Constant)" , [1 for _ in n_values ]),
152+ ("O(log n) (Logarithmic)" , [math .log (n ) if n > 0 else 0 for n in n_values ]),
153+ ("O(n) (Linear)" , list (n_values )),
154+ ("O(n log n) (Linearithmic)" , [n * math .log (n ) if n > 0 else 0 for n in n_values ]),
155+ ("O(n^2) (Quadratic)" , [n ** 2 for n in n_values ]),
156+ ]
157+
158+ # Prefer simpler models when scores are effectively tied.
159+ model_priority = {
160+ "O(1) (Constant)" : 0 ,
161+ "O(log n) (Logarithmic)" : 1 ,
162+ "O(n) (Linear)" : 2 ,
163+ "O(n log n) (Linearithmic)" : 3 ,
164+ "O(n^2) (Quadratic)" : 4 ,
114165 }
115166
116167 best_fit = None
117- best_score = - float ("inf" )
168+ best_score = float ("inf" )
118169
119- for name , theoretical in models .items ():
120- # Calculate correlation coefficient (Pearson)
170+ for name , theoretical in models :
121171 try :
122- if len (set (theoretical )) == 1 : # Handle constant case
123- # For constant time, we check variance of times
124- score = 1.0 / (statistics .stdev (normalized_times ) + 1.0 )
125- else :
126- # Correlation between theoretical and actual
127- # Using covariance / (std_dev_x * std_dev_y)
128- correlation = statistics .correlation (theoretical , times )
129- score = correlation
130-
131- if score > best_score :
132- best_score = score
172+ residuals = _compute_residuals (normalized_times , theoretical )
173+ if residuals is None :
174+ continue
175+
176+ rmse = math .sqrt (statistics .fmean (r * r for r in residuals ))
177+
178+ # Use 5% relative epsilon for tie-breaking to handle timing noise
179+ # and prefer simpler models when fits are comparable
180+ relative_eps = 0.05
181+ threshold = relative_eps * best_score if best_score > 0 else 1e-9
182+
183+ if rmse < best_score - threshold :
184+ best_score = rmse
133185 best_fit = name
186+ elif abs (rmse - best_score ) <= threshold :
187+ # Scores are effectively tied; prefer simpler model
188+ current_priority = model_priority [best_fit ] if best_fit else 999
189+ if model_priority [name ] < current_priority :
190+ best_fit = name
191+ best_score = rmse
134192 except statistics .StatisticsError :
135193 continue
136194
@@ -170,10 +228,13 @@ def main():
170228 print (f"{ n :<15} | { t :.6f} " )
171229
172230 if len (times ) == len (n_values ):
173- complexity , score = detect_complexity (n_values , times )
231+ complexity , rmse = detect_complexity (n_values , times )
174232 print ("-" * 35 )
175- print (f"Estimated Complexity: { complexity } " )
176- print (f"Fit Score: { score :.3f} " )
233+ if complexity is None :
234+ print ("Insufficient data to estimate complexity." )
235+ else :
236+ print (f"Estimated Complexity: { complexity } " )
237+ print (f"RMSE: { rmse :.3f} " )
177238
178239
179240if __name__ == "__main__" :
0 commit comments