feat(benchmarks): Complete benchmark infrastructure with history and profiling

miranov25 · miranov25 · commit d6f983a1e6f7 · 2025-11-30T21:14:41.000+01:00
Add comprehensive benchmark infrastructure for performance tracking:

    - Row count configuration: quick=500K, default=1M, full=2M rows
    - Profile naming: bench_<component>_<scenario>_<timestamp>_<commit>.prof
    - History archiving: Every run archived with git commit info
    - Diff command: Compare arbitrary history files with threshold detection
    - History analysis: DataFrame utilities (long/wide format) for custom queries

    New files:
    - history_analysis.py: Load history into pandas DataFrames

    Modified files:
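    - benchmark_materialize_aliases.py: --full flag, profile naming, row counts
    - baseline_utils.py: diff command, get_git_info()
    - run_benchmark.sh: --full flag passthrough
    - README.md: Documentation for new features
    - AliasDataFrame.py: _run_with_profiling also saves a binary .prof next to the text profile
    - baseline.json: Refreshed baseline timings and metrics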

    Usage:
      ./run_benchmark.sh --full              # Full analysis with profiling
      python baseline_utils.py diff A.json B.json  # Compare runs
      python history_analysis.py list results/history/  # List metrics

    Part of benchmark infrastructure for Phase 3 join optimization.
diff --git a/UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py b/UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py
@@ -1657,8 +1657,18 @@ def _run_with_profiling(self, func, profile=False, profile_output=None):
             
             if profile_output:
                 from pathlib import Path
+                
+                # Save binary .prof for programmatic analysis (pstats, snakeviz)
+                if profile_output.endswith('.txt'):
+                    prof_path = profile_output[:-4] + '.prof'
+                else:
+                    prof_path = profile_output + '.prof'
+                profiler.dump_stats(prof_path)
+                print(f"[profiler] Binary profile saved to: {prof_path}")
+                
+                # Save text for human reading
                 Path(profile_output).write_text(output)
-                print(f"[profiler] Results saved to: {profile_output}")
+                print(f"[profiler] Text profile saved to: {profile_output}")
             else:
                 print(output)
         
diff --git a/UTILS/dfextensions/AliasDataFrame/benchmarks/baseline.json b/UTILS/dfextensions/AliasDataFrame/benchmarks/baseline.json
@@ -1,24 +1,24 @@
 {
   "version": 1,
-  "created": "2025-11-30T20:59:30.057647",
+  "created": "2025-11-30T21:10:30.196534",
   "host": "Marians-MBP-3.fritz.box",
   "python_version": "3.9.6",
   "cpu_count": 12,
   "platform": "macOS-14.5-arm64-arm-64bit",
   "benchmarks": {
     "benchmark_materialize_aliases.py": {
-      "time_s": 6.602904751,
+      "time_s": 2.745856375,
       "metrics": {
-        "direct_vs_safe_speedup": 1.1213459329029052,
-        "safe_vs_simple_ratio": 33.48148131583073
+        "direct_vs_safe_speedup": 1.255547595408744,
+        "safe_vs_simple_ratio": 67.37643196035532
       }
     },
     "benchmark_parallel.py": {
       "time_s": null,
       "metrics": {}
     },
     "benchmark_performance.py": {
-      "time_s": 0.057923543000000466,
+      "time_s": 0.0549549170000001,
       "metrics": {
         "all_passed": 1
       }