Fix profile export and add scale mode for 10^7 benchmarks

miranov25 · miranov25 · commit 7dccaeb7f0fd · 2025-12-06T22:39:07.000+01:00
Profile fix:
- benchmark_materialize_aliases.py was using wrong parameter name
- profile_output is now forwarded as profile_text plus a derived profile_binary path ('.txt' replaced by '.prof')
- Profiles now generated correctly with --full mode

Scale mode:
- Add --scale flag to benchmark_composite_keys_rdf.py
- Add --large flag to run_benchmark.sh (forwards --scale to benchmark_composite_keys_rdf.py and turns on verbose output)
- Tests 10^5, 10^6, 10^7 rows for performance characterization
- Default and quick modes unchanged
diff --git a/UTILS/dfextensions/AliasDataFrame/benchmarks/benchmark_composite_keys_rdf.py b/UTILS/dfextensions/AliasDataFrame/benchmarks/benchmark_composite_keys_rdf.py
@@ -10,10 +10,16 @@
 
 Usage:
     python benchmark_composite_keys_rdf.py --json results.json           # Required: JSON output
-    python benchmark_composite_keys_rdf.py --quick --json results.json   # Quick mode
+    python benchmark_composite_keys_rdf.py --quick --json results.json   # Quick mode (10^4 rows)
+    python benchmark_composite_keys_rdf.py --scale --json results.json   # Scale mode (10^7 rows)
     python benchmark_composite_keys_rdf.py --profile --json results.json # With profiling
     python benchmark_composite_keys_rdf.py --quiet --json results.json   # Minimal output
 
+Modes:
+    --quick: 10^4 rows (fast, for CI)
+    default: 10^5, 10^6 rows
+    --scale: 10^5, 10^6, 10^7 rows (slow, for performance characterization)
+
 Exit Codes:
     0 - All benchmarks completed (passed or skipped)
     1 - Fatal error
@@ -96,6 +102,13 @@
     'tmemfile_friend': 100_000,
 }
 
+SCALE_SIZES = {
+    'dense': [100_000, 1_000_000, 10_000_000],
+    'sparse': [100_000, 1_000_000, 10_000_000],
+    'tmemfile_main': 10_000_000,
+    'tmemfile_friend': 1_000_000,
+}
+
 # TPC-like key ranges (realistic ALICE calibration)
 KEY_COLUMNS = ['k0', 'k1', 'k2']
 MAX_VALUES = [2, 152, 25]  # side, row, drift25
@@ -453,23 +466,44 @@ def benchmark_rdf_query(tmemfile_result, verbose=True):
 # Main Runner
 # =============================================================================
 
-def run_all_benchmarks(quick_mode=False, verbose=True, profile=False, results_dir=None):
+def run_all_benchmarks(quick_mode=False, scale_mode=False, verbose=True, profile=False, results_dir=None):
     """
     Run all benchmark scenarios.
     
+    Parameters
+    ----------
+    quick_mode : bool
+        Use smaller data sizes (10^4 rows)
+    scale_mode : bool
+        Use larger data sizes (10^5, 10^6, 10^7 rows)
+    verbose : bool
+        Print progress information
+    profile : bool
+        Enable cProfile profiling
+    results_dir : str
+        Directory for profile output files
+    
     Returns dict with results for each scenario.
     """
     np.random.seed(RNG_SEED)
     
-    sizes = QUICK_SIZES if quick_mode else DEFAULT_SIZES
+    # Select configuration based on mode
+    if scale_mode:
+        sizes = SCALE_SIZES
+        mode_name = 'scale'
+    elif quick_mode:
+        sizes = QUICK_SIZES
+        mode_name = 'quick'
+    else:
+        sizes = DEFAULT_SIZES
+        mode_name = 'default'
     
     results = {}
     total_start = time.perf_counter()
     
     if verbose:
-        mode = 'quick' if quick_mode else 'default'
         print(f"\n{'='*60}")
-        print(f"Composite Keys / RDF Benchmark ({mode} mode)")
+        print(f"Composite Keys / RDF Benchmark ({mode_name} mode)")
         print(f"{'='*60}")
     
     # Scenario A: Dense Key Generation
@@ -671,8 +705,14 @@ def main():
 Examples:
     python benchmark_composite_keys_rdf.py --json results.json
     python benchmark_composite_keys_rdf.py --quick --json results.json
+    python benchmark_composite_keys_rdf.py --scale --json results.json
     python benchmark_composite_keys_rdf.py --profile --json results.json
 
+Modes:
+    --quick: 10^4 rows (fast, for CI)
+    default: 10^5, 10^6 rows
+    --scale: 10^5, 10^6, 10^7 rows (slow, for performance characterization)
+
 Scenarios:
     A. dense_generation:  compute_composite_key_dense() performance
     B. sparse_generation: compute_composite_key_sparse() performance
@@ -683,7 +723,9 @@ def main():
     parser.add_argument('--json', type=str, required=True, metavar='FILE',
                         help='Export results to JSON file (required)')
     parser.add_argument('--quick', action='store_true',
-                        help='Quick mode: smaller data sizes')
+                        help='Quick mode: 10^4 rows (fast)')
+    parser.add_argument('--scale', action='store_true',
+                        help='Scale mode: 10^5, 10^6, 10^7 rows (slow)')
     parser.add_argument('--quiet', action='store_true',
                         help='Minimal output')
     parser.add_argument('--profile', action='store_true',
@@ -692,14 +734,22 @@ def main():
     args = parser.parse_args()
     
     verbose = not args.quiet
-    mode = 'quick' if args.quick else 'default'
+    
+    # Determine mode name
+    if args.scale:
+        mode = 'scale'
+    elif args.quick:
+        mode = 'quick'
+    else:
+        mode = 'default'
     
     # Determine results directory for profiling
     results_dir = os.path.dirname(args.json) or 'results'
     
     # Run benchmarks
     results = run_all_benchmarks(
         quick_mode=args.quick,
+        scale_mode=args.scale,
         verbose=verbose,
         profile=args.profile,
         results_dir=results_dir if args.profile else None,
diff --git a/UTILS/dfextensions/AliasDataFrame/benchmarks/benchmark_materialize_aliases.py b/UTILS/dfextensions/AliasDataFrame/benchmarks/benchmark_materialize_aliases.py
@@ -604,12 +604,16 @@ def run_scenario_simple(df_main, verbose=True, profile=False, profile_output=Non
         print(f"  Targets: {targets}")
     
     def do_materialize():
+        # Generate both text and binary profile paths
+        profile_text_path = profile_output if profile_output else None
+        profile_binary_path = profile_output.replace('.txt', '.prof') if profile_output else None
         adf.materialize_aliases(
             names=targets,
             with_dependencies=True,
             cleanTemporary=True,
             profile=profile,
-            profile_text=profile_output,
+            profile_text=profile_text_path,
+            profile_binary=profile_binary_path,
         )
     
     result = measure_materialize(do_materialize, adf)
@@ -690,12 +694,16 @@ def run_scenario_subframe(df_main, df_subframe, fill_mode, verbose=True,
         print(f"  Fill mode: {fill_mode}")
     
     def do_materialize():
+        # Generate both text and binary profile paths
+        profile_text_path = profile_output if profile_output else None
+        profile_binary_path = profile_output.replace('.txt', '.prof') if profile_output else None
         adf.materialize_aliases(
             names=targets,
             with_dependencies=True,
             cleanTemporary=True,
             profile=profile,
-            profile_text=profile_output,
+            profile_text=profile_text_path,
+            profile_binary=profile_binary_path,
         )
     
     result = measure_materialize(do_materialize, adf)
diff --git a/UTILS/dfextensions/AliasDataFrame/benchmarks/run_benchmark.sh b/UTILS/dfextensions/AliasDataFrame/benchmarks/run_benchmark.sh
@@ -49,6 +49,7 @@ THRESHOLD=20
 BASELINE_FILE="${SCRIPT_DIR}/baseline.json"
 PROFILE_FLAG=""
 FULL_FLAG=""
+SCALE_FLAG=""
 
 # Results tracking
 declare -a BENCHMARK_NAMES
@@ -164,6 +165,11 @@ while [[ $# -gt 0 ]]; do
             VERBOSE=true  # Full mode should show all output
             shift
             ;;
+        --large)
+            SCALE_FLAG="--scale"
+            VERBOSE=true  # Large mode should show output
+            shift
+            ;;
         --save-baseline)
             SAVE_BASELINE=true
             shift
@@ -195,6 +201,7 @@ while [[ $# -gt 0 ]]; do
             echo "  --verbose, -v      Show detailed output"
             echo "  --profile          Save profiler output (.prof and .txt) for analysis"
             echo "  --full             Full analysis: verbose + profiling + baseline comparison + history"
+            echo "  --large            Run large scale benchmarks (10^7 rows, slow)"
             echo "  --output DIR       Output directory (default: benchmarks/results)"
             echo ""
             echo "Regression Detection:"
@@ -558,11 +565,11 @@ COMPOSITE_JSON="${OUTPUT_DIR}/benchmark_composite_keys_rdf_${TIMESTAMP}.json"
 START_TIME=$(get_time)
 
 if [[ "$VERBOSE" = true ]]; then
-    OUTPUT=$(python3 "${SCRIPT_DIR}/benchmark_composite_keys_rdf.py" $QUICK_MODE $PROFILE_FLAG --json "$COMPOSITE_JSON" 2>&1)
+    OUTPUT=$(python3 "${SCRIPT_DIR}/benchmark_composite_keys_rdf.py" $QUICK_MODE $SCALE_FLAG $PROFILE_FLAG --json "$COMPOSITE_JSON" 2>&1)
     COMP_STATUS=$?
     echo "$OUTPUT"
 else
-    OUTPUT=$(python3 "${SCRIPT_DIR}/benchmark_composite_keys_rdf.py" $QUICK_MODE --json "$COMPOSITE_JSON" --quiet 2>&1)
+    OUTPUT=$(python3 "${SCRIPT_DIR}/benchmark_composite_keys_rdf.py" $QUICK_MODE $SCALE_FLAG --json "$COMPOSITE_JSON" --quiet 2>&1)
     COMP_STATUS=$?
 fi
 
diff --git a/UTILS/dfextensions/RDataFrameDSL/.gitignore b/UTILS/dfextensions/RDataFrameDSL/.gitignore
@@ -0,0 +1 @@
+files/