Skip to content

Commit 7dccaeb

Browse files
author
miranov25
committed
Fix profile export and add scale mode for 10^7 benchmarks
Profile fix:
- benchmark_materialize_aliases.py was using the wrong parameter name
- Changed profile_output to profile_text + profile_binary
- Profiles are now generated correctly with --full mode

Scale mode:
- Add --scale flag to benchmark_composite_keys_rdf.py
- Add --large flag to run_benchmark.sh
- Tests 10^5, 10^6, 10^7 rows for performance characterization
- Default and quick modes unchanged
1 parent 44b20ec commit 7dccaeb

4 files changed

Lines changed: 77 additions & 11 deletions

File tree

UTILS/dfextensions/AliasDataFrame/benchmarks/benchmark_composite_keys_rdf.py

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,16 @@
1010
1111
Usage:
1212
python benchmark_composite_keys_rdf.py --json results.json # Required: JSON output
13-
python benchmark_composite_keys_rdf.py --quick --json results.json # Quick mode
13+
python benchmark_composite_keys_rdf.py --quick --json results.json # Quick mode (10^4 rows)
14+
python benchmark_composite_keys_rdf.py --scale --json results.json # Scale mode (10^7 rows)
1415
python benchmark_composite_keys_rdf.py --profile --json results.json # With profiling
1516
python benchmark_composite_keys_rdf.py --quiet --json results.json # Minimal output
1617
18+
Modes:
19+
--quick: 10^4 rows (fast, for CI)
20+
default: 10^5, 10^6 rows
21+
--scale: 10^5, 10^6, 10^7 rows (slow, for performance characterization)
22+
1723
Exit Codes:
1824
0 - All benchmarks completed (passed or skipped)
1925
1 - Fatal error
@@ -96,6 +102,13 @@
96102
'tmemfile_friend': 100_000,
97103
}
98104

105+
SCALE_SIZES = {
106+
'dense': [100_000, 1_000_000, 10_000_000],
107+
'sparse': [100_000, 1_000_000, 10_000_000],
108+
'tmemfile_main': 10_000_000,
109+
'tmemfile_friend': 1_000_000,
110+
}
111+
99112
# TPC-like key ranges (realistic ALICE calibration)
100113
KEY_COLUMNS = ['k0', 'k1', 'k2']
101114
MAX_VALUES = [2, 152, 25] # side, row, drift25
@@ -453,23 +466,44 @@ def benchmark_rdf_query(tmemfile_result, verbose=True):
453466
# Main Runner
454467
# =============================================================================
455468

456-
def run_all_benchmarks(quick_mode=False, verbose=True, profile=False, results_dir=None):
469+
def run_all_benchmarks(quick_mode=False, scale_mode=False, verbose=True, profile=False, results_dir=None):
457470
"""
458471
Run all benchmark scenarios.
459472
473+
Parameters
474+
----------
475+
quick_mode : bool
476+
Use smaller data sizes (10^4 rows)
477+
scale_mode : bool
478+
Use larger data sizes (10^5, 10^6, 10^7 rows)
479+
verbose : bool
480+
Print progress information
481+
profile : bool
482+
Enable cProfile profiling
483+
results_dir : str
484+
Directory for profile output files
485+
460486
Returns dict with results for each scenario.
461487
"""
462488
np.random.seed(RNG_SEED)
463489

464-
sizes = QUICK_SIZES if quick_mode else DEFAULT_SIZES
490+
# Select configuration based on mode
491+
if scale_mode:
492+
sizes = SCALE_SIZES
493+
mode_name = 'scale'
494+
elif quick_mode:
495+
sizes = QUICK_SIZES
496+
mode_name = 'quick'
497+
else:
498+
sizes = DEFAULT_SIZES
499+
mode_name = 'default'
465500

466501
results = {}
467502
total_start = time.perf_counter()
468503

469504
if verbose:
470-
mode = 'quick' if quick_mode else 'default'
471505
print(f"\n{'='*60}")
472-
print(f"Composite Keys / RDF Benchmark ({mode} mode)")
506+
print(f"Composite Keys / RDF Benchmark ({mode_name} mode)")
473507
print(f"{'='*60}")
474508

475509
# Scenario A: Dense Key Generation
@@ -671,8 +705,14 @@ def main():
671705
Examples:
672706
python benchmark_composite_keys_rdf.py --json results.json
673707
python benchmark_composite_keys_rdf.py --quick --json results.json
708+
python benchmark_composite_keys_rdf.py --scale --json results.json
674709
python benchmark_composite_keys_rdf.py --profile --json results.json
675710
711+
Modes:
712+
--quick: 10^4 rows (fast, for CI)
713+
default: 10^5, 10^6 rows
714+
--scale: 10^5, 10^6, 10^7 rows (slow, for performance characterization)
715+
676716
Scenarios:
677717
A. dense_generation: compute_composite_key_dense() performance
678718
B. sparse_generation: compute_composite_key_sparse() performance
@@ -683,7 +723,9 @@ def main():
683723
parser.add_argument('--json', type=str, required=True, metavar='FILE',
684724
help='Export results to JSON file (required)')
685725
parser.add_argument('--quick', action='store_true',
686-
help='Quick mode: smaller data sizes')
726+
help='Quick mode: 10^4 rows (fast)')
727+
parser.add_argument('--scale', action='store_true',
728+
help='Scale mode: 10^5, 10^6, 10^7 rows (slow)')
687729
parser.add_argument('--quiet', action='store_true',
688730
help='Minimal output')
689731
parser.add_argument('--profile', action='store_true',
@@ -692,14 +734,22 @@ def main():
692734
args = parser.parse_args()
693735

694736
verbose = not args.quiet
695-
mode = 'quick' if args.quick else 'default'
737+
738+
# Determine mode name
739+
if args.scale:
740+
mode = 'scale'
741+
elif args.quick:
742+
mode = 'quick'
743+
else:
744+
mode = 'default'
696745

697746
# Determine results directory for profiling
698747
results_dir = os.path.dirname(args.json) or 'results'
699748

700749
# Run benchmarks
701750
results = run_all_benchmarks(
702751
quick_mode=args.quick,
752+
scale_mode=args.scale,
703753
verbose=verbose,
704754
profile=args.profile,
705755
results_dir=results_dir if args.profile else None,

UTILS/dfextensions/AliasDataFrame/benchmarks/benchmark_materialize_aliases.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -604,12 +604,16 @@ def run_scenario_simple(df_main, verbose=True, profile=False, profile_output=Non
604604
print(f" Targets: {targets}")
605605

606606
def do_materialize():
607+
# Generate both text and binary profile paths
608+
profile_text_path = profile_output if profile_output else None
609+
profile_binary_path = profile_output.replace('.txt', '.prof') if profile_output else None
607610
adf.materialize_aliases(
608611
names=targets,
609612
with_dependencies=True,
610613
cleanTemporary=True,
611614
profile=profile,
612-
profile_text=profile_output,
615+
profile_text=profile_text_path,
616+
profile_binary=profile_binary_path,
613617
)
614618

615619
result = measure_materialize(do_materialize, adf)
@@ -690,12 +694,16 @@ def run_scenario_subframe(df_main, df_subframe, fill_mode, verbose=True,
690694
print(f" Fill mode: {fill_mode}")
691695

692696
def do_materialize():
697+
# Generate both text and binary profile paths
698+
profile_text_path = profile_output if profile_output else None
699+
profile_binary_path = profile_output.replace('.txt', '.prof') if profile_output else None
693700
adf.materialize_aliases(
694701
names=targets,
695702
with_dependencies=True,
696703
cleanTemporary=True,
697704
profile=profile,
698-
profile_text=profile_output,
705+
profile_text=profile_text_path,
706+
profile_binary=profile_binary_path,
699707
)
700708

701709
result = measure_materialize(do_materialize, adf)

UTILS/dfextensions/AliasDataFrame/benchmarks/run_benchmark.sh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ THRESHOLD=20
4949
BASELINE_FILE="${SCRIPT_DIR}/baseline.json"
5050
PROFILE_FLAG=""
5151
FULL_FLAG=""
52+
SCALE_FLAG=""
5253

5354
# Results tracking
5455
declare -a BENCHMARK_NAMES
@@ -164,6 +165,11 @@ while [[ $# -gt 0 ]]; do
164165
VERBOSE=true # Full mode should show all output
165166
shift
166167
;;
168+
--large)
169+
SCALE_FLAG="--scale"
170+
VERBOSE=true # Large mode should show output
171+
shift
172+
;;
167173
--save-baseline)
168174
SAVE_BASELINE=true
169175
shift
@@ -195,6 +201,7 @@ while [[ $# -gt 0 ]]; do
195201
echo " --verbose, -v Show detailed output"
196202
echo " --profile Save profiler output (.prof and .txt) for analysis"
197203
echo " --full Full analysis: verbose + profiling + baseline comparison + history"
204+
echo " --large Run large scale benchmarks (10^7 rows, slow)"
198205
echo " --output DIR Output directory (default: benchmarks/results)"
199206
echo ""
200207
echo "Regression Detection:"
@@ -558,11 +565,11 @@ COMPOSITE_JSON="${OUTPUT_DIR}/benchmark_composite_keys_rdf_${TIMESTAMP}.json"
558565
START_TIME=$(get_time)
559566

560567
if [[ "$VERBOSE" = true ]]; then
561-
OUTPUT=$(python3 "${SCRIPT_DIR}/benchmark_composite_keys_rdf.py" $QUICK_MODE $PROFILE_FLAG --json "$COMPOSITE_JSON" 2>&1)
568+
OUTPUT=$(python3 "${SCRIPT_DIR}/benchmark_composite_keys_rdf.py" $QUICK_MODE $SCALE_FLAG $PROFILE_FLAG --json "$COMPOSITE_JSON" 2>&1)
562569
COMP_STATUS=$?
563570
echo "$OUTPUT"
564571
else
565-
OUTPUT=$(python3 "${SCRIPT_DIR}/benchmark_composite_keys_rdf.py" $QUICK_MODE --json "$COMPOSITE_JSON" --quiet 2>&1)
572+
OUTPUT=$(python3 "${SCRIPT_DIR}/benchmark_composite_keys_rdf.py" $QUICK_MODE $SCALE_FLAG --json "$COMPOSITE_JSON" --quiet 2>&1)
566573
COMP_STATUS=$?
567574
fi
568575

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
files/

0 commit comments

Comments
 (0)