kompot_scaling/complexity_utils.py at main · settylab/kompot_scaling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
#!/usr/bin/env python3
"""
Complexity Scaling Analysis for Kompot

This module analyzes the runtime and memory scaling of kompot differential expression
across different parameter combinations:
- n_cells: Number of cells (10k to 422k)
- n_genes: Number of genes (200 to 33k)

We examine scaling for:
- No SV: Differential expression WITHOUT sample variance
- SV (disk): Differential expression WITH sample variance (stored on disk)

IMPORTANT NOTES:
- Sample variance makes computation 10-100x slower
- Only configs at 200 genes allow fair comparison between No SV and SV
- Other configs show scaling trends but should NOT be directly compared across modes
- All configs use n_landmarks=5000 (the default value)
- Memory measurements use SLURM MaxRSS (OS-reported peak memory)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from pathlib import Path
import json
import subprocess
from typing import Dict, List, Tuple, Optional
import warnings
# Ignore every warning category process-wide from this point on (this also
# hides warnings raised through warnings.warn by code in this module).
warnings.filterwarnings('ignore')

# Global matplotlib defaults, applied in a single validated update.
matplotlib.rcParams.update({
    # Transparent backgrounds: RGBA white with zero alpha for the figure,
    # the axes and the saved image.
    'figure.facecolor': (1, 1, 1, 0),
    'axes.facecolor': (1, 1, 1, 0),
    'savefig.facecolor': (1, 1, 1, 0),

    # Resolution and text sizes for high-quality output.
    'figure.dpi': 150,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'legend.fontsize': 9,

    # Arial everywhere.
    'font.family': 'Arial',
    'font.sans-serif': ['Arial'],
})


def parse_maxrss(maxrss_str):
    """Parse a SLURM MaxRSS string (e.g. '123456K') into gigabytes.

    Parameters:
    -----------
    maxrss_str : str or None
        Value of the MaxRSS column. A trailing 'K', 'M', 'G', 'T' or 'P' is
        interpreted as a 1024-based unit; a value without one of these
        suffixes is interpreted as a byte count. Surrounding whitespace is
        ignored.

    Returns:
    --------
    float or None
        Size in GB, or None when the input is None, empty or blank.

    Raises:
    -------
    ValueError
        If the numeric part cannot be converted to a float.
    """
    if not maxrss_str:
        return None

    text = maxrss_str.strip()
    if not text:
        return None

    # Power of 1024 that converts each unit to GB (K is two steps below G,
    # P is two steps above).
    gb_exponent = {'K': -2, 'M': -1, 'G': 0, 'T': 1, 'P': 2}

    suffix = text[-1]
    if suffix in gb_exponent:
        return float(text[:-1]) * 1024.0 ** gb_exponent[suffix]

    # No recognised suffix: assume a plain byte count.
    return float(text) / (1024**3)


# SLURM job IDs queried when the caller does not pass ``job_ids``.
_DEFAULT_SLURM_JOB_IDS = (40443845, 40443863)

# File-name prefixes tried (in this order) when a result JSON cannot be
# located through the config's own ``benchmark_set``.
_LEGACY_RESULT_PREFIXES = (
    'landmarks', 'dimensions', 'batching', 'unified',
    'da-landmarks', 'da-dimensions', 'da-batching',
    'nobatch', 'batched', 'final',
)


def _load_benchmark_configs(configs_file):
    """Turn ``configs_file`` (DataFrame, keyword or CSV path) into a configs DataFrame."""
    if isinstance(configs_file, pd.DataFrame):
        print(f"Loaded {len(configs_file)} configs from provided DataFrame")
        return configs_file

    if configs_file == 'all':
        # Spec-driven generated configs first, then every legacy experiment
        # file (DE and DA) that exists on disk.
        candidates = [Path('results/configs_generated.csv')]
        candidates.extend(
            Path(f'results/configs_{experiment}.csv')
            for experiment in ('landmarks', 'dimensions', 'batching', 'unified',
                               'da_landmarks', 'da_dimensions', 'da_batching')
        )
        config_files = [path for path in candidates if path.exists()]

        if not config_files:
            raise FileNotFoundError("No config files found. Run scripts/generate_all_configs.py first.")

        configs_df = pd.concat([pd.read_csv(f) for f in config_files], ignore_index=True)
        print(f"Loaded {len(configs_df)} configs from {len(config_files)} experiment(s)")
        return configs_df

    if configs_file == 'generated' or configs_file in ('landmarks', 'dimensions', 'batching', 'unified'):
        v_file = Path(f'results/configs_{configs_file}.csv')

        if not v_file.exists():
            if configs_file == 'generated':
                hint = "Run: python scripts/generate_all_configs.py"
            else:
                hint = f"Run: python scripts/generate_unified_configs.py --benchmarks {configs_file}"
            raise FileNotFoundError(f"Config file not found: {v_file}\n{hint}")

        configs_df = pd.read_csv(v_file)
        print(f"Loaded {len(configs_df)} configs from {v_file.name}")
        return configs_df

    # Anything else is treated as an explicit CSV path.
    configs_df = pd.read_csv(configs_file)
    print(f"Loaded {len(configs_df)} configs from {configs_file}")
    return configs_df


def _query_sacct(job_ids):
    """Return the stdout of ``sacct`` for ``job_ids`` ('' when it cannot be run)."""
    job_ids = list(job_ids)
    if not job_ids:
        return ''

    # No -X flag: the .batch sub-jobs are needed because they carry MaxRSS.
    # An argument list (no shell) keeps the job IDs from being shell-parsed.
    cmd = [
        'sacct', '-j', ','.join(map(str, job_ids)),
        '--format=JobID,State,Elapsed,MaxRSS', '-P',
    ]
    try:
        completed = subprocess.run(cmd, capture_output=True, text=True)
    except OSError:
        # sacct is missing or not executable: behave as if it reported
        # nothing, so results are still collected from the JSON files.
        return ''
    return completed.stdout


def _parse_sacct_output(sacct_stdout):
    """Index pipe-separated ``sacct`` output by ``(job_id, array_task_id)``.

    ``array_task_id`` is None for jobs that are not part of an array. Each
    value is a dict that may hold 'job_id', 'state', 'elapsed' (taken from the
    allocation line) and 'maxrss_raw', 'maxrss_gb' (taken from the .batch line).
    """
    jobs = {}
    for line in sacct_stdout.strip().split('\n')[1:]:  # first line is the header
        fields = line.split('|')
        if len(fields) < 4:
            continue
        raw_job_id, state, elapsed, maxrss = fields[:4]

        # "40443845_0.batch" -> allocation "40443845_0", step "batch"
        allocation, _, step = raw_job_id.partition('.')
        job_part, is_array, task_part = allocation.partition('_')

        # Skip array summary lines like "40870717_[1597-1599,1620-1629]"
        if '[' in task_part or ']' in task_part:
            continue

        try:
            job_key = (int(job_part), int(task_part) if is_array else None)
        except ValueError:
            # Job ID in a format this parser does not understand.
            continue

        if step == 'batch':
            # The .batch step carries the actual resource usage.
            if maxrss:
                info = jobs.setdefault(job_key, {})
                info['maxrss_raw'] = maxrss
                info['maxrss_gb'] = parse_maxrss(maxrss)
        elif not step:
            # Only the allocation line itself provides state and elapsed time;
            # other steps (.extern, numbered steps) must not overwrite it.
            info = jobs.setdefault(job_key, {})
            info['job_id'] = raw_job_id
            info['state'] = state
            info['elapsed'] = elapsed

    return jobs


def _find_result_json(results_path, config, config_name):
    """Return the path of the result JSON for one config, or None if absent."""
    prefixes = []
    if 'benchmark_set' in config and pd.notna(config['benchmark_set']):
        # Result files use hyphens where benchmark_set uses underscores.
        prefixes.append(config['benchmark_set'].replace('_', '-'))
    prefixes.extend(_LEGACY_RESULT_PREFIXES)

    for prefix in prefixes:
        candidate = results_path / f'{prefix}_{config_name}.json'
        if candidate.exists():
            return candidate
    return None


def _match_slurm_job(jobs, json_data):
    """Look up the sacct record for the job ID stored in a result JSON."""
    slurm_job_id = json_data.get('slurm_job_id')
    if not slurm_job_id:
        return None

    base_id = int(slurm_job_id)
    task_id = json_data.get('slurm_array_task_id')

    # Compare against None/'' rather than truthiness: array task 0 is valid.
    if task_id is not None and task_id != '':
        return jobs.get((base_id, int(task_id)))

    # Non-array job: exact match first.
    exact = jobs.get((base_id, None))
    if exact:
        return exact

    # Legacy behaviour: fall back to the first array task recorded for this job.
    for (job_id, _task), info in jobs.items():
        if job_id == base_id:
            return info
    return None


def collect_benchmark_results(
    configs_file = 'all',
    results_dir: str = 'results',
    job_ids: Optional[List[int]] = None
) -> pd.DataFrame:
    """
    Collect benchmark results from JSON files and SLURM MaxRSS.

    Parameters:
    -----------
    configs_file : str or DataFrame
        Path to configuration CSV file(s) or DataFrame
        Options:
        - 'all': Load all available experiments
        - 'generated': Load the spec-driven generated configs
        - 'landmarks': Load n_landmarks sweep experiment
        - 'dimensions': Load n_genes/n_cells experiment
        - 'batching': Load batching comparison experiment
        - 'unified': Load unified multi-parameter experiment
        - explicit path: Load specific file
        - DataFrame: Use provided DataFrame directly
    results_dir : str
        Directory containing result JSON files
    job_ids : List[int], optional
        SLURM job IDs to query for MaxRSS. None (the default) queries
        40443845 and 40443863; an empty list skips the sacct query.

    Returns:
    --------
    DataFrame with all benchmark results including SLURM MaxRSS

    Raises:
    -------
    FileNotFoundError
        If the requested configuration file(s) do not exist.
    """
    if job_ids is None:
        job_ids = _DEFAULT_SLURM_JOB_IDS

    configs_df = _load_benchmark_configs(configs_file)

    # Keyed by (job_id, array_task_id) to avoid collisions between experiments.
    jobs = _parse_sacct_output(_query_sacct(job_ids))

    results = []
    results_path = Path(results_dir)

    # Config columns copied only when present and not NaN, with their casts.
    optional_fields = (
        ('replicate_id', int),        # needed to aggregate replicates in plots
        ('n_landmarks', int),         # some configs don't use landmarks
        ('n_genes', int),             # DE-specific
        ('compute_mahalanobis', bool),  # DE-specific
        ('n_components', int),        # DA-specific
    )
    # Values copied verbatim from the result JSON (None when there is no file).
    metric_keys = (
        'runtime_seconds', 'runtime_minutes',
        'memory_before_gb', 'memory_after_gb',
        'peak_memory_tracemalloc_gb', 'success', 'error_msg',
    )

    for _, config in configs_df.iterrows():
        config_id = config['config_id']
        config_name = config['config_name']

        # Determine analysis type from config or infer from presence of n_genes
        if 'analysis_type' in config and pd.notna(config['analysis_type']):
            analysis_type = config['analysis_type']
        elif 'n_genes' in config and pd.notna(config['n_genes']):
            analysis_type = 'de'
        else:
            analysis_type = 'da'  # No n_genes means DA

        result_dict = {
            'config_id': config_id,
            'config_name': config_name,
            'benchmark_set': config.get('benchmark_set'),
            'benchmark_type': config.get('benchmark_type'),
            'comparison_group': config.get('comparison_group'),  # for spec-driven plotting
            'plot_name': config.get('plot_name'),  # target plot name
            'analysis_type': analysis_type,
            'n_cells': int(config['n_cells']),
            'use_sample_variance': bool(config['use_sample_variance']),
            'store_on_disk': bool(config['store_on_disk'])
        }

        for field, cast in optional_fields:
            if field in config and pd.notna(config[field]):
                result_dict[field] = cast(config[field])

        # Result file: benchmark_set prefix first, then legacy prefixes.
        json_file = _find_result_json(results_path, config, config_name)

        # Reset per config so data from a previous iteration is never reused.
        json_data = None
        if json_file is not None:
            with open(json_file, 'r') as f:
                json_data = json.load(f)

            for key in metric_keys:
                result_dict[key] = json_data.get(key)

            # Read batch_size with backward compatibility
            if 'batch_size' in json_data:
                result_dict['batch_size'] = json_data['batch_size']
            else:
                result_dict['batch_size'] = 100  # Default for old results
                if result_dict['success']:  # Only warn for successful runs
                    warnings.warn(f"Config {config_id}: batch_size not found in JSON, assuming 100 (backward compatibility)")
        else:
            for key in metric_keys:
                result_dict[key] = None
            result_dict['batch_size'] = None

        # SLURM info. Without any source it stays None, which is preferred
        # over attaching data from a possibly wrong job.
        slurm_fields = {
            'slurm_state': None,
            'slurm_elapsed': None,
            'slurm_maxrss_raw': None,
            'slurm_maxrss_gb': None,
        }

        if json_data:
            if json_data.get('slurm_state') is not None:
                # FIRST choice: SLURM data the benchmark run saved in its own
                # JSON; it survives even if SLURM accounting is dropped.
                slurm_fields['slurm_state'] = json_data.get('slurm_state')
                slurm_fields['slurm_elapsed'] = json_data.get('slurm_elapsed')
                slurm_fields['slurm_maxrss_raw'] = None  # Not saved in JSON
                slurm_fields['slurm_maxrss_gb'] = json_data.get('slurm_maxrss_gb')
            else:
                # SECOND choice: match the sacct record through the job ID
                # stored in the JSON (no config_id-based guessing).
                job_info = _match_slurm_job(jobs, json_data)
                if job_info:
                    slurm_fields['slurm_state'] = job_info.get('state', None)
                    slurm_fields['slurm_elapsed'] = job_info.get('elapsed')
                    slurm_fields['slurm_maxrss_raw'] = job_info.get('maxrss_raw')
                    slurm_fields['slurm_maxrss_gb'] = job_info.get('maxrss_gb')

        result_dict.update(slurm_fields)
        results.append(result_dict)

    return pd.DataFrame(results)


def plot_scaling(
    df: pd.DataFrame,
    x_param: str,
    y_metric: str,
    fixed_params: Dict[str, object],
    output_dir: Optional[Path] = None,
    show_legend_separately: bool = True,
    title: Optional[str] = None,
    xlabel: Optional[str] = None,
    ylabel: Optional[str] = None,
    reference_lines: Optional[List[Dict]] = None
) -> Tuple[Optional[plt.Figure], Optional[plt.Figure]]:
    """
    Plot scaling of a metric (runtime or memory) vs a swept parameter on log-log axes.

    Parameters:
    -----------
    df : pd.DataFrame
        Benchmark results DataFrame. Must contain ``x_param``, ``y_metric``,
        every key of ``fixed_params`` and the columns 'success',
        'use_sample_variance', 'batch_size' and 'store_on_disk'.
    x_param : str
        Parameter to plot on x-axis (e.g. 'n_cells', 'n_genes', 'n_landmarks')
    y_metric : str
        Metric to plot on y-axis ('runtime_seconds' or 'slurm_maxrss_gb')
    fixed_params : Dict[str, object]
        Parameters to hold fixed (e.g., {'n_genes': 200} for cell sweep at 200 genes)
    output_dir : Optional[Path]
        Directory to save plots (created if missing); nothing is saved when None
    show_legend_separately : bool
        If True, create separate legend plot; otherwise draw the legend on the main axes
    title : str
        Plot title (auto-generated if None)
    xlabel : str
        X-axis label (auto-generated if None)
    ylabel : str
        Y-axis label (auto-generated if None)
    reference_lines : List[Dict]
        Reference lines to add. Entries with ``'type': 'vertical'`` draw a
        dashed vertical line at ``'value'`` with an optional ``'label'``.

    Returns:
    --------
    (main_fig, legend_fig) : Tuple of matplotlib figures.
        ``legend_fig`` is None unless a separate legend was drawn, and both
        are None when no rows survive filtering.
    """
    # Filter data based on fixed parameters
    filtered_df = df.copy()
    for param, value in fixed_params.items():
        filtered_df = filtered_df[filtered_df[param] == value]

    # Filter out failed/missing data
    filtered_df = filtered_df[filtered_df['success'] == True]
    filtered_df = filtered_df.dropna(subset=[y_metric])

    # Remove invalid/zero measurements
    if y_metric == 'slurm_maxrss_gb':
        filtered_df = filtered_df[filtered_df[y_metric] > 0]

    # Deduplicate: keep one result per (x_param, use_sample_variance,
    # batch_size, store_on_disk[, n_genes]) combination, preferring the
    # specific experiments over 'unified'. Rows are ordered alphabetically by
    # benchmark_type and the 'unified' rows are then moved to the front, so
    # that keep='last' below retains a non-unified row whenever one exists.
    # (A plain alphabetical sort would put 'unified' last and keep it.)
    if 'benchmark_type' in filtered_df.columns:
        filtered_df = filtered_df.sort_values('benchmark_type')
        is_unified = (
            filtered_df['benchmark_type'].eq('unified').fillna(False).to_numpy(dtype=bool)
        )
        # Stable sort on "not unified": unified rows first, order otherwise kept.
        filtered_df = filtered_df.iloc[np.argsort(~is_unified, kind='stable')]

    # Include n_genes when it is neither fixed nor already the x-axis, so
    # results are not merged across gene counts.
    dedup_cols = [x_param, 'use_sample_variance', 'batch_size', 'store_on_disk']
    if ('n_genes' in filtered_df.columns and 'n_genes' not in fixed_params
            and 'n_genes' not in dedup_cols):
        dedup_cols.append('n_genes')
    filtered_df = filtered_df.drop_duplicates(subset=dedup_cols, keep='last')

    if len(filtered_df) == 0:
        print(f"No data available for {x_param} sweep with {fixed_params}")
        return None, None

    # Create main plot
    fig, ax = plt.subplots(figsize=(8, 5))

    # One line per gene count only when n_genes is neither fixed nor the x-axis
    # (the x-axis parameter is already represented spatially).
    group_by_n_genes = (
        'n_genes' in filtered_df.columns
        and 'n_genes' not in fixed_params
        and x_param != 'n_genes'
    )

    # First level of grouping: No SV, then SV split by storage (mem, disk).
    modes = []
    no_sv_df = filtered_df[filtered_df['use_sample_variance'] == False]
    if len(no_sv_df) > 0:
        modes.append(('No SV', no_sv_df))
    sv_df = filtered_df[filtered_df['use_sample_variance'] == True]
    for store_disk in [False, True]:
        disk_df = sv_df[sv_df['store_on_disk'] == store_disk]
        if len(disk_df) > 0:
            storage_str = 'disk' if store_disk else 'mem'
            modes.append((f'SV ({storage_str})', disk_df))

    # Second level: batch_size, then optionally n_genes.
    groups = []  # (legend name, rows sorted by x_param)
    for mode_name, mode_df in modes:
        for batch_size in sorted(mode_df['batch_size'].dropna().unique()):
            batch_df = mode_df[mode_df['batch_size'] == batch_size]
            if len(batch_df) == 0:
                continue
            batch_str = "no-batch" if batch_size == 0 else f"batch={int(batch_size)}"

            if group_by_n_genes:
                for n_genes in sorted(batch_df['n_genes'].dropna().unique()):
                    gene_df = batch_df[batch_df['n_genes'] == n_genes]
                    if len(gene_df) > 0:
                        groups.append((
                            f'{mode_name}, {batch_str}, {int(n_genes)}g',
                            gene_df.sort_values(x_param),
                        ))
            else:
                groups.append((f'{mode_name}, {batch_str}', batch_df.sort_values(x_param)))

    # Fixed-parameter description shared by every legend entry.
    fixed_info = []
    for param, value in fixed_params.items():
        if param == 'n_genes':
            fixed_info.append(f"{value:,} genes")
        elif param == 'n_cells':
            fixed_info.append(f"{value:,} cells")

    # Color palette for groups (matching runtime analysis style)
    colors = plt.cm.tab10(np.linspace(0, 1, len(groups)))

    legend_entries = []
    for (label_name, group_df), color in zip(groups, colors):
        # Plot line (matching runtime analysis: marker='o', markersize=3, linewidth=1.5)
        line, = ax.plot(group_df[x_param].values, group_df[y_metric].values,
                        marker='o', markersize=3, linewidth=1.5,
                        color=color, alpha=0.8)

        # Build legend label with metadata
        details = []
        if fixed_info:
            details.append(", ".join(fixed_info))

        # n_landmarks is optional in the results; the column becomes float
        # when some rows lack it, so whole numbers are shown without ".0".
        if 'n_landmarks' in group_df.columns:
            n_landmarks = group_df['n_landmarks'].iloc[0]
            if pd.notna(n_landmarks):
                if float(n_landmarks).is_integer():
                    n_landmarks = int(n_landmarks)
                details.append(f"lm={n_landmarks}")

        label = f"{label_name} ({', '.join(details)})" if details else label_name
        legend_entries.append((line, label))

    # Configure axes
    ax.set_xscale('log')
    ax.set_yscale('log')

    # Set labels
    if xlabel is None:
        default_xlabels = {
            'n_cells': 'Number of cells',
            'n_genes': 'Number of genes',
            'n_landmarks': 'Number of landmarks',
        }
        xlabel = default_xlabels.get(x_param, x_param)
    if ylabel is None:
        if y_metric == 'runtime_seconds':
            ylabel = 'Log runtime (seconds)'
        elif y_metric == 'slurm_maxrss_gb':
            ylabel = 'Log peak memory (GB, SLURM MaxRSS)'
        else:
            ylabel = y_metric

    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)

    # Set title
    if title is None:
        metric_name = 'Runtime' if 'runtime' in y_metric else 'Memory'
        fixed_desc = ', '.join([f"{k}={v}" for k, v in fixed_params.items()])
        title = f'{metric_name} Scaling vs {xlabel} ({fixed_desc})'

    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, linestyle='--', linewidth=0.5)

    # Add reference lines if specified
    if reference_lines:
        for ref_line in reference_lines:
            if ref_line.get('type') == 'vertical':
                ax.axvline(x=ref_line['value'], color='gray', linestyle='--',
                           linewidth=1, alpha=0.6, zorder=1)
                if 'label' in ref_line:
                    ax.text(ref_line['value'], ax.get_ylim()[1] * 0.95,
                            f" {ref_line['label']}", fontsize=8, color='gray',
                            va='top', ha='right', rotation=90)

    # Add legend to main plot if not showing separately (skipped when no
    # line was drawn, since there is nothing to unpack).
    if not show_legend_separately and legend_entries:
        lines, labels = zip(*legend_entries)
        ax.legend(lines, labels, fontsize=9, framealpha=0.9, loc='best')

    fig.tight_layout()

    # Output location and file-name stem shared by the plot and its legend.
    if output_dir is not None:
        output_dir = Path(output_dir)  # also accept a plain string path
        output_dir.mkdir(parents=True, exist_ok=True)
    fixed_str = '_'.join([f"{k}{v}" for k, v in fixed_params.items()])

    # Save main plot
    if output_dir is not None:
        filename = f"scaling_{y_metric}_{x_param}_{fixed_str}.png"
        fig.savefig(output_dir / filename, dpi=300, bbox_inches='tight',
                    facecolor=(1, 1, 1, 0))
        print(f"Saved: {output_dir / filename}")

    # Create separate legend plot if requested
    legend_fig = None
    if show_legend_separately and len(legend_entries) > 0:
        # Height grows with the number of entries.
        legend_fig = plt.figure(figsize=(6, len(legend_entries) * 0.3 + 0.5))
        legend_ax = legend_fig.add_subplot(111)
        legend_ax.axis('off')

        lines, labels = zip(*legend_entries)
        legend_ax.legend(lines, labels, loc='center', fontsize=9,
                         frameon=False, ncol=1)

        legend_fig.tight_layout()

        # Save legend
        if output_dir is not None:
            filename = f"scaling_{y_metric}_{x_param}_{fixed_str}_legend.png"
            legend_fig.savefig(output_dir / filename, dpi=300, bbox_inches='tight',
                               facecolor=(1, 1, 1, 0))
            print(f"Saved: {output_dir / filename}")

    return fig, legend_fig


def _format_distinct_counts(values: pd.Series) -> str:
    """Render the distinct non-null values of a column as sorted, comma-grouped integers."""
    return ', '.join(f'{int(v):,}' for v in sorted(values.dropna().unique()))


def print_summary_statistics(df: pd.DataFrame):
    """Print summary statistics for benchmark results.

    Only rows with ``success == True`` are summarised. Results are grouped by
    'benchmark_type' when that column holds at least one non-null value
    (otherwise everything is reported under "ALL EXPERIMENTS") and then by
    'use_sample_variance'. For each group the parameter ranges, runtime and
    SLURM MaxRSS statistics are printed.

    Parameters:
    -----------
    df : pd.DataFrame
        Benchmark results. Requires the columns 'success',
        'use_sample_variance', 'n_cells', 'runtime_seconds' and
        'slurm_maxrss_gb'; 'benchmark_type', 'n_genes' and 'n_landmarks'
        are optional.
    """

    print("\n" + "="*80)
    print("COMPLEXITY SCALING SUMMARY STATISTICS")
    print("="*80)

    # Filter successful runs only
    success_df = df[df['success'] == True].copy()

    if len(success_df) == 0:
        print("No successful runs found.")
        return

    # Group by benchmark type. None stands for "everything in one group" and
    # is used when the column is missing or contains no usable value, so the
    # summary is never silently empty.
    benchmark_types = []
    if 'benchmark_type' in success_df.columns:
        benchmark_types = list(success_df['benchmark_type'].dropna().unique())
    if not benchmark_types:
        benchmark_types = [None]

    for benchmark_type in benchmark_types:
        if benchmark_type is None:
            type_df = success_df
            print("\nALL EXPERIMENTS:")
        else:
            type_df = success_df[success_df['benchmark_type'] == benchmark_type]
            print(f"\n{str(benchmark_type).upper().replace('_', ' ')}:")
        print("-" * 80)

        # Group by use_sample_variance
        for use_sv in [False, True]:
            sv_df = type_df[type_df['use_sample_variance'] == use_sv]

            if len(sv_df) == 0:
                continue

            mode = 'SV (disk)' if use_sv else 'No SV'
            print(f"\n  {mode}:")

            # Show parameter ranges. n_genes and n_landmarks are optional
            # columns, so they are reported only when they hold a value.
            print(f"    n_cells: {_format_distinct_counts(sv_df['n_cells'])}")
            for optional_param in ('n_genes', 'n_landmarks'):
                if optional_param in sv_df.columns and sv_df[optional_param].notna().any():
                    print(f"    {optional_param}: {_format_distinct_counts(sv_df[optional_param])}")

            # Runtime statistics
            runtime_valid = sv_df.dropna(subset=['runtime_seconds'])
            if len(runtime_valid) > 0:
                rt_min = runtime_valid['runtime_seconds'].min()
                rt_max = runtime_valid['runtime_seconds'].max()
                rt_mean = runtime_valid['runtime_seconds'].mean()
                print(f"    Runtime: min={rt_min:.1f}s ({rt_min/60:.1f}m), " +
                      f"max={rt_max:.1f}s ({rt_max/60:.1f}m), " +
                      f"mean={rt_mean:.1f}s ({rt_mean/60:.1f}m)")

            # Memory statistics (SLURM MaxRSS)
            memory_valid = sv_df.dropna(subset=['slurm_maxrss_gb'])
            if len(memory_valid) > 0:
                mem_min = memory_valid['slurm_maxrss_gb'].min()
                mem_max = memory_valid['slurm_maxrss_gb'].max()
                mem_mean = memory_valid['slurm_maxrss_gb'].mean()
                print(f"    Memory (SLURM MaxRSS): min={mem_min:.2f}GB, " +
                      f"max={mem_max:.2f}GB, mean={mem_mean:.2f}GB")

    print("\n" + "="*80)
    print("\nKEY INTERPRETATION NOTES:")
    print("-" * 80)
    print("  ✓ Only compare No SV vs SV at 200 genes (comparison benchmark)")
    print("  ✓ Other configs show scaling trends, NOT direct comparison")
    print("  ✓ All configs use n_landmarks=5000 (default value)")
    print("  ✓ Memory measured using SLURM MaxRSS (OS-reported peak)")
    print("="*80)