kompot_de_benchmark/run_comprehensive_tests.py at main · settylab/kompot_de_benchmark · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3
"""
Comprehensive test suite for the benchmarking system.

This script runs all validation tests and verifies the complete
benchmarking pipeline is ready for production use.
"""

import subprocess
import sys
from pathlib import Path
import time


def run_test(test_name: str, command: list, description: str = "",
             timeout: float = 300) -> bool:
    """Run one command as a named test and print a pass/fail report.

    Args:
        test_name: Label shown in the report header.
        command: Argument vector handed to ``subprocess.run`` (no shell).
        description: Optional one-line explanation printed under the label.
        timeout: Seconds to wait for the command before it is reported as a
            timeout. Defaults to 300 (5 minutes).

    Returns:
        True if the command exited with status 0. False on a non-zero exit
        status, on timeout, or if launching the command raised an exception.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"TEST: {test_name}")
    if description:
        print(f"Description: {description}")
    # str() every argument so non-string entries (e.g. Path objects) cannot
    # make the banner itself raise before the test has even started.
    print(f"Command: {' '.join(str(arg) for arg in command)}")
    print(banner)

    # Monotonic clock: the measured duration is unaffected by wall-clock jumps.
    start_time = time.monotonic()

    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        print(f"❌ TIMEOUT (>{timeout:g}s)")
        # Show whatever the command produced before it was stopped; without
        # this a hung test gives no hint about where it got stuck.
        for label, captured in (("STDERR", exc.stderr), ("STDOUT", exc.stdout)):
            if not captured:
                continue
            # NOTE(review): partial output may be bytes on some Python
            # versions even with text=True, so decode defensively.
            if isinstance(captured, bytes):
                captured = captured.decode(errors="replace")
            print(f"{label} (partial):")
            print(captured[-500:])
        return False
    except Exception as e:
        # Deliberately broad: a command that cannot be launched (missing
        # executable, bad arguments, ...) counts as a failed test rather than
        # aborting the whole suite.
        print(f"❌ ERROR: {e}")
        return False

    duration = time.monotonic() - start_time

    if result.returncode == 0:
        print(f"✅ PASSED ({duration:.1f}s)")
        if result.stdout:
            print("STDOUT:")
            print(result.stdout[-500:])  # Last 500 chars keep the report short
        return True

    # On failure print everything: the full streams are needed for diagnosis.
    print(f"❌ FAILED ({duration:.1f}s)")
    print("STDERR:")
    print(result.stderr)
    if result.stdout:
        print("STDOUT:")
        print(result.stdout)
    return False


def main() -> bool:
    """Run the comprehensive test suite and print a summary.

    Each test is a command executed through ``run_test``. Script paths in the
    commands are relative, so they are resolved against the current working
    directory of this process.

    Returns:
        True when all critical tests ("list_tools", "pipeline_test",
        "format_validation") passed, even if non-critical tests failed;
        False when any critical test failed.
    """
    print("🧪 COMPREHENSIVE BENCHMARKING SYSTEM TEST SUITE")
    print("="*80)

    # Launch child scripts with the interpreter running this suite. A bare
    # "python" may be absent from PATH (the shebang asks for python3) or may
    # belong to a different environment than the one being validated.
    python = sys.executable or "python"

    # (result key, display name, command, description) in execution order.
    suite = [
        # Test 1: Environment and dependencies
        (
            "list_tools",
            "List Available Tools",
            [python, "processing_config.py", "--list-tools"],
            "Verify tool configuration and basic imports",
        ),
        # Test 2: Pipeline functionality
        (
            "pipeline_test",
            "Benchmarking Pipeline",
            [python, "test_benchmarking_pipeline.py"],
            "Test complete pipeline with synthetic data",
        ),
        # Test 3: Result format validation
        (
            "format_validation",
            "Result Format Validation",
            [python, "validate_result_format.py"],
            "Ensure standardized output format",
        ),
        # Test 4: Dry run submission
        (
            "dry_run",
            "SLURM Dry Run",
            [python, "processing_config.py", "--submit-tool-benchmark", "aging", "scanpy", "--dry-run"],
            "Test SLURM job submission without execution",
        ),
        # Test 5: Configuration listing
        (
            "list_configs",
            "List Configurations",
            [python, "processing_config.py", "--list"],
            "Verify dataset and parameter configurations",
        ),
        # Test 6: Result collection (basic)
        (
            "result_collection",
            "Result Collection Script",
            [python, "benchmarking/scripts/gather_benchmark_results.py", "--help"],
            "Test result collection and comparison tools",
        ),
        # Test 7: Environment setup script syntax (bash -n parses only)
        (
            "setup_script",
            "Setup Script Syntax",
            ["bash", "-n", "setup_environments.sh"],
            "Validate setup script syntax",
        ),
    ]

    # Track all test results, keyed by short name, in execution order.
    test_results = {}
    for key, display_name, command, description in suite:
        test_results[key] = run_test(display_name, command, description)

    # Summary
    print("\n" + "="*80)
    print("🏁 TEST SUITE SUMMARY")
    print("="*80)

    for test_name, success in test_results.items():
        status = "PASS" if success else "FAIL"
        icon = "✅" if success else "❌"
        print(f"{icon} {test_name}: {status}")

    passed = sum(1 for success in test_results.values() if success)
    failed = len(test_results) - passed

    print(f"\nResults: {passed} passed, {failed} failed")

    # Overall assessment: only the critical tests decide the return value.
    critical_tests = ["list_tools", "pipeline_test", "format_validation"]
    failed_critical = [test for test in critical_tests if not test_results.get(test, False)]

    if failed_critical:
        print("\n💥 CRITICAL TESTS FAILED")
        print("The benchmarking system is not ready for use.")
        print("Please review and fix the failed tests before proceeding.")
        print(f"Failed critical tests: {failed_critical}")
        return False

    if failed == 0:
        print("\n🎉 ALL TESTS PASSED!")
        print("The benchmarking system is ready for production use.")

        print("\n📋 Next Steps:")
        print("1. Set up tool environments: ./setup_environments.sh")
        print("2. Submit first benchmark: python processing_config.py --submit-tool-benchmark aging scanpy")
        print("3. Monitor progress: squeue -u $USER")
        print("4. Analyze results: python benchmarking/scripts/gather_benchmark_results.py")
        return True

    # Critical tests passed but something non-critical failed: still reported
    # as success to the caller, with a warning for the operator.
    print("\n⚠️  CRITICAL TESTS PASSED")
    print(f"Core functionality works, but {failed} non-critical tests failed.")
    print("The system may be usable but should be reviewed.")
    return True


if __name__ == "__main__":
    # Map the suite verdict onto the process exit status:
    # 0 when main() reports success, 1 otherwise.
    sys.exit(0 if main() else 1)