Update run_phase0_experiments.py: run experiments in parallel with a thread pool (--max-workers)

Vishal-sys-code · Vishal-sys-code · commit 7199bb12d6bb · 2026-01-19T17:32:01.000+05:30
diff --git a/scripts/run_phase0_experiments.py b/scripts/run_phase0_experiments.py
@@ -3,6 +3,8 @@
 import subprocess
 import time
 import yaml
+import argparse
+import concurrent.futures
 from pathlib import Path
 import logging
 
@@ -21,20 +23,8 @@ def is_run_complete(save_dir):
     # Check for metrics.csv or a specific completion flag
     return (save_dir / "metrics.csv").exists()
 
-def main():
-    configs = get_configs()
-    logger.info(f"Found {len(configs)} experiments to run.")
-    
-    # Verify Data Exists
-    # We expect data/d4rl/{env}/dataset_v1.npz
-    # We can check the first config to see what env it needs, but roughly:
-    if not DATA_DIR.exists():
-        logger.error(f"Data directory {DATA_DIR} does not exist. Please run convert_d4rl.py first.")
-        # We could try to run conversion here automatically?
-        # Let's assume the user/previous step handled it, or alert.
-        pass
-
-    for config_path in configs:
+def run_experiment(config_path, worker_id=0):
+    try:
         with open(config_path, 'r') as f:
             cfg = yaml.safe_load(f)
             
@@ -49,9 +39,9 @@ def main():
         
         if is_run_complete(save_dir):
             logger.info(f"Skipping {model} on {env} (Run Complete)")
-            continue
+            return
             
-        logger.info(f"Starting {model} on {env}...")
+        logger.info(f"Worker {worker_id}: Starting {model} on {env}...")
         
         # Construct Command
         cmd = [
@@ -61,21 +51,52 @@ def main():
             "--env", env,
             "--model", model,
             "--seed", str(seed),
-            # Add dataset path explicitly if needed, but train.py infers it.
-            # train.py infers: project_root / f"data/{args.env}/dataset.npz"
-            # Our convert script puts it in: data/d4rl/{env}/dataset_v1.npz
-            # This is a MISMATCH. We need to point train.py to the right place.
             "--dataset-path", str(DATA_DIR / env / "dataset_v1.npz")
         ]
         
-        try:
-            # Run Synchronously for now
-            subprocess.run(cmd, check=True)
-            logger.info(f"Finished {model} on {env}")
-        except subprocess.CalledProcessError as e:
-            logger.error(f"Failed {model} on {env}: {e}")
-            # Continue to next experiment?
-            time.sleep(1)
+        # Set environment variables for this process to limit CPU usage
+        env_vars = os.environ.copy()
+        # Limit threads per process to avoid thrashing
+        # Assuming 3 workers on a typical 12+ core machine, 4 threads each is safe.
+        # If user has fewer cores, they should reduce max-workers.
+        env_vars["OMP_NUM_THREADS"] = "4"
+        env_vars["MKL_NUM_THREADS"] = "4"
+        env_vars["TORCH_NUM_THREADS"] = "4"
+        
+        # Run Synchronously (within the worker thread)
+        subprocess.run(cmd, check=True, env=env_vars)
+        logger.info(f"Worker {worker_id}: Finished {model} on {env}")
+        
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Worker {worker_id}: Failed {model} on {env}: {e}")
+    except Exception as e:
+        logger.error(f"Worker {worker_id}: Error processing {config_path}: {e}")
+
+def main():
+    """Parse CLI options and run every Phase 0 experiment config on a thread pool."""
+    parser = argparse.ArgumentParser(description="Run Phase 0 Experiments")
+    parser.add_argument("--max-workers", type=int, default=3, help="Number of parallel experiments to run")
+    args = parser.parse_args()
+    # ThreadPoolExecutor raises ValueError for max_workers <= 0; reject it as a usage error.
+    if args.max_workers < 1:
+        parser.error("--max-workers must be at least 1")
+
+    configs = get_configs()
+    logger.info(f"Found {len(configs)} experiments to run.")
+    if not DATA_DIR.exists():
+        logger.error(f"Data directory {DATA_DIR} does not exist. Please run convert_d4rl.py first.")
+        return
+
+    # Each worker thread only blocks on the subprocess it launches, so threads are sufficient here.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=args.max_workers) as executor:
+        # i % args.max_workers is only a rough worker label for log lines, not a thread identity.
+        futures = [executor.submit(run_experiment, path, i % args.max_workers) for i, path in enumerate(configs)]
+        # Surface anything run_experiment did not handle itself.
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                logger.error(f"An experiment failed with exception: {e}")
 
 if __name__ == "__main__":
     main()