fix: add macOS spawn fallback for PyTorch DataLoader workers

marcellodebernardi · marcellodebernardi · commit 4aa0d897350b · 2026-03-02T21:25:06.000Z
diff --git a/Makefile b/Makefile
@@ -71,21 +71,23 @@ help:
 .PHONY: test-integration
 test-integration:
 	@echo "🧪 Running staged pytest integration suite..."
+	@echo "Using DATALOADER_WORKERS=$${DATALOADER_WORKERS:-0}"
 	@if [ -n "$(INTEGRATION_RUN_ID)" ]; then \
 		echo "Using integration run id: $(INTEGRATION_RUN_ID)"; \
-		PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" bash scripts/tests/run_integration_staged.sh; \
+		DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" bash scripts/tests/run_integration_staged.sh; \
 	else \
-		bash scripts/tests/run_integration_staged.sh; \
+		DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" bash scripts/tests/run_integration_staged.sh; \
 	fi
 
 .PHONY: test-integration-verbose
 test-integration-verbose:
 	@echo "🧪 Running staged pytest integration suite (verbose)..."
+	@echo "Using DATALOADER_WORKERS=$${DATALOADER_WORKERS:-0}"
 	@if [ -n "$(INTEGRATION_RUN_ID)" ]; then \
 		echo "Using integration run id: $(INTEGRATION_RUN_ID)"; \
-		PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
+		DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
 	else \
-		PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
+		DATALOADER_WORKERS="$${DATALOADER_WORKERS:-0}" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
 	fi
 
 # Fast sanity check - 1 iteration, minimal config
diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md
@@ -1,6 +1,6 @@
 # Code Index: plexe
 
-> Generated on 2026-03-02 21:24:57
+> Generated on 2026-03-02 21:25:06
 
 Code structure and public interface documentation for the **plexe** package.
 
diff --git a/plexe/templates/training/train_pytorch.py b/plexe/templates/training/train_pytorch.py
@@ -12,6 +12,7 @@
 import inspect
 import json
 import logging
+import multiprocessing as mp
 import os
 import sys
 from pathlib import Path
@@ -48,6 +49,27 @@ def _is_rank0(use_ddp: bool) -> bool:
     return dist.get_rank() == 0
 
 
+def _resolve_num_workers(requested_workers: int) -> int:
+    """Resolve safe DataLoader worker count for the current runtime."""
+    if requested_workers <= 0:  # zero or negative requests always resolve to 0
+        return 0
+
+    start_method = mp.get_start_method(allow_none=True)  # may be None rather than a method name
+    if start_method is None:
+        start_method = mp.get_context().get_start_method()  # NOTE(review): get_context() may fix the global start method as a side effect - confirm
+
+    if sys.platform == "darwin" and start_method == "spawn":  # only macOS + spawn triggers the fallback
+        logger.warning(
+            "Falling back DataLoader workers from %s to 0 on platform=%s start_method=%s",
+            requested_workers,
+            sys.platform,
+            start_method,
+        )
+        return 0
+
+    return requested_workers  # every other platform/start-method pair keeps the request
+
+
 def train_pytorch(
     untrained_model_path: Path,
     train_uri: str,
@@ -151,16 +173,18 @@ def train_pytorch(
     train_dataset = ParquetIterableDataset(train_uri, target_column, task_type)
     val_dataset = ParquetIterableDataset(val_uri, target_column, task_type)
 
+    effective_num_workers = _resolve_num_workers(num_workers)
+
     train_loader = torch.utils.data.DataLoader(
         train_dataset,
         batch_size=batch_size,
-        num_workers=num_workers,
+        num_workers=effective_num_workers,
         pin_memory=device.type == "cuda",
     )
     val_loader = torch.utils.data.DataLoader(
         val_dataset,
         batch_size=batch_size,
-        num_workers=num_workers,
+        num_workers=effective_num_workers,
         pin_memory=device.type == "cuda",
     )
 
@@ -172,6 +196,7 @@ def train_pytorch(
         logger.info("Using ParquetIterableDataset for streaming data loading")
         logger.info(f"Training data: {train_rows} rows, {n_features} features (streaming)")
         logger.info(f"Validation data: {val_rows} rows (streaming)")
+        logger.info(f"DataLoader workers: requested={num_workers}, effective={effective_num_workers}")
 
     # ============================================
     # Step 6: Setup mixed precision
diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md
@@ -1,6 +1,6 @@
 # Code Index: tests
 
-> Generated on 2026-03-02 21:24:57
+> Generated on 2026-03-02 21:25:06
 
 Test suite structure and test case documentation.
 
@@ -147,6 +147,16 @@ Unit tests for pipeline_runner feature name resolution.
 - `test_resolve_feature_names_falls_back_on_mismatch()` - Returns generic names when resolved names don't match output count.
 - `test_resolve_feature_names_falls_back_when_unavailable()` - Returns generic names when no get_feature_names_out is available.
 
+---
+## `unit/templates/training/test_train_pytorch_worker_fallback.py`
+Unit tests for PyTorch DataLoader worker fallback behavior.
+
+**Functions:**
+- `test_resolve_num_workers_zero_is_unchanged() -> None` - Requested zero workers should remain zero.
+- `test_resolve_num_workers_falls_back_on_darwin_spawn(monkeypatch) -> None` - On macOS spawn, requested workers should fall back to zero.
+- `test_resolve_num_workers_uses_context_when_start_method_is_none(monkeypatch) -> None` - When get_start_method returns None, context start method should be used.
+- `test_resolve_num_workers_kept_on_non_darwin_spawn(monkeypatch) -> None` - Spawn on non-macOS should keep the requested worker count.
+
 ---
 ## `unit/test_config.py`
 Unit tests for config helpers.
diff --git a/tests/unit/templates/training/test_train_pytorch_worker_fallback.py b/tests/unit/templates/training/test_train_pytorch_worker_fallback.py
@@ -0,0 +1,40 @@
+"""Unit tests for PyTorch DataLoader worker fallback behavior."""
+
+import pytest
+
+pytest.importorskip("torch")
+
+from plexe.templates.training import train_pytorch
+
+
+def test_resolve_num_workers_zero_is_unchanged() -> None:
+    """Requested zero workers should remain zero."""
+    assert train_pytorch._resolve_num_workers(0) == 0  # no patching here: the call passes 0 and expects 0 back
+
+
+def test_resolve_num_workers_falls_back_on_darwin_spawn(monkeypatch) -> None:
+    """On macOS spawn, requested workers should fall back to zero."""
+    monkeypatch.setattr(train_pytorch.sys, "platform", "darwin")  # sets the attribute on the sys object reached via train_pytorch
+    monkeypatch.setattr(train_pytorch.mp, "get_start_method", lambda allow_none=True: "spawn")  # stub always reports "spawn"
+    assert train_pytorch._resolve_num_workers(4) == 0  # darwin + spawn -> request of 4 is dropped to 0
+
+
+def test_resolve_num_workers_uses_context_when_start_method_is_none(monkeypatch) -> None:
+    """When get_start_method returns None, context start method should be used."""
+
+    class _Context:  # minimal stand-in for the object returned by mp.get_context()
+        @staticmethod
+        def get_start_method() -> str:
+            return "spawn"
+
+    monkeypatch.setattr(train_pytorch.sys, "platform", "darwin")
+    monkeypatch.setattr(train_pytorch.mp, "get_start_method", lambda allow_none=True: None)  # stub always returns None
+    monkeypatch.setattr(train_pytorch.mp, "get_context", lambda: _Context())
+    assert train_pytorch._resolve_num_workers(2) == 0  # expects 0 with get_start_method -> None and the stub context -> "spawn"
+
+
+def test_resolve_num_workers_kept_on_non_darwin_spawn(monkeypatch) -> None:
+    """Spawn on non-macOS should keep the requested worker count."""
+    monkeypatch.setattr(train_pytorch.sys, "platform", "linux")  # any value other than "darwin" skips the fallback check
+    monkeypatch.setattr(train_pytorch.mp, "get_start_method", lambda allow_none=True: "spawn")
+    assert train_pytorch._resolve_num_workers(3) == 3  # request is returned unchanged