Skip to content

Commit 5cf880d

Browse files
committed
Implement Inline Script Execution in Kubeflow Executor
Add support for inline script execution in the KubeflowExecutor using the SDK's function-argument-injection style. This change allows users to pass scripts directly as inline parameters, enhancing flexibility in task execution.

Key changes include:
- Introduced the `_nemo_inline_entry_params` function for handling inline script execution.
- Updated the `create_trainjob` and `submit` methods to support inline scripts.
- Enhanced logging for better tracking of execution modes.
- Improved Kubernetes runtime management, enabling reuse of a ClusterTrainingRuntime across experiments with similar configurations.

Signed-off-by: Krishnaswamy Subramanian <subramk@thoughtworks.com>
1 parent 22a8e11 commit 5cf880d

File tree

8 files changed

+625
-314
lines changed

8 files changed

+625
-314
lines changed

nemo_run/core/execution/kubeflow.py

Lines changed: 266 additions & 157 deletions
Large diffs are not rendered by default.
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
apiVersion: trainer.kubeflow.org/v1alpha1
2+
kind: ClusterTrainingRuntime
3+
metadata:
4+
name: {{ runtime_name }}
5+
namespace: {{ namespace }}
6+
spec:
7+
mlPolicy:
8+
numNodes: {{ nodes }}
9+
torch:
10+
numProcPerNode: "auto"
11+
template:
12+
spec:
13+
replicatedJobs:
14+
- name: node
15+
replicas: {{ nodes }}
16+
template:
17+
metadata:
18+
labels:
19+
trainer.kubeflow.org/trainjob-ancestor-step: trainer
20+
spec:
21+
template:
22+
spec:
23+
volumes:
24+
- name: workspace
25+
configMap:
26+
name: {{ configmap_name }}
27+
containers:
28+
- name: node
29+
image: {{ image }}
30+
volumeMounts:
31+
- name: workspace
32+
mountPath: {{ volume_mount_path }}
33+
resources:
34+
requests: {}
35+
limits:
36+
{% if cpu_limit %}cpu: {{ cpu_limit }}{% endif %}
37+
{% if memory_limit %}memory: {{ memory_limit }}{% endif %}
38+
{% if gpus %}"nvidia.com/gpu": {{ gpus }}{% endif %}

nemo_run/core/packaging/configmap.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class ConfigMapPackager(Packager):
4141
relative_path: str | List[str] = "."
4242
namespace: str = "default"
4343
configmap_prefix: str = "nemo-workspace"
44+
configmap_id: Optional[str] = None # Reusable configmap identifier
4445

4546
def __post_init__(self):
4647
"""Initialize the Kubernetes client."""
@@ -111,11 +112,12 @@ def package(self, path: Path, job_dir: str, name: str) -> str:
111112
Returns:
112113
The name of the created ConfigMap (or intended name if not created)
113114
"""
115+
# Resolve the final ConfigMap name centrally
116+
configmap_name = self.resolve_configmap_name(name)
117+
114118
if self.v1 is None:
115119
logger.warning("Kubernetes client not available, skipping ConfigMap creation")
116-
return f"{self.configmap_prefix}-{name}"
117-
118-
configmap_name = f"{self.configmap_prefix}-{name}"
120+
return configmap_name
119121
files_to_stage = self._find_files_to_package(path)
120122
if not files_to_stage:
121123
logger.warning("No files found to package into ConfigMap")
@@ -165,6 +167,18 @@ def package(self, path: Path, job_dir: str, name: str) -> str:
165167
logger.error(f"Failed to create ConfigMap {configmap_name}: {e}")
166168
return configmap_name
167169

170+
def resolve_configmap_name(self, name: str) -> str:
171+
"""
172+
Resolve the full ConfigMap name from a caller-provided suffix.
173+
174+
Centralizes naming logic so callers never assemble full names.
175+
If configmap_id is set, it takes precedence and is sanitized.
176+
Otherwise, returns "{configmap_prefix}-{name}".
177+
"""
178+
if self.configmap_id:
179+
return f"{self.configmap_prefix}-{sanitize_kubernetes_name(self.configmap_id)}"
180+
return f"{self.configmap_prefix}-{name}"
181+
168182
def _find_files_to_package(self, base_path: Path) -> List[Path]:
169183
"""
170184
Find files to package based on include_pattern and relative_path.
@@ -198,7 +212,8 @@ def cleanup(self, name: str) -> None:
198212
"""
199213
if self.v1 is None:
200214
return
201-
configmap_name = f"{self.configmap_prefix}-{name}"
215+
# Use the same resolution logic as in package()
216+
configmap_name = self.resolve_configmap_name(name)
202217
try:
203218
self.v1.delete_namespaced_config_map(name=configmap_name, namespace=self.namespace)
204219
logger.info(f"Cleaned up ConfigMap: {configmap_name}")

nemo_run/run/experiment.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ def __init__(
342342

343343
self.log_level = log_level
344344
self._runner = get_runner(component_defaults=None, experiment=self)
345+
self._detach_mode = False # Will be set in _run_dag
345346

346347
if not _reconstruct:
347348
self.executor = executor if executor else LocalExecutor()
@@ -471,6 +472,23 @@ def _add_single_job(
471472
task_dir=name if reuse_job_dir else task_id,
472473
)
473474

475+
# Set detach mode on executor if supported
476+
if hasattr(self, "detach") and hasattr(executor, "set_detach_mode"):
477+
set_detach_mode = getattr(executor, "set_detach_mode", None)
478+
if set_detach_mode:
479+
self.console.log(
480+
f"Setting detach mode to {self.detach} on executor {type(executor).__name__}"
481+
)
482+
set_detach_mode(self.detach)
483+
else:
484+
self.console.log(
485+
f"Executor {type(executor).__name__} doesn't support set_detach_mode"
486+
)
487+
else:
488+
self.console.log(
489+
f"Experiment detach mode: {getattr(self, 'detach', 'not set')}, Executor has set_detach_mode: {hasattr(executor, 'set_detach_mode')}"
490+
)
491+
474492
cloned = copy.deepcopy(task) if isinstance(task, Script) else task.clone()
475493
job = Job(
476494
id=task_id,
@@ -783,6 +801,12 @@ def _run_dag(self, detach: bool, tail_logs: bool, executors: set[Executor]):
783801
)
784802
wait = False
785803
self.detach = detach
804+
self._detach_mode = detach
805+
806+
# Create a new runner with detach mode for this execution
807+
from nemo_run.run.torchx_backend.runner import get_runner
808+
809+
self._runner = get_runner(component_defaults=None, detach_mode=detach)
786810

787811
for level in order:
788812
# Launch jobs in this level concurrently since they are independent

nemo_run/run/torchx_backend/runner.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle:
112112

113113
def get_runner(
114114
component_defaults: Optional[dict[str, dict[str, str]]] = None,
115+
detach_mode: bool = False,
115116
**scheduler_params: Any,
116117
) -> Runner:
117118
"""
@@ -144,5 +145,9 @@ def get_runner(
144145
"""
145146
name = "nemo_run"
146147

148+
# Add detach_mode to scheduler_params for kubeflow scheduler
149+
if detach_mode:
150+
scheduler_params["detach_mode"] = detach_mode
151+
147152
scheduler_factories = get_scheduler_factories()
148153
return Runner(name, scheduler_factories, component_defaults, scheduler_params=scheduler_params)

nemo_run/run/torchx_backend/schedulers/kubeflow.py

Lines changed: 62 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
from nemo_run.core.execution.base import Executor
2828
from nemo_run.core.execution.kubeflow import KubeflowExecutor
29+
from nemo_run.core.packaging.configmap import ConfigMapPackager
2930
from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin
3031

3132
logger = logging.getLogger(__name__)
@@ -43,11 +44,13 @@ def __init__(
4344
self,
4445
session_name: str,
4546
namespace: str = "default",
47+
detach_mode: bool = False,
4648
**kwargs: Any,
4749
) -> None:
4850
self.backend = "kubeflow"
4951
self.session_name = session_name
5052
self.namespace = namespace
53+
self.detach_mode = detach_mode
5154
self._apps: dict[str, dict[str, Any]] = {}
5255

5356
def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[dict[str, Any]]:
@@ -60,23 +63,47 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[dict[str,
6063
job_config = self._appdef_to_kubeflow_config(app, cfg)
6164

6265
return AppDryRunInfo(
63-
app_id=f"kubeflow://{self.session_name}/{app.name}",
64-
app=app,
65-
request=job_config,
66-
repr=f"Kubeflow job: {app.name}",
66+
job_config,
67+
lambda _: f"Kubeflow job: {app.name}",
6768
)
6869

6970
def schedule(self, dryrun_info: AppDryRunInfo[dict[str, Any]]) -> str:
7071
"""Submit the job to Kubeflow."""
71-
app = dryrun_info.app
72-
cfg = dryrun_info.request["executor"]
72+
job_config = dryrun_info.request
73+
cfg = job_config["executor"]
7374

7475
# Create the TrainJob using KubeflowExecutor
75-
job_id = cfg.create_trainjob(app.name)
76+
# Extract the task from the app definition
77+
app = job_config["app"]
78+
task = None
79+
80+
# Try to extract task from the app roles
81+
if app.roles and len(app.roles) > 0:
82+
main_role = app.roles[0]
83+
if main_role.args:
84+
# Create a simple task object for the executor
85+
from nemo_run.config import Script
86+
87+
task = Script(inline=" ".join(main_role.args))
88+
89+
if task is None:
90+
# Create a default task if none found
91+
from nemo_run.config import Script
92+
93+
task = Script(inline="echo 'No task specified'")
94+
95+
# Stage files via ConfigMap if configured
96+
try:
97+
if isinstance(cfg.packager, ConfigMapPackager):
98+
cfg.stage_files(cfg.default_task_dir, task)
99+
except Exception as e:
100+
logger.error(f"Failed to stage files via ConfigMapPackager: {e}")
101+
102+
job_id = cfg.create_trainjob(job_config["app"].name, task)
76103

77104
# Store job info for later reference
78105
self._apps[job_id] = {
79-
"app": app,
106+
"app": job_config["app"],
80107
"executor": cfg,
81108
"job_id": job_id,
82109
"state": AppState.SUBMITTED,
@@ -103,7 +130,7 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
103130
state=app_state,
104131
num_restarts=0, # Kubeflow handles restarts internally
105132
msg=f"Kubeflow job status: {status}",
106-
structured_error_msg=None,
133+
structured_error_msg="",
107134
roles_statuses=[],
108135
)
109136
except Exception as e:
@@ -166,12 +193,10 @@ def _appdef_to_kubeflow_config(self, app: AppDef, cfg: KubeflowExecutor) -> dict
166193
# If we have a script with inline content, extract it
167194
if len(main_role.args) >= 2 and main_role.args[0] == "python":
168195
# This is a file-based execution
169-
cfg.python_file = main_role.args[1]
196+
logger.info(f"File-based execution: {main_role.args[1]}")
170197
elif len(main_role.args) >= 2 and main_role.args[0] == "-c":
171198
# This is inline script execution
172-
script_content = main_role.args[1]
173-
# For now, we'll create a temporary file or use a default
174-
cfg.python_file = "inline_script.py"
199+
logger.info("Inline script execution detected")
175200
logger.warning("Inline script execution not fully implemented yet")
176201

177202
return {
@@ -195,15 +220,39 @@ def _map_kubeflow_status_to_torchx(self, kubeflow_status: str) -> AppState:
195220
else:
196221
return AppState.UNKNOWN
197222

223+
def _validate(self, app: AppDef, scheduler: str) -> None:
224+
"""Validate the app definition for Kubeflow."""
225+
# For now, skip validation as Kubeflow handles this internally
226+
pass
227+
228+
def close(self) -> None:
229+
"""Clean up resources when the scheduler is closed."""
230+
# Cancel all running jobs unless in detach mode
231+
for app_id in list(self._apps.keys()):
232+
try:
233+
# Check if scheduler is in detach mode
234+
if self.detach_mode:
235+
logger.info(f"Skipping cleanup for job {app_id} in detach mode")
236+
continue
237+
238+
self.cancel(app_id)
239+
except Exception as e:
240+
logger.error(f"Failed to cancel job {app_id} during close: {e}")
241+
242+
# Clear the apps dictionary
243+
self._apps.clear()
244+
198245

199246
def create_scheduler(
200247
session_name: str,
201248
namespace: str = "default",
249+
detach_mode: bool = False,
202250
**kwargs: Any,
203251
) -> KubeflowScheduler:
204252
"""Create a Kubeflow scheduler instance."""
205253
return KubeflowScheduler(
206254
session_name=session_name,
207255
namespace=namespace,
256+
detach_mode=detach_mode,
208257
**kwargs,
209258
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ skypilot = "nemo_run.run.torchx_backend.schedulers.skypilot:create_scheduler"
4949
local_persistent = "nemo_run.run.torchx_backend.schedulers.local:create_scheduler"
5050
docker_persistent = "nemo_run.run.torchx_backend.schedulers.docker:create_scheduler"
5151
dgx_cloud = "nemo_run.run.torchx_backend.schedulers.dgxcloud:create_scheduler"
52-
lepton = "nemo_run.run.torchx_backend.schedulers.lepton:create_scheduler"
5352
kubeflow = "nemo_run.run.torchx_backend.schedulers.kubeflow:create_scheduler"
53+
lepton = "nemo_run.run.torchx_backend.schedulers.lepton:create_scheduler"
5454

5555
[project.optional-dependencies]
5656
skypilot = ["skypilot[kubernetes]>=0.10.0"]

0 commit comments

Comments (0)