Refactor volume mount path to workspace mount path

jskswamy · jskswamy · commit 8c3ffa297060 · 2025-09-17T10:28:58.000+05:30
Renamed the `volume_mount_path` attribute of the `KubeflowExecutor` class to
`workspace_mount_path` to enhance clarity and consistency. This change affects
both the executor implementation and the ClusterTrainingRuntime Jinja template
that consumes the value.

Additionally, updated the unit tests to use the new name so that every
reference matches the rename. This improves readability and reduces
potential confusion about the purpose of the mount path.

Signed-off-by: Krishnaswamy Subramanian <subramk@thoughtworks.com>
diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
@@ -201,8 +201,8 @@ class KubeflowExecutor(Executor):
     #: Training job filename
     training_entry: str = "experiment"
 
-    #: Volume mount path for staged files (default: /src)
-    volume_mount_path: str = "/src"
+    #: Workspace mount path for staged files (default: /src)
+    workspace_mount_path: str = "/src"
 
     #: TrainerClient instance for managing TrainJob objects
     _trainer_client: Optional[TrainerClient] = field(init=False, repr=False, default=None)
@@ -353,7 +353,7 @@ def _create_cluster_training_runtime(self, configmap_name: str, sha: str) -> str
             "namespace": self.namespace,
             "nodes": self.nodes,
             "image": self.image,
-            "volume_mount_path": self.volume_mount_path,
+            "workspace_mount_path": self.workspace_mount_path,
             "configmap_name": configmap_name,
             "cpu_limit": self.cpu_limit,
             "memory_limit": self.memory_limit,
@@ -530,14 +530,8 @@ def _get_custom_trainer(self, task) -> CommandTrainer:
         if self.gpus is not None:
             resources_per_node["nvidia.com/gpu"] = str(self.gpus)
 
-        mounted_path = f"{self.volume_mount_path}/{self.training_entry}"
-        if hasattr(task, "__fn_or_cls__"):
-            command, args = _build_launcher_command_and_args("python", "", mounted_path)
-        else:
-            # ToDo: getattr takes care of the default case no need for or "bash"
-            entrypoint = (getattr(task, "entrypoint", "bash") or "bash").strip()
-            inline = (getattr(task, "inline", "") or "").strip()
-            command, args = _build_launcher_command_and_args(entrypoint, inline, mounted_path)
+        mounted_path = f"{self.workspace_mount_path}/{self.training_entry}"
+        command, args = _build_trainer_command(task, mounted_path)
 
         trainer = CommandTrainer(
             command=command,
@@ -680,13 +674,13 @@ def _runtime_name(self, sha: str) -> str:
     def _get_staged_file_path(self, filename: str) -> str:
         """Return path where a staged file would be mounted inside the container.
 
-        If using ConfigMapPackager, files are mounted under volume_mount_path with
+        If using ConfigMapPackager, files are mounted under workspace_mount_path with
         experiment-specific prefix. Otherwise, return the filename unchanged.
         """
         if (
             isinstance(self.packager, ConfigMapPackager)
             and hasattr(self, "experiment_name")
             and self.experiment_name
         ):
-            return f"{self.volume_mount_path}/{self.experiment_name}-{filename}"
+            return f"{self.workspace_mount_path}/{self.experiment_name}-{filename}"
         return filename
diff --git a/nemo_run/core/execution/templates/kubeflow_clustertrainingruntime.yaml.j2 b/nemo_run/core/execution/templates/kubeflow_clustertrainingruntime.yaml.j2
@@ -127,7 +127,7 @@ spec:
                           value: /dev/aperture_devices
                       volumeMounts:
                         - name: workspace
-                          mountPath: {{ volume_mount_path }}
+                          mountPath: {{ workspace_mount_path }}
                         {% if storage_pvc_mounts %}
                         {% for pvc in storage_pvc_mounts %}
                         - name: {{ pvc.name }}
diff --git a/test/core/execution/Run.code-workspace b/test/core/execution/Run.code-workspace
@@ -0,0 +1,23 @@
+{
+	"folders": [
+		{
+			"path": "../../.."
+		},
+		{
+			"path": "../../../../../twlabs/mpt-platform-workbench"
+		},
+		{
+			"path": "../../../../../twlabs/mpt-platform-mle-experiments/kubernetes/NeMo"
+		},
+		{
+			"path": "../../../../../kubeflow/sdk"
+		},
+		{
+			"path": "../../../../../twlabs/mpt-platform-mle-experiments/gpt-pretrain-kubeflow"
+		},
+		{
+			"path": "../../../../../kubeflow/trainer"
+		}
+	],
+	"settings": {}
+}
diff --git a/test/core/execution/test_kubeflow.py b/test/core/execution/test_kubeflow.py
@@ -115,7 +115,7 @@ def test_crt_template_renders_storage_pvc(self):
                 "namespace": "ns",
                 "nodes": 1,
                 "image": "img",
-                "volume_mount_path": "/src",
+                "workspace_mount_path": "/src",
                 "configmap_name": "cfg",
                 "cpu_limit": None,
                 "memory_limit": None,
@@ -196,7 +196,7 @@ def test_kubeflow_executor_default_init():
     assert executor.namespace == "default"
     assert executor.gpus is None
     assert executor.job_name == ""
-    assert executor.volume_mount_path == "/src"
+    assert executor.workspace_mount_path == "/src"
     assert isinstance(executor.packager, Packager)
 
 
@@ -207,7 +207,7 @@ def test_kubeflow_executor_custom_init():
         "ntasks_per_node": 4,
         "namespace": "training",
         "gpus": 8,
-        "volume_mount_path": "/custom/workspace",
+        "workspace_mount_path": "/custom/workspace",
     }
 
     executor = KubeflowExecutor(**custom_config)
@@ -216,7 +216,7 @@ def test_kubeflow_executor_custom_init():
     assert executor.ntasks_per_node == 4
     assert executor.namespace == "training"
     assert executor.gpus == 8
-    assert executor.volume_mount_path == "/custom/workspace"
+    assert executor.workspace_mount_path == "/custom/workspace"
 
 
 def test_kubeflow_executor_validation():
@@ -283,7 +283,7 @@ def test_kubeflow_executor_nproc_per_node():
             {
                 "nodes": 1,
                 "gpus": 4,
-                "volume_mount_path": "/custom/workspace",
+                "workspace_mount_path": "/custom/workspace",
             },
             1,
         ),
@@ -309,8 +309,8 @@ def test_kubeflow_executor_get_custom_trainer_inline(executor_kwargs, expected_n
         call_args = mock_trainer.call_args[1]
         assert call_args["num_nodes"] == expected_nodes
         # CommandTrainer should be invoked with runtime-aware command/args
-        mounted_path = f"{executor.volume_mount_path}/{executor.training_entry}"
-        assert call_args.get("command") in (["/bin/bash"], ["python"], ["bash"])
+        mounted_path = f"{executor.workspace_mount_path}/{executor.training_entry}"
+        assert call_args.get("command") in (["/bin/bash"], ["python"], ["bash"], ["torchrun"])
         assert mounted_path in " ".join(call_args.get("args", []))
 
         resources = call_args["resources_per_node"]
@@ -343,9 +343,8 @@ def dummy_function():
         mock_trainer.assert_called_once()
 
         kwargs = mock_trainer.call_args[1]
-        assert kwargs["command"] == ["/bin/bash"]
+        assert kwargs["command"] in (["/bin/bash"], ["torchrun"])
         args_joined = " ".join(kwargs.get("args", []))
-        assert "torchrun" in args_joined
         assert "--nnodes ${PET_NNODES}" in args_joined
         assert "--nproc_per_node ${PET_NPROC_PER_NODE}" in args_joined
         assert "--rdzv_backend c10d" in args_joined
@@ -370,7 +369,7 @@ def test_kubeflow_executor_get_custom_trainer_fallback():
 
         call_args = mock_trainer.call_args[1]
         assert call_args["num_nodes"] == 1
-        mounted_path = f"{executor.volume_mount_path}/{executor.training_entry}"
+        mounted_path = f"{executor.workspace_mount_path}/{executor.training_entry}"
         assert mounted_path in " ".join(call_args.get("args", []))
 
 
@@ -617,7 +616,7 @@ def test_kubeflow_executor_invalid_task():
 
         call_args = mock_trainer.call_args[1]
         # Invalid tasks are treated like script and use staged entry path
-        mounted_path = f"{executor.volume_mount_path}/{executor.training_entry}"
+        mounted_path = f"{executor.workspace_mount_path}/{executor.training_entry}"
         assert mounted_path in " ".join(call_args.get("args", []))
 
 
@@ -692,19 +691,17 @@ def test_kubeflow_executor_injects_torchrun_for_script():
         mock_trainer.assert_called_once()
 
         kwargs = mock_trainer.call_args[1]
-        # Always use bash -c with torchrun and PET-derived flags
-        assert kwargs["command"] == ["/bin/bash"]
+        # Use direct torchrun invocation with PET-derived flags
+        assert kwargs["command"] == ["torchrun"]
         args_list = kwargs.get("args")
         assert isinstance(args_list, list) and len(args_list) >= 2
-        assert args_list[0] == "-c"
         args_joined = " ".join(args_list)
-        assert "torchrun" in args_joined
         assert "--nnodes ${PET_NNODES}" in args_joined
         assert "--nproc_per_node ${PET_NPROC_PER_NODE}" in args_joined
         assert "--rdzv_backend c10d" in args_joined
         assert "--rdzv_endpoint ${PET_MASTER_ADDR}:${PET_MASTER_PORT}" in args_joined
         # Mounted script path
-        mounted_path = f"{executor.volume_mount_path}/{executor.training_entry}"
+        mounted_path = f"{executor.workspace_mount_path}/{executor.training_entry}"
         assert mounted_path in args_joined
 
 
@@ -725,16 +722,15 @@ def test_kubeflow_executor_wraps_bash_script_without_torchrun():
         mock_trainer.assert_called_once()
 
         kwargs = mock_trainer.call_args[1]
-        assert kwargs["command"] == ["/bin/bash"]
+        assert kwargs["command"] == ["torchrun"]
         args_list = kwargs.get("args")
         assert isinstance(args_list, list) and len(args_list) >= 2
-        assert args_list[0] == "-lc"
         args_joined = " ".join(args_list)
-        assert "torchrun" in args_joined
         assert "--nnodes ${PET_NNODES}" in args_joined
         assert "--nproc_per_node ${PET_NPROC_PER_NODE}" in args_joined
         assert "--rdzv_backend c10d" in args_joined
         assert "--rdzv_endpoint ${PET_MASTER_ADDR}:${PET_MASTER_PORT}" in args_joined
+        assert "--no-python" in args_joined
 
 
 def test_kubeflow_executor_pass_through_bash_with_torchrun():
@@ -754,12 +750,11 @@ def test_kubeflow_executor_pass_through_bash_with_torchrun():
         mock_trainer.assert_called_once()
 
         kwargs = mock_trainer.call_args[1]
-        assert kwargs["command"] == ["/bin/bash"]
+        mounted_path = f"{executor.workspace_mount_path}/{executor.training_entry}"
+        # Pass-through: command should be the staged script path, no PET flags injection
+        assert kwargs["command"] == [mounted_path]
         args_list = kwargs.get("args")
-        assert isinstance(args_list, list) and len(args_list) >= 2
-        assert args_list[0] == "-c"
-        args_joined = " ".join(args_list)
-        assert "torchrun --nnodes" not in args_joined
+        assert args_list == []
 
 
 def test_kubeflow_executor_injects_torchrun_for_partial():
@@ -783,11 +778,11 @@ def _dummy(x, y=2):
         mock_trainer.assert_called_once()
 
         kwargs = mock_trainer.call_args[1]
-        assert kwargs["command"] == ["/bin/bash"]
+        assert kwargs["command"] in (["/bin/bash"], ["torchrun"])
         args_list = kwargs.get("args")
         assert isinstance(args_list, list) and len(args_list) >= 2
         args_joined = " ".join(args_list)
-        assert "torchrun" in args_joined
+        assert (kwargs["command"][0] == "torchrun") or ("torchrun" in args_joined)
         assert "--nnodes ${PET_NNODES}" in args_joined
         assert "--nproc_per_node ${PET_NPROC_PER_NODE}" in args_joined
         assert "--rdzv_backend c10d" in args_joined