
Commit 6a174f4

fix(tuner): Include sm_drivers channel in HyperparameterTuner jobs (#5634)
* fix(tuner): Include sm_drivers channel in HyperparameterTuner jobs

  When ModelTrainer has distributed=Torchrun(), the sm_drivers channel
  contains torchrun_driver.py and sm_train.sh, which are required for
  multi-GPU execution. The tuner was not building this channel, causing the
  framework container to fall back to the legacy single-GPU entry point
  (python train.py) instead of torchrun. This caused a tensor size mismatch
  (batch_size vs accumulated_batch) in TRL's compute_loss when
  gradient_accumulation_steps > 1, because the single-process path doesn't
  partition batches across ranks.

  Fix: Replace _upload_source_code_and_configure_hyperparameters with
  _build_driver_and_code_channels, which replicates ModelTrainer's channel
  building logic (sm_drivers, code, distributed.json, sourcecode.json,
  sm_train.sh). Also pass through environment and VPC config.

* fix(tuner): Harden _build_training_job_definition against missing attributes

  - Use getattr with a fallback for static_hyperparameters (fixes
    test_build_training_job_definition_includes_internal_channels)
  - Guard _prepare_model_trainer_for_tuning with an isinstance check on
    entry_script to avoid calling _build_driver_and_code_channels on
    MagicMock model trainers
  - Guard environment passthrough with an isinstance(env, dict) check
  - Guard VPC config passthrough with try/except for mock safety

* fix(test): Rewrite tuner distributed integ test to match CI patterns

  - Use the sagemaker_session fixture from conftest (auto-resolves role/region)
  - Use an ml.m5.xlarge CPU instance (cheaper, available in CI)
  - Remove the hardcoded role ARN and training_mode
  - Remove @pytest.mark.slow (not registered in CI config)
  - Use a module-level function instead of a class (matches other integ tests)
  - Use DEFAULT_CPU_IMAGE, consistent with test_model_trainer.py

* fix(tuner): Upload sourcedir.tar.gz for framework container compatibility

  The HPT API uses the legacy framework container path, which expects
  sagemaker_submit_directory (a tar.gz on S3) to be downloaded and extracted
  to /opt/ml/code/. The previous approach of using a 'code' input channel
  mounted the code at /opt/ml/input/data/code/ instead, causing 'No such
  file or directory' errors.

  Fix: Create and upload sourcedir.tar.gz to S3, and set both the
  sagemaker_program and sagemaker_submit_directory hyperparameters. Remove
  the separate 'code' input channel, since the framework container handles
  code extraction via sagemaker_submit_directory.

* test(tuner): Add unit tests for driver/code channel building

  Add 25 unit tests covering the tuner changes from PR #5634:

  - _prepare_model_trainer_for_tuning guard logic
  - _build_driver_and_code_channels sm_drivers channel creation
  - _build_training_job_definition _tuner_channels inclusion
  - Environment and VPC config passthrough
  - sourcedir.tar.gz upload and the sagemaker_submit_directory HP
  - static_hyperparameters getattr fallback
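The tensor size mismatch described in the first commit can be illustrated with simple arithmetic (a hedged sketch; the helper name and the numbers are illustrative, not TRL's actual code):

```python
# Illustrative sketch (not TRL's actual code): why falling back to a single
# process breaks loss computation when gradient_accumulation_steps > 1.
# Under torchrun, each of `world_size` ranks sees batch_size / world_size
# samples per step; the single-process fallback feeds the whole batch to a
# code path that still assumes the per-rank micro-batch size.

def per_rank_micro_batch(batch_size: int, world_size: int,
                         gradient_accumulation_steps: int) -> int:
    """Samples each rank processes per forward pass (assumes even division)."""
    return batch_size // world_size // gradient_accumulation_steps

# With the sm_drivers channel present, torchrun launches e.g. 4 ranks:
assert per_rank_micro_batch(32, world_size=4, gradient_accumulation_steps=2) == 4

# Without it, the container runs `python train.py` (world_size == 1), so the
# per-step batch is 4x larger than the multi-GPU path assumed:
assert per_rank_micro_batch(32, world_size=1, gradient_accumulation_steps=2) == 16
```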
1 parent 10df8a4 commit 6a174f4

File tree

3 files changed (+801, -60 lines)

sagemaker-train/src/sagemaker/train/tuner.py

Lines changed: 125 additions & 60 deletions
@@ -444,96 +444,140 @@ def _prepare_auto_parameters(self, static_hyperparameters, hyperparameters_to_ke
 
     @classmethod
     def _prepare_model_trainer_for_tuning(cls, model_trainer, inputs=None, job_name=None, **kwargs):
-        """Prepare ModelTrainer before tuning by uploading source code and configuring hyperparameters.
+        """Prepare ModelTrainer before tuning by building sm_drivers and code channels.
 
-        This method mimics V2's _prepare_estimator_for_tuning() pattern, adapted for V3's
-        ModelTrainer architecture. It ensures that script mode hyperparameters are set before
-        the tuning job is created, which framework containers (PyTorch, TensorFlow) require.
+        This method replicates the channel-building logic from ModelTrainer._create_training_job()
+        to ensure the sm_drivers channel (containing torchrun_driver.py, distributed config, and
+        sm_train.sh) is included in the tuning job definition. Without this, the framework
+        container falls back to the legacy entry point (python train.py) instead of using the
+        V3 driver (torchrun), breaking distributed training.
 
         Args:
             model_trainer: ModelTrainer instance to prepare
             inputs: Training inputs (unused, for V2 compatibility)
             job_name: Job name (unused, for V2 compatibility)
             **kwargs: Additional arguments (unused, for V2 compatibility)
         """
-        # Only proceed if source_code is configured
-        if hasattr(model_trainer, "source_code") and model_trainer.source_code is not None:
-            cls._upload_source_code_and_configure_hyperparameters(model_trainer)
+        source_code = getattr(model_trainer, "source_code", None)
+        if source_code is None:
+            return
+        # Only proceed if source_code has a real entry_script string
+        entry_script = getattr(source_code, "entry_script", None)
+        if not isinstance(entry_script, str):
+            return
 
-    @classmethod
-    def _upload_source_code_and_configure_hyperparameters(cls, model_trainer):
-        """Upload source code to S3 and add script mode hyperparameters.
+        cls._build_driver_and_code_channels(model_trainer)
 
-        Framework containers (PyTorch, TensorFlow) expect sagemaker_program and
-        sagemaker_submit_directory hyperparameters for script mode execution. This method:
-        1. Checks if source_dir is a local path or S3 URI
-        2. Creates a tar.gz archive and uploads to S3
-        3. Adds required script mode hyperparameters to model_trainer.hyperparameters
+    @classmethod
+    def _build_driver_and_code_channels(cls, model_trainer):
+        """Build sm_drivers and code input channels for the tuning job.
 
-        This follows V2's pattern of creating sourcedir.tar.gz files.
+        Replicates the channel-building logic from ModelTrainer._create_training_job()
+        so that the tuning job gets the same execution environment as a standalone
+        training job (distributed drivers, source code, train script).
 
         Args:
             model_trainer: ModelTrainer instance with source_code configured
         """
+        import json
         import os
-        import tarfile
-        import tempfile
+        import shutil
         import time
+        from tempfile import TemporaryDirectory
+
+        from sagemaker.train.constants import (
+            SM_CODE,
+            SM_DRIVERS,
+            SM_DRIVERS_LOCAL_PATH,
+            DEFAULT_CONTAINER_ENTRYPOINT,
+            DEFAULT_CONTAINER_ARGUMENTS,
+        )
 
         source_code = model_trainer.source_code
+        base_name = model_trainer.base_job_name or "tuning"
+        key_prefix = f"{base_name}/tuning-{int(time.time())}/input"
+
+        # Build sm_drivers channel (same as ModelTrainer._create_training_job)
+        temp_dir = TemporaryDirectory()
+        shutil.copytree(SM_DRIVERS_LOCAL_PATH, temp_dir.name, dirs_exist_ok=True)
+
+        # If distributed config is set, copy distributed drivers
+        if model_trainer.distributed:
+            driver_dir = os.path.join(temp_dir.name, "distributed_drivers")
+            shutil.copytree(model_trainer.distributed.driver_dir, driver_dir, dirs_exist_ok=True)
+
+        # Write sourcecode.json
+        source_code_json_path = os.path.join(temp_dir.name, "sourcecode.json")
+        with open(source_code_json_path, "w") as f:
+            dump = source_code.model_dump() if source_code else {}
+            f.write(json.dumps(dump))
+
+        # Write distributed.json
+        distributed_json_path = os.path.join(temp_dir.name, "distributed.json")
+        with open(distributed_json_path, "w") as f:
+            dump = model_trainer.distributed.model_dump() if model_trainer.distributed else {}
+            f.write(json.dumps(dump))
+
+        # Prepare the train script (sm_train.sh)
+        model_trainer._prepare_train_script(
+            tmp_dir=temp_dir,
+            source_code=source_code,
+            distributed=model_trainer.distributed,
+        )
+
+        # Upload sm_drivers channel
+        sm_drivers_channel = model_trainer.create_input_data_channel(
+            channel_name=SM_DRIVERS,
+            data_source=temp_dir.name,
+            key_prefix=key_prefix,
+            ignore_patterns=source_code.ignore_patterns,
+        )
 
-        # Get source directory and entry script
-        source_dir = source_code.source_dir
-        entry_script = source_code.entry_script
+        # Store channels on model_trainer so _build_training_job_definition can pick them up
+        model_trainer._tuner_channels = [sm_drivers_channel]
 
-        # Check if already an S3 URI
-        if _is_valid_s3_uri(source_dir):
-            # Already uploaded, use as-is
-            source_s3_uri = source_dir
-        else:
-            # Local directory - need to create tar.gz and upload
-            session = model_trainer.sagemaker_session
-            bucket = session.default_bucket()
+        # Set script mode hyperparameters required by framework containers.
+        # The framework container (PyTorch, TF) uses sagemaker_program to find the entry script
+        # and sagemaker_submit_directory to download source code to /opt/ml/code/.
+        if model_trainer.hyperparameters is None:
+            model_trainer.hyperparameters = {}
+        model_trainer.hyperparameters["sagemaker_program"] = source_code.entry_script
 
-            # Generate S3 key
-            timestamp = int(time.time())
-            s3_key = (
-                f"{model_trainer.base_job_name or 'source'}/source-{timestamp}/sourcedir.tar.gz"
-            )
+        # Upload sourcedir.tar.gz for the legacy framework container path.
+        # The HPT API doesn't support container_entrypoint, so the framework container
+        # uses sagemaker_submit_directory to download and extract code to /opt/ml/code/.
+        if source_code.source_dir and not _is_valid_s3_uri(source_code.source_dir):
+            import tarfile
+            import tempfile
 
-            # Create tar.gz file
-            with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp_file:
-                tar_path = tmp_file.name
+            session = model_trainer.sagemaker_session
+            bucket = session.default_bucket()
+            s3_key = f"{key_prefix}/sourcedir/sourcedir.tar.gz"
 
+            with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
+                tar_path = tmp.name
             try:
-                # Create tar.gz archive
                 with tarfile.open(tar_path, "w:gz") as tar:
-                    # Add all files from source_dir
-                    for root, dirs, files in os.walk(source_dir):
-                        for file in files:
-                            file_path = os.path.join(root, file)
-                            # Calculate arcname to preserve directory structure
-                            arcname = os.path.relpath(file_path, source_dir)
-                            tar.add(file_path, arcname=arcname)
-
-                # Upload to S3
-                s3_client = session.boto_session.client("s3", region_name=session.boto_region_name)
+                    for root, _dirs, files in os.walk(source_code.source_dir):
+                        for f in files:
+                            fpath = os.path.join(root, f)
+                            arcname = os.path.relpath(fpath, source_code.source_dir)
+                            tar.add(fpath, arcname=arcname)
+                s3_client = session.boto_session.client(
+                    "s3", region_name=session.boto_region_name
+                )
                 s3_client.upload_file(tar_path, bucket, s3_key)
-
-                # Construct S3 URI
-                source_s3_uri = f"s3://{bucket}/{s3_key}"
+                model_trainer.hyperparameters["sagemaker_submit_directory"] = (
+                    f"s3://{bucket}/{s3_key}"
                )
             finally:
-                # Clean up temp file
                 if os.path.exists(tar_path):
                     os.remove(tar_path)
+        elif source_code.source_dir and _is_valid_s3_uri(source_code.source_dir):
+            model_trainer.hyperparameters["sagemaker_submit_directory"] = source_code.source_dir
 
-        # Initialize hyperparameters dict if None
-        if model_trainer.hyperparameters is None:
-            model_trainer.hyperparameters = {}
-
-        # Add script mode hyperparameters required by framework containers
-        model_trainer.hyperparameters["sagemaker_program"] = entry_script
-        model_trainer.hyperparameters["sagemaker_submit_directory"] = source_s3_uri
+        # Store the temp dir reference to prevent cleanup
+        model_trainer._tuner_temp_dir = temp_dir
 
     @runnable_by_pipeline
     def tune(
@@ -1422,6 +1466,12 @@ def _build_training_job_definition(self, inputs):
             if not any(c.channel_name == channel.channel_name for c in input_data_config):
                 input_data_config.append(channel)
 
+        # Include channels built by _prepare_model_trainer_for_tuning (sm_drivers, code)
+        if hasattr(model_trainer, "_tuner_channels") and model_trainer._tuner_channels:
+            for channel in model_trainer._tuner_channels:
+                if not any(c.channel_name == channel.channel_name for c in input_data_config):
+                    input_data_config.append(channel)
+
         # Build output data config
         output_config = OutputDataConfig(
             s3_output_path=(
@@ -1459,7 +1509,22 @@ def _build_training_job_definition(self, inputs):
             output_data_config=output_config,
             resource_config=resource_config,
             stopping_condition=stopping_condition,
-            static_hyper_parameters=self.static_hyperparameters or {},
+            static_hyper_parameters=getattr(self, "static_hyperparameters", None) or {},
         )
 
+        # Pass through environment variables from model_trainer
+        env = getattr(model_trainer, "environment", None)
+        if env and isinstance(env, dict):
+            definition.environment = env
+
+        # Pass through VPC config from model_trainer
+        networking = getattr(model_trainer, "networking", None)
+        if networking and hasattr(networking, "_to_vpc_config"):
+            try:
+                vpc_config = networking._to_vpc_config()
+                if vpc_config:
+                    definition.vpc_config = vpc_config
+            except Exception:
+                pass
+
         return definition
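The sourcedir.tar.gz packaging this diff introduces can be sketched in isolation (the helper name is illustrative; the real code additionally uploads the archive to S3 via boto3 and sets the sagemaker_submit_directory hyperparameter):

```python
# Standalone sketch of the sourcedir.tar.gz packaging performed in the patch:
# archive every file under source_dir with paths relative to source_dir, so
# the framework container can extract the tree directly into /opt/ml/code/.
import os
import tarfile
import tempfile


def package_sourcedir(source_dir: str) -> str:
    """Create a sourcedir.tar.gz from source_dir; return the archive path."""
    fd, tar_path = tempfile.mkstemp(suffix=".tar.gz")
    os.close(fd)
    with tarfile.open(tar_path, "w:gz") as tar:
        for root, _dirs, files in os.walk(source_dir):
            for name in files:
                fpath = os.path.join(root, name)
                # arcname keeps the path relative to source_dir
                tar.add(fpath, arcname=os.path.relpath(fpath, source_dir))
    return tar_path


# Usage: a source dir containing train.py ends up at the archive root.
with tempfile.TemporaryDirectory() as src:
    with open(os.path.join(src, "train.py"), "w") as f:
        f.write("print('hello')\n")
    archive = package_sourcedir(src)
    with tarfile.open(archive) as tar:
        assert "train.py" in tar.getnames()
    os.remove(archive)
```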
Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""Integration test: HyperparameterTuner with Torchrun distributed training.
+
+Regression test for the bug where HyperparameterTuner dropped the sm_drivers
+channel, causing the container to fall back to single-GPU execution instead
+of using torchrun for multi-GPU distributed training.
+"""
+from __future__ import absolute_import
+
+import os
+import time
+import logging
+
+import pytest
+
+from sagemaker.train.model_trainer import ModelTrainer
+from sagemaker.train.configs import SourceCode, Compute
+from sagemaker.train.distributed import Torchrun
+from sagemaker.train.tuner import HyperparameterTuner
+from sagemaker.core.parameter import ContinuousParameter
+
+logger = logging.getLogger(__name__)
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "../..", "data")
+DEFAULT_CPU_IMAGE = (
+    "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310"
+)
+
+TRAIN_SCRIPT_CONTENT = """\
+import os
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--learning_rate", type=float, default=1e-4)
+    args, _ = parser.parse_known_args()
+
+    world_size = int(os.environ.get("WORLD_SIZE", 1))
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+
+    print(f"DISTRIBUTED_CHECK: world_size={world_size}")
+    print(f"DISTRIBUTED_CHECK: local_rank={local_rank}")
+    print(f"DISTRIBUTED_CHECK: learning_rate={args.learning_rate}")
+
+    # Emit metric for the tuner to parse
+    print(f"eval_loss: 0.42")
+
+
+if __name__ == "__main__":
+    main()
+"""
+
+
+@pytest.fixture(scope="module")
+def train_source_dir(tmp_path_factory):
+    """Create a temp directory with a minimal training script."""
+    d = tmp_path_factory.mktemp("tuner_dist_src")
+    (d / "train.py").write_text(TRAIN_SCRIPT_CONTENT)
+    return str(d)
+
+
+def test_tuner_includes_sm_drivers_channel(sagemaker_session, train_source_dir):
+    """Verify tuning jobs include sm_drivers channel for distributed training.
+
+    Uses a CPU instance with Torchrun to validate that the sm_drivers channel
+    (containing torchrun_driver.py and sm_train.sh) is included in the tuning
+    job definition. The training script logs WORLD_SIZE to confirm the V3
+    driver path is used instead of the legacy framework container fallback.
+    """
+    model_trainer = ModelTrainer(
+        sagemaker_session=sagemaker_session,
+        training_image=DEFAULT_CPU_IMAGE,
+        base_job_name="tuner-dist-test",
+        source_code=SourceCode(
+            source_dir=train_source_dir,
+            entry_script="train.py",
+        ),
+        compute=Compute(
+            instance_type="ml.m5.xlarge",
+            instance_count=1,
+            volume_size_in_gb=30,
+        ),
+        distributed=Torchrun(),
+        hyperparameters={"learning_rate": 1e-4},
+    )
+
+    tuner = HyperparameterTuner(
+        model_trainer=model_trainer,
+        objective_metric_name="eval_loss",
+        metric_definitions=[
+            {"Name": "eval_loss", "Regex": r"eval_loss: ([0-9\\.]+)"},
+        ],
+        hyperparameter_ranges={
+            "learning_rate": ContinuousParameter(
+                min_value=1e-5,
+                max_value=5e-4,
+                scaling_type="Logarithmic",
+            ),
+        },
+        objective_type="Minimize",
+        max_jobs=1,
+        max_parallel_jobs=1,
+    )
+
+    tuner.tune(wait=True)
+
+    job = tuner.latest_tuning_job.refresh()
+    assert job.hyper_parameter_tuning_job_status in (
+        "Completed",
+        "Stopped",
+    ), f"Tuning job failed: {job.hyper_parameter_tuning_job_status}"
+
+    best = tuner.best_training_job()
+    assert best is not None
+    logger.info("PASSED: tuner distributed training test - job: %s", best)
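The metric_definitions regex in the test above is applied by SageMaker to the container's log lines, with capture group 1 becoming the objective value. A minimal sketch of that extraction (using the single-backslash Python-level equivalent of the test's regex; the log line matches what the training script prints):

```python
# Sketch: how a SageMaker-style metric definition pulls the objective metric
# out of a training log line. Group 1 of the Regex is the metric value.
import re

metric_regex = r"eval_loss: ([0-9\.]+)"
log_line = "eval_loss: 0.42"

match = re.search(metric_regex, log_line)
assert match is not None
assert float(match.group(1)) == 0.42
```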
