@@ -444,96 +444,140 @@ def _prepare_auto_parameters(self, static_hyperparameters, hyperparameters_to_ke
444444
@classmethod
def _prepare_model_trainer_for_tuning(cls, model_trainer, inputs=None, job_name=None, **kwargs):
    """Prepare ModelTrainer before tuning by building sm_drivers and code channels.

    This method replicates the channel-building logic from ModelTrainer._create_training_job()
    to ensure the sm_drivers channel (containing torchrun_driver.py, distributed config, and
    sm_train.sh) is included in the tuning job definition. Without this, the framework
    container falls back to the legacy entry point (python train.py) instead of using the
    V3 driver (torchrun), breaking distributed training.

    Preparation only happens when ``model_trainer.source_code`` is present and
    carries a string ``entry_script``; otherwise this is a no-op.

    Args:
        model_trainer: ModelTrainer instance to prepare
        inputs: Training inputs (unused, for V2 compatibility)
        job_name: Job name (unused, for V2 compatibility)
        **kwargs: Additional arguments (unused, for V2 compatibility)
    """
    source_code = getattr(model_trainer, "source_code", None)
    # Proceed only when source_code exists and names a real entry script.
    if source_code is not None and isinstance(getattr(source_code, "entry_script", None), str):
        cls._build_driver_and_code_channels(model_trainer)
@classmethod
def _build_driver_and_code_channels(cls, model_trainer):
    """Build the sm_drivers input channel and script-mode hyperparameters for tuning.

    Replicates the channel-building logic from ModelTrainer._create_training_job()
    so that the tuning job gets the same execution environment as a standalone
    training job (distributed drivers, source code, train script).

    Side effects on ``model_trainer``:
        * ``_tuner_channels`` -- list holding the uploaded sm_drivers channel,
          later merged into the job definition by _build_training_job_definition.
        * ``hyperparameters`` -- gains ``sagemaker_program`` and, when
          ``source_code.source_dir`` is set, ``sagemaker_submit_directory``.
        * ``_tuner_temp_dir`` -- keeps the TemporaryDirectory object alive so
          its contents are not cleaned up while still referenced.

    Args:
        model_trainer: ModelTrainer instance with source_code configured
    """
    import json
    import os
    import shutil
    import time
    from tempfile import TemporaryDirectory

    # Only the names actually used below are imported (SM_CODE and the default
    # container entrypoint/arguments constants are not needed here).
    from sagemaker.train.constants import (
        SM_DRIVERS,
        SM_DRIVERS_LOCAL_PATH,
    )

    source_code = model_trainer.source_code
    base_name = model_trainer.base_job_name or "tuning"
    key_prefix = f"{base_name}/tuning-{int(time.time())}/input"

    # Build sm_drivers channel (same as ModelTrainer._create_training_job)
    temp_dir = TemporaryDirectory()
    shutil.copytree(SM_DRIVERS_LOCAL_PATH, temp_dir.name, dirs_exist_ok=True)

    # If distributed config is set, copy distributed drivers
    if model_trainer.distributed:
        driver_dir = os.path.join(temp_dir.name, "distributed_drivers")
        shutil.copytree(model_trainer.distributed.driver_dir, driver_dir, dirs_exist_ok=True)

    # Write sourcecode.json (read by the container-side driver scripts)
    source_code_json_path = os.path.join(temp_dir.name, "sourcecode.json")
    with open(source_code_json_path, "w") as f:
        json.dump(source_code.model_dump() if source_code else {}, f)

    # Write distributed.json
    distributed_json_path = os.path.join(temp_dir.name, "distributed.json")
    with open(distributed_json_path, "w") as f:
        json.dump(model_trainer.distributed.model_dump() if model_trainer.distributed else {}, f)

    # Prepare the train script (sm_train.sh).
    # NOTE(review): passes the TemporaryDirectory object itself rather than
    # temp_dir.name -- confirm this matches _prepare_train_script's signature
    # as called from ModelTrainer._create_training_job.
    model_trainer._prepare_train_script(
        tmp_dir=temp_dir,
        source_code=source_code,
        distributed=model_trainer.distributed,
    )

    # Upload sm_drivers channel
    sm_drivers_channel = model_trainer.create_input_data_channel(
        channel_name=SM_DRIVERS,
        data_source=temp_dir.name,
        key_prefix=key_prefix,
        ignore_patterns=source_code.ignore_patterns,
    )

    # Store channels on model_trainer so _build_training_job_definition can pick them up
    model_trainer._tuner_channels = [sm_drivers_channel]

    # Set script mode hyperparameters required by framework containers.
    # The framework container (PyTorch, TF) uses sagemaker_program to find the entry script
    # and sagemaker_submit_directory to download source code to /opt/ml/code/.
    if model_trainer.hyperparameters is None:
        model_trainer.hyperparameters = {}
    model_trainer.hyperparameters["sagemaker_program"] = source_code.entry_script

    # Upload sourcedir.tar.gz for the legacy framework container path.
    # The HPT API doesn't support container_entrypoint, so the framework container
    # uses sagemaker_submit_directory to download and extract code to /opt/ml/code/.
    if source_code.source_dir and not _is_valid_s3_uri(source_code.source_dir):
        import tarfile
        import tempfile

        session = model_trainer.sagemaker_session
        bucket = session.default_bucket()
        s3_key = f"{key_prefix}/sourcedir/sourcedir.tar.gz"

        with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
            tar_path = tmp.name
        try:
            with tarfile.open(tar_path, "w:gz") as tar:
                # Archive paths relative to source_dir so the container
                # extracts the original directory layout under /opt/ml/code/.
                for root, _dirs, files in os.walk(source_code.source_dir):
                    for fname in files:
                        fpath = os.path.join(root, fname)
                        arcname = os.path.relpath(fpath, source_code.source_dir)
                        tar.add(fpath, arcname=arcname)
            s3_client = session.boto_session.client(
                "s3", region_name=session.boto_region_name
            )
            s3_client.upload_file(tar_path, bucket, s3_key)
            model_trainer.hyperparameters["sagemaker_submit_directory"] = (
                f"s3://{bucket}/{s3_key}"
            )
        finally:
            # Always remove the local archive, even if the upload failed.
            if os.path.exists(tar_path):
                os.remove(tar_path)
    elif source_code.source_dir and _is_valid_s3_uri(source_code.source_dir):
        # Source already staged in S3 -- reference it directly.
        model_trainer.hyperparameters["sagemaker_submit_directory"] = source_code.source_dir

    # Store the temp dir reference to prevent cleanup
    model_trainer._tuner_temp_dir = temp_dir
537581
538582 @runnable_by_pipeline
539583 def tune (
@@ -1422,6 +1466,12 @@ def _build_training_job_definition(self, inputs):
14221466 if not any (c .channel_name == channel .channel_name for c in input_data_config ):
14231467 input_data_config .append (channel )
14241468
1469+ # Include channels built by _prepare_model_trainer_for_tuning (sm_drivers, code)
1470+ if hasattr (model_trainer , "_tuner_channels" ) and model_trainer ._tuner_channels :
1471+ for channel in model_trainer ._tuner_channels :
1472+ if not any (c .channel_name == channel .channel_name for c in input_data_config ):
1473+ input_data_config .append (channel )
1474+
14251475 # Build output data config
14261476 output_config = OutputDataConfig (
14271477 s3_output_path = (
@@ -1459,7 +1509,22 @@ def _build_training_job_definition(self, inputs):
14591509 output_data_config = output_config ,
14601510 resource_config = resource_config ,
14611511 stopping_condition = stopping_condition ,
1462- static_hyper_parameters = self . static_hyperparameters or {},
1512+ static_hyper_parameters = getattr ( self , " static_hyperparameters" , None ) or {},
14631513 )
14641514
1515+ # Pass through environment variables from model_trainer
1516+ env = getattr (model_trainer , "environment" , None )
1517+ if env and isinstance (env , dict ):
1518+ definition .environment = env
1519+
1520+ # Pass through VPC config from model_trainer
1521+ networking = getattr (model_trainer , "networking" , None )
1522+ if networking and hasattr (networking , "_to_vpc_config" ):
1523+ try :
1524+ vpc_config = networking ._to_vpc_config ()
1525+ if vpc_config :
1526+ definition .vpc_config = vpc_config
1527+ except Exception :
1528+ pass
1529+
14651530 return definition
0 commit comments