Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ pip install --no-cache-dir https://d3azyja9oqj8z1.cloudfront.net/sagemaker_sdk_h
Verify installation:

```bash
which sagemaker-migration-mcp # Should output the path to the executable
sagemaker-migration-mcp --help # Test the server runs
which sagemaker-sdk-helper # Should output the path to the executable
sagemaker-sdk-helper --help # Test the server runs
```

### IDE Setup
Expand All @@ -48,7 +48,10 @@ Add the following to your MCP configuration file:
|-----|----------------|
| Kiro | `~/.kiro/settings/mcp.json` |
| Kiro CLI | `~/.kiro/settings/mcp.json` |
| VS Code (Copilot) | `.vscode/mcp.json` in workspace root |
| VS Code (Cline) | Cline extension settings → MCP Servers |
| VS Code (Roo Code) | Roo Code extension settings → MCP Servers |
| Codex CLI | `codex.json` or `~/.codex/config.json` |
| Claude Desktop | `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS) |
| Cursor | Settings → MCP section |

Expand All @@ -57,23 +60,23 @@ Add the following to your MCP configuration file:
```json
{
"mcpServers": {
"sagemaker-migration": {
"command": "sagemaker-migration-mcp",
"sagemaker-sdk-helper": {
"command": "sagemaker-sdk-helper",
"args": ["--log-level", "INFO"]
}
}
}
```

> **Note**: If you installed in a virtual environment, use the full path to the executable (find it with `which sagemaker-migration-mcp`).
> **Note**: If you installed in a virtual environment, use the full path to the executable (find it with `which sagemaker-sdk-helper`).

**With SDK source artifacts (recommended, 20-30% better accuracy):**

```json
{
"mcpServers": {
"sagemaker-migration": {
"command": "/path/to/.venv/bin/sagemaker-migration-mcp",
"sagemaker-sdk-helper": {
"command": "/path/to/.venv/bin/sagemaker-sdk-helper",
"args": [
"--log-level", "INFO",
"--v2-artifacts", "/path/to/sdk_v2/sagemaker-python-sdk",
Expand All @@ -93,8 +96,8 @@ For Kiro CLI, add the same configuration to `~/.kiro/settings/mcp.json`:
```json
{
"mcpServers": {
"sagemaker-migration": {
"command": "sagemaker-migration-mcp",
"sagemaker-sdk-helper": {
"command": "sagemaker-sdk-helper",
"args": ["--log-level", "INFO"]
}
}
Expand Down Expand Up @@ -132,10 +135,10 @@ You can also point the tool at entire notebooks for full conversion.

### Troubleshooting

- **Server not starting**: Verify the command path with `which sagemaker-migration-mcp`. Requires Python 3.9+.
- **Permission denied**: Run `chmod +x /path/to/.venv/bin/sagemaker-migration-mcp`
- **Server not starting**: Verify the command path with `which sagemaker-sdk-helper`. Requires Python 3.9+.
- **Permission denied**: Run `chmod +x /path/to/.venv/bin/sagemaker-sdk-helper`
- **Artifacts not loading**: Paths must be absolute. The server works without artifacts but with reduced accuracy.
- **Manual test**: `echo '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' | sagemaker-migration-mcp`
- **Manual test**: `echo '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' | sagemaker-sdk-helper`

---

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import os
import re
import shutil
import stat
import subprocess
from tempfile import TemporaryDirectory
from typing import Any, Dict, List, Optional
Expand Down Expand Up @@ -58,15 +57,34 @@
SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"


def _rmtree(path):
def _rmtree(path, image=None, is_studio=False):
"""Remove a directory tree, handling root-owned files from Docker containers."""
def _onerror(func, path, exc_info):
if isinstance(exc_info[1], PermissionError):
os.chmod(path, stat.S_IRWXU)
func(path)
else:
raise exc_info[1]
shutil.rmtree(path, onerror=_onerror)
try:
shutil.rmtree(path)
except PermissionError:
# Files created by Docker containers are owned by root.
# Use docker to chmod as root, then retry shutil.rmtree.
if image is None:
logger.warning(
"Failed to clean up root-owned files in %s. "
"You may need to remove them manually with: sudo rm -rf %s",
path, path,
)
raise
try:
cmd = ["docker", "run", "--rm"]
if is_studio:
cmd += ["--network", "sagemaker"]
cmd += ["-v", f"{path}:/delete", image, "chmod", "-R", "777", "/delete"]
subprocess.run(cmd, check=True, capture_output=True)
shutil.rmtree(path)
except Exception:
logger.warning(
"Failed to clean up root-owned files in %s. "
"You may need to remove them manually with: sudo rm -rf %s",
path, path,
)
raise


class _LocalContainer(BaseModel):
Expand Down Expand Up @@ -221,12 +239,12 @@ def train(
# Print our Job Complete line
logger.info("Local training job completed, output artifacts saved to %s", artifacts)

_rmtree(os.path.join(self.container_root, "input"))
_rmtree(os.path.join(self.container_root, "shared"))
_rmtree(os.path.join(self.container_root, "input"), self.image, self.is_studio)
_rmtree(os.path.join(self.container_root, "shared"), self.image, self.is_studio)
for host in self.hosts:
_rmtree(os.path.join(self.container_root, host))
_rmtree(os.path.join(self.container_root, host), self.image, self.is_studio)
for folder in self._temporary_folders:
_rmtree(os.path.join(self.container_root, folder))
_rmtree(os.path.join(self.container_root, folder), self.image, self.is_studio)
return artifacts

def retrieve_artifacts(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -941,3 +941,63 @@ def test_write_config_files_with_content_type(self, mock_session):
calls = mock_write.call_args_list
input_config_call = [c for c in calls if "inputdataconfig.json" in str(c)]
assert len(input_config_call) > 0


RMTREE_IMAGE = "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.1-cpu-py310"
MODULE = "sagemaker.core.modules.local_core.local_container"


class TestRmtree:
"""Test cases for _rmtree function."""

@patch(f"{MODULE}.shutil.rmtree")
def test_rmtree_success(self, mock_rmtree):
from sagemaker.core.modules.local_core.local_container import _rmtree
_rmtree("/tmp/test", RMTREE_IMAGE)
mock_rmtree.assert_called_once_with("/tmp/test")

@patch(f"{MODULE}.shutil.rmtree")
@patch(f"{MODULE}.subprocess.run")
def test_rmtree_permission_error_docker_chmod_fallback(self, mock_run, mock_rmtree):
from sagemaker.core.modules.local_core.local_container import _rmtree
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]
_rmtree("/tmp/test", RMTREE_IMAGE)
mock_run.assert_called_once_with(
["docker", "run", "--rm", "-v", "/tmp/test:/delete", RMTREE_IMAGE, "chmod", "-R", "777", "/delete"],
check=True,
capture_output=True,
)
assert mock_rmtree.call_count == 2

@patch(f"{MODULE}.shutil.rmtree")
@patch(f"{MODULE}.subprocess.run")
def test_rmtree_studio_adds_network(self, mock_run, mock_rmtree):
from sagemaker.core.modules.local_core.local_container import _rmtree
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]
_rmtree("/tmp/test", RMTREE_IMAGE, is_studio=True)
mock_run.assert_called_once_with(
[
"docker", "run", "--rm",
"--network", "sagemaker",
"-v", "/tmp/test:/delete", RMTREE_IMAGE,
"chmod", "-R", "777", "/delete",
],
check=True,
capture_output=True,
)

@patch(f"{MODULE}.shutil.rmtree")
@patch(f"{MODULE}.subprocess.run")
def test_rmtree_docker_fallback_fails_raises(self, mock_run, mock_rmtree):
from sagemaker.core.modules.local_core.local_container import _rmtree
mock_rmtree.side_effect = PermissionError("Permission denied")
mock_run.side_effect = Exception("docker failed")
with pytest.raises(Exception, match="docker failed"):
_rmtree("/tmp/test", RMTREE_IMAGE)

@patch(f"{MODULE}.shutil.rmtree")
def test_rmtree_no_image_raises(self, mock_rmtree):
from sagemaker.core.modules.local_core.local_container import _rmtree
mock_rmtree.side_effect = PermissionError("Permission denied")
with pytest.raises(PermissionError):
_rmtree("/tmp/test")
44 changes: 31 additions & 13 deletions sagemaker-train/src/sagemaker/train/local/local_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import os
import re
import shutil
import stat
import subprocess
from tempfile import TemporaryDirectory
from typing import Any, Dict, List, Optional
Expand Down Expand Up @@ -66,15 +65,34 @@
SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"


def _rmtree(path):
def _rmtree(path, image=None, is_studio=False):
"""Remove a directory tree, handling root-owned files from Docker containers."""
def _onerror(func, path, exc_info):
if isinstance(exc_info[1], PermissionError):
os.chmod(path, stat.S_IRWXU)
func(path)
else:
raise exc_info[1]
shutil.rmtree(path, onerror=_onerror)
try:
shutil.rmtree(path)
except PermissionError:
# Files created by Docker containers are owned by root.
# Use docker to chmod as root, then retry shutil.rmtree.
if image is None:
logger.warning(
"Failed to clean up root-owned files in %s. "
"You may need to remove them manually with: sudo rm -rf %s",
path, path,
)
raise
try:
cmd = ["docker", "run", "--rm"]
if is_studio:
cmd += ["--network", "sagemaker"]
cmd += ["-v", f"{path}:/delete", image, "chmod", "-R", "777", "/delete"]
subprocess.run(cmd, check=True, capture_output=True)
shutil.rmtree(path)
except Exception:
logger.warning(
"Failed to clean up root-owned files in %s. "
"You may need to remove them manually with: sudo rm -rf %s",
path, path,
)
raise


class _LocalContainer(BaseModel):
Expand Down Expand Up @@ -229,12 +247,12 @@ def train(
# Print our Job Complete line
logger.info("Local training job completed, output artifacts saved to %s", artifacts)

_rmtree(os.path.join(self.container_root, "input"))
_rmtree(os.path.join(self.container_root, "shared"))
_rmtree(os.path.join(self.container_root, "input"), self.image, self.is_studio)
_rmtree(os.path.join(self.container_root, "shared"), self.image, self.is_studio)
for host in self.hosts:
_rmtree(os.path.join(self.container_root, host))
_rmtree(os.path.join(self.container_root, host), self.image, self.is_studio)
for folder in self._temporary_folders:
_rmtree(os.path.join(self.container_root, folder))
_rmtree(os.path.join(self.container_root, folder), self.image, self.is_studio)
return artifacts

def retrieve_artifacts(
Expand Down
80 changes: 80 additions & 0 deletions sagemaker-train/tests/unit/train/local/test_local_container.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from unittest.mock import patch, call
import pytest

from sagemaker.train.local.local_container import _rmtree

IMAGE = "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.1-cpu-py310"


class TestRmtree:
"""Test cases for _rmtree function."""

@patch("sagemaker.train.local.local_container.shutil.rmtree")
def test_rmtree_success(self, mock_rmtree):
"""Normal case — shutil.rmtree succeeds."""
_rmtree("/tmp/test", IMAGE)
mock_rmtree.assert_called_once_with("/tmp/test")

@patch("sagemaker.train.local.local_container.shutil.rmtree")
@patch("sagemaker.train.local.local_container.subprocess.run")
def test_rmtree_permission_error_docker_chmod_fallback(self, mock_run, mock_rmtree):
"""PermissionError triggers docker chmod then retry."""
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]

_rmtree("/tmp/test", IMAGE)

mock_run.assert_called_once_with(
["docker", "run", "--rm", "-v", "/tmp/test:/delete", IMAGE, "chmod", "-R", "777", "/delete"],
check=True,
capture_output=True,
)
assert mock_rmtree.call_count == 2

@patch("sagemaker.train.local.local_container.shutil.rmtree")
@patch("sagemaker.train.local.local_container.subprocess.run")
def test_rmtree_studio_adds_network(self, mock_run, mock_rmtree):
"""In Studio, docker run includes --network sagemaker."""
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]

_rmtree("/tmp/test", IMAGE, is_studio=True)

mock_run.assert_called_once_with(
[
"docker", "run", "--rm",
"--network", "sagemaker",
"-v", "/tmp/test:/delete", IMAGE,
"chmod", "-R", "777", "/delete",
],
check=True,
capture_output=True,
)

@patch("sagemaker.train.local.local_container.shutil.rmtree")
@patch("sagemaker.train.local.local_container.subprocess.run")
def test_rmtree_docker_fallback_fails_raises(self, mock_run, mock_rmtree):
"""If docker fallback also fails, the exception propagates."""
mock_rmtree.side_effect = PermissionError("Permission denied")
mock_run.side_effect = Exception("docker failed")

with pytest.raises(Exception, match="docker failed"):
_rmtree("/tmp/test", IMAGE)

@patch("sagemaker.train.local.local_container.shutil.rmtree")
def test_rmtree_no_image_raises(self, mock_rmtree):
"""PermissionError without image raises immediately."""
mock_rmtree.side_effect = PermissionError("Permission denied")

with pytest.raises(PermissionError):
_rmtree("/tmp/test")
4 changes: 4 additions & 0 deletions test_e2e_pr_dummy_1773791538.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# E2E Test File
# Created at 2026-03-17 23:52:18 UTC
# Test ID: 1773791538
# This file was created by an automated e2e test.
Loading