Skip to content

Commit c0134f5

Browse files
committed
Remove alpine
1 parent 1dd9f25 commit c0134f5

3 files changed

Lines changed: 55 additions & 21 deletions

File tree

sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,24 +57,36 @@
5757
SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"
5858

5959

60-
def _rmtree(path):
60+
def _rmtree(path, image=None):
6161
"""Remove a directory tree, handling root-owned files from Docker containers."""
6262
try:
6363
shutil.rmtree(path)
6464
except PermissionError:
6565
# Files created by Docker containers are owned by root.
6666
# Use a Docker container to remove them since os.chmod will also fail.
67+
# Use the training image (already pulled) to avoid needing alpine.
68+
if image is None:
69+
logger.warning(
70+
"Failed to clean up root-owned files in %s. "
71+
"You may need to remove them manually with: sudo rm -rf %s",
72+
path, path,
73+
)
74+
raise
6775
try:
6876
subprocess.run(
69-
["docker", "run", "--rm", "-v", f"{path}:/delete", "alpine", "rm", "-rf", "/delete"],
77+
["docker", "run", "--rm", "-v", f"{path}:/delete", image, "rm", "-rf", "/delete"],
7078
check=True,
7179
capture_output=True,
7280
)
7381
# The mount point directory itself may remain — clean it up
7482
if os.path.exists(path):
7583
shutil.rmtree(path, ignore_errors=True)
7684
except Exception:
77-
logger.warning("Failed to clean up root-owned files in %s. You may need to remove them manually with: sudo rm -rf %s", path, path)
85+
logger.warning(
86+
"Failed to clean up root-owned files in %s. "
87+
"You may need to remove them manually with: sudo rm -rf %s",
88+
path, path,
89+
)
7890
raise
7991

8092

@@ -230,12 +242,12 @@ def train(
230242
# Print our Job Complete line
231243
logger.info("Local training job completed, output artifacts saved to %s", artifacts)
232244

233-
_rmtree(os.path.join(self.container_root, "input"))
234-
_rmtree(os.path.join(self.container_root, "shared"))
245+
_rmtree(os.path.join(self.container_root, "input"), self.image)
246+
_rmtree(os.path.join(self.container_root, "shared"), self.image)
235247
for host in self.hosts:
236-
_rmtree(os.path.join(self.container_root, host))
248+
_rmtree(os.path.join(self.container_root, host), self.image)
237249
for folder in self._temporary_folders:
238-
_rmtree(os.path.join(self.container_root, folder))
250+
_rmtree(os.path.join(self.container_root, folder), self.image)
239251
return artifacts
240252

241253
def retrieve_artifacts(

sagemaker-train/src/sagemaker/train/local/local_container.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,24 +65,36 @@
6565
SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"
6666

6767

68-
def _rmtree(path):
68+
def _rmtree(path, image=None):
6969
"""Remove a directory tree, handling root-owned files from Docker containers."""
7070
try:
7171
shutil.rmtree(path)
7272
except PermissionError:
7373
# Files created by Docker containers are owned by root.
7474
# Use a Docker container to remove them since os.chmod will also fail.
75+
# Use the training image (already pulled) to avoid needing alpine.
76+
if image is None:
77+
logger.warning(
78+
"Failed to clean up root-owned files in %s. "
79+
"You may need to remove them manually with: sudo rm -rf %s",
80+
path, path,
81+
)
82+
raise
7583
try:
7684
subprocess.run(
77-
["docker", "run", "--rm", "-v", f"{path}:/delete", "alpine", "rm", "-rf", "/delete"],
85+
["docker", "run", "--rm", "-v", f"{path}:/delete", image, "rm", "-rf", "/delete"],
7886
check=True,
7987
capture_output=True,
8088
)
8189
# The mount point directory itself may remain — clean it up
8290
if os.path.exists(path):
8391
shutil.rmtree(path, ignore_errors=True)
8492
except Exception:
85-
logger.warning("Failed to clean up root-owned files in %s. You may need to remove them manually with: sudo rm -rf %s", path, path)
93+
logger.warning(
94+
"Failed to clean up root-owned files in %s. "
95+
"You may need to remove them manually with: sudo rm -rf %s",
96+
path, path,
97+
)
8698
raise
8799

88100

@@ -238,12 +250,12 @@ def train(
238250
# Print our Job Complete line
239251
logger.info("Local training job completed, output artifacts saved to %s", artifacts)
240252

241-
_rmtree(os.path.join(self.container_root, "input"))
242-
_rmtree(os.path.join(self.container_root, "shared"))
253+
_rmtree(os.path.join(self.container_root, "input"), self.image)
254+
_rmtree(os.path.join(self.container_root, "shared"), self.image)
243255
for host in self.hosts:
244-
_rmtree(os.path.join(self.container_root, host))
256+
_rmtree(os.path.join(self.container_root, host), self.image)
245257
for folder in self._temporary_folders:
246-
_rmtree(os.path.join(self.container_root, folder))
258+
_rmtree(os.path.join(self.container_root, folder), self.image)
247259
return artifacts
248260

249261
def retrieve_artifacts(

sagemaker-train/tests/unit/train/local/test_local_container.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,29 @@
1515

1616
from sagemaker.train.local.local_container import _rmtree
1717

18+
IMAGE = "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.1-cpu-py310"
19+
1820

1921
class TestRmtree:
2022
"""Test cases for _rmtree function."""
2123

2224
@patch("sagemaker.train.local.local_container.shutil.rmtree")
2325
def test_rmtree_success(self, mock_rmtree):
2426
"""Normal case — shutil.rmtree succeeds."""
25-
_rmtree("/tmp/test")
27+
_rmtree("/tmp/test", IMAGE)
2628
mock_rmtree.assert_called_once_with("/tmp/test")
2729

2830
@patch("sagemaker.train.local.local_container.shutil.rmtree")
2931
@patch("sagemaker.train.local.local_container.subprocess.run")
3032
@patch("sagemaker.train.local.local_container.os.path.exists", return_value=False)
3133
def test_rmtree_permission_error_docker_fallback(self, mock_exists, mock_run, mock_rmtree):
32-
"""PermissionError triggers docker fallback to remove root-owned files."""
34+
"""PermissionError triggers docker fallback using the training image."""
3335
mock_rmtree.side_effect = PermissionError("Permission denied")
3436

35-
_rmtree("/tmp/test")
37+
_rmtree("/tmp/test", IMAGE)
3638

3739
mock_run.assert_called_once_with(
38-
["docker", "run", "--rm", "-v", "/tmp/test:/delete", "alpine", "rm", "-rf", "/delete"],
40+
["docker", "run", "--rm", "-v", "/tmp/test:/delete", IMAGE, "rm", "-rf", "/delete"],
3941
check=True,
4042
capture_output=True,
4143
)
@@ -47,7 +49,7 @@ def test_rmtree_cleans_up_mount_point(self, mock_exists, mock_run, mock_rmtree):
4749
"""After docker cleanup, remaining mount point directory is removed."""
4850
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]
4951

50-
_rmtree("/tmp/test")
52+
_rmtree("/tmp/test", IMAGE)
5153

5254
assert mock_rmtree.call_count == 2
5355
mock_rmtree.assert_has_calls([
@@ -60,7 +62,15 @@ def test_rmtree_cleans_up_mount_point(self, mock_exists, mock_run, mock_rmtree):
6062
def test_rmtree_docker_fallback_fails_raises(self, mock_run, mock_rmtree):
6163
"""If docker fallback also fails, the exception propagates."""
6264
mock_rmtree.side_effect = PermissionError("Permission denied")
63-
mock_run.side_effect = Exception("docker not available")
65+
mock_run.side_effect = Exception("docker failed")
66+
67+
with pytest.raises(Exception, match="docker failed"):
68+
_rmtree("/tmp/test", IMAGE)
69+
70+
@patch("sagemaker.train.local.local_container.shutil.rmtree")
71+
def test_rmtree_no_image_raises(self, mock_rmtree):
72+
"""PermissionError without image raises immediately."""
73+
mock_rmtree.side_effect = PermissionError("Permission denied")
6474

65-
with pytest.raises(Exception, match="docker not available"):
75+
with pytest.raises(PermissionError):
6676
_rmtree("/tmp/test")

0 commit comments

Comments
 (0)