Skip to content

Commit e965a1b

Browse files
fix: resolve PermissionError during local mode cleanup of root-owned Docker files (#5629)
* fix: use docker fallback to clean up root-owned files in local mode * Remove alpine * Use network flag * Add -mindepth * Use chmod -R 777 via Docker * Add unit test to sagemaker-core for permissionError docker fix
1 parent 6a1ba54 commit e965a1b

File tree

4 files changed

+202
-26
lines changed

4 files changed

+202
-26
lines changed

sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import os
1818
import re
1919
import shutil
20-
import stat
2120
import subprocess
2221
from tempfile import TemporaryDirectory
2322
from typing import Any, Dict, List, Optional
@@ -58,15 +57,34 @@
5857
SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"
5958

6059

61-
def _rmtree(path):
60+
def _rmtree(path, image=None, is_studio=False):
6261
"""Remove a directory tree, handling root-owned files from Docker containers."""
63-
def _onerror(func, path, exc_info):
64-
if isinstance(exc_info[1], PermissionError):
65-
os.chmod(path, stat.S_IRWXU)
66-
func(path)
67-
else:
68-
raise exc_info[1]
69-
shutil.rmtree(path, onerror=_onerror)
62+
try:
63+
shutil.rmtree(path)
64+
except PermissionError:
65+
# Files created by Docker containers are owned by root.
66+
# Use docker to chmod as root, then retry shutil.rmtree.
67+
if image is None:
68+
logger.warning(
69+
"Failed to clean up root-owned files in %s. "
70+
"You may need to remove them manually with: sudo rm -rf %s",
71+
path, path,
72+
)
73+
raise
74+
try:
75+
cmd = ["docker", "run", "--rm"]
76+
if is_studio:
77+
cmd += ["--network", "sagemaker"]
78+
cmd += ["-v", f"{path}:/delete", image, "chmod", "-R", "777", "/delete"]
79+
subprocess.run(cmd, check=True, capture_output=True)
80+
shutil.rmtree(path)
81+
except Exception:
82+
logger.warning(
83+
"Failed to clean up root-owned files in %s. "
84+
"You may need to remove them manually with: sudo rm -rf %s",
85+
path, path,
86+
)
87+
raise
7088

7189

7290
class _LocalContainer(BaseModel):
@@ -221,12 +239,12 @@ def train(
221239
# Print our Job Complete line
222240
logger.info("Local training job completed, output artifacts saved to %s", artifacts)
223241

224-
_rmtree(os.path.join(self.container_root, "input"))
225-
_rmtree(os.path.join(self.container_root, "shared"))
242+
_rmtree(os.path.join(self.container_root, "input"), self.image, self.is_studio)
243+
_rmtree(os.path.join(self.container_root, "shared"), self.image, self.is_studio)
226244
for host in self.hosts:
227-
_rmtree(os.path.join(self.container_root, host))
245+
_rmtree(os.path.join(self.container_root, host), self.image, self.is_studio)
228246
for folder in self._temporary_folders:
229-
_rmtree(os.path.join(self.container_root, folder))
247+
_rmtree(os.path.join(self.container_root, folder), self.image, self.is_studio)
230248
return artifacts
231249

232250
def retrieve_artifacts(

sagemaker-core/tests/unit/modules/local_core/test_local_container.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -941,3 +941,63 @@ def test_write_config_files_with_content_type(self, mock_session):
941941
calls = mock_write.call_args_list
942942
input_config_call = [c for c in calls if "inputdataconfig.json" in str(c)]
943943
assert len(input_config_call) > 0
944+
945+
946+
RMTREE_IMAGE = "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.1-cpu-py310"
947+
MODULE = "sagemaker.core.modules.local_core.local_container"
948+
949+
950+
class TestRmtree:
951+
"""Test cases for _rmtree function."""
952+
953+
@patch(f"{MODULE}.shutil.rmtree")
954+
def test_rmtree_success(self, mock_rmtree):
955+
from sagemaker.core.modules.local_core.local_container import _rmtree
956+
_rmtree("/tmp/test", RMTREE_IMAGE)
957+
mock_rmtree.assert_called_once_with("/tmp/test")
958+
959+
@patch(f"{MODULE}.shutil.rmtree")
960+
@patch(f"{MODULE}.subprocess.run")
961+
def test_rmtree_permission_error_docker_chmod_fallback(self, mock_run, mock_rmtree):
962+
from sagemaker.core.modules.local_core.local_container import _rmtree
963+
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]
964+
_rmtree("/tmp/test", RMTREE_IMAGE)
965+
mock_run.assert_called_once_with(
966+
["docker", "run", "--rm", "-v", "/tmp/test:/delete", RMTREE_IMAGE, "chmod", "-R", "777", "/delete"],
967+
check=True,
968+
capture_output=True,
969+
)
970+
assert mock_rmtree.call_count == 2
971+
972+
@patch(f"{MODULE}.shutil.rmtree")
973+
@patch(f"{MODULE}.subprocess.run")
974+
def test_rmtree_studio_adds_network(self, mock_run, mock_rmtree):
975+
from sagemaker.core.modules.local_core.local_container import _rmtree
976+
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]
977+
_rmtree("/tmp/test", RMTREE_IMAGE, is_studio=True)
978+
mock_run.assert_called_once_with(
979+
[
980+
"docker", "run", "--rm",
981+
"--network", "sagemaker",
982+
"-v", "/tmp/test:/delete", RMTREE_IMAGE,
983+
"chmod", "-R", "777", "/delete",
984+
],
985+
check=True,
986+
capture_output=True,
987+
)
988+
989+
@patch(f"{MODULE}.shutil.rmtree")
990+
@patch(f"{MODULE}.subprocess.run")
991+
def test_rmtree_docker_fallback_fails_raises(self, mock_run, mock_rmtree):
992+
from sagemaker.core.modules.local_core.local_container import _rmtree
993+
mock_rmtree.side_effect = PermissionError("Permission denied")
994+
mock_run.side_effect = Exception("docker failed")
995+
with pytest.raises(Exception, match="docker failed"):
996+
_rmtree("/tmp/test", RMTREE_IMAGE)
997+
998+
@patch(f"{MODULE}.shutil.rmtree")
999+
def test_rmtree_no_image_raises(self, mock_rmtree):
1000+
from sagemaker.core.modules.local_core.local_container import _rmtree
1001+
mock_rmtree.side_effect = PermissionError("Permission denied")
1002+
with pytest.raises(PermissionError):
1003+
_rmtree("/tmp/test")

sagemaker-train/src/sagemaker/train/local/local_container.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import os
1919
import re
2020
import shutil
21-
import stat
2221
import subprocess
2322
from tempfile import TemporaryDirectory
2423
from typing import Any, Dict, List, Optional
@@ -66,15 +65,34 @@
6665
SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"
6766

6867

69-
def _rmtree(path):
68+
def _rmtree(path, image=None, is_studio=False):
7069
"""Remove a directory tree, handling root-owned files from Docker containers."""
71-
def _onerror(func, path, exc_info):
72-
if isinstance(exc_info[1], PermissionError):
73-
os.chmod(path, stat.S_IRWXU)
74-
func(path)
75-
else:
76-
raise exc_info[1]
77-
shutil.rmtree(path, onerror=_onerror)
70+
try:
71+
shutil.rmtree(path)
72+
except PermissionError:
73+
# Files created by Docker containers are owned by root.
74+
# Use docker to chmod as root, then retry shutil.rmtree.
75+
if image is None:
76+
logger.warning(
77+
"Failed to clean up root-owned files in %s. "
78+
"You may need to remove them manually with: sudo rm -rf %s",
79+
path, path,
80+
)
81+
raise
82+
try:
83+
cmd = ["docker", "run", "--rm"]
84+
if is_studio:
85+
cmd += ["--network", "sagemaker"]
86+
cmd += ["-v", f"{path}:/delete", image, "chmod", "-R", "777", "/delete"]
87+
subprocess.run(cmd, check=True, capture_output=True)
88+
shutil.rmtree(path)
89+
except Exception:
90+
logger.warning(
91+
"Failed to clean up root-owned files in %s. "
92+
"You may need to remove them manually with: sudo rm -rf %s",
93+
path, path,
94+
)
95+
raise
7896

7997

8098
class _LocalContainer(BaseModel):
@@ -229,12 +247,12 @@ def train(
229247
# Print our Job Complete line
230248
logger.info("Local training job completed, output artifacts saved to %s", artifacts)
231249

232-
_rmtree(os.path.join(self.container_root, "input"))
233-
_rmtree(os.path.join(self.container_root, "shared"))
250+
_rmtree(os.path.join(self.container_root, "input"), self.image, self.is_studio)
251+
_rmtree(os.path.join(self.container_root, "shared"), self.image, self.is_studio)
234252
for host in self.hosts:
235-
_rmtree(os.path.join(self.container_root, host))
253+
_rmtree(os.path.join(self.container_root, host), self.image, self.is_studio)
236254
for folder in self._temporary_folders:
237-
_rmtree(os.path.join(self.container_root, folder))
255+
_rmtree(os.path.join(self.container_root, folder), self.image, self.is_studio)
238256
return artifacts
239257

240258
def retrieve_artifacts(
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from unittest.mock import patch, call
14+
import pytest
15+
16+
from sagemaker.train.local.local_container import _rmtree
17+
18+
IMAGE = "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.1-cpu-py310"
19+
20+
21+
class TestRmtree:
22+
"""Test cases for _rmtree function."""
23+
24+
@patch("sagemaker.train.local.local_container.shutil.rmtree")
25+
def test_rmtree_success(self, mock_rmtree):
26+
"""Normal case — shutil.rmtree succeeds."""
27+
_rmtree("/tmp/test", IMAGE)
28+
mock_rmtree.assert_called_once_with("/tmp/test")
29+
30+
@patch("sagemaker.train.local.local_container.shutil.rmtree")
31+
@patch("sagemaker.train.local.local_container.subprocess.run")
32+
def test_rmtree_permission_error_docker_chmod_fallback(self, mock_run, mock_rmtree):
33+
"""PermissionError triggers docker chmod then retry."""
34+
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]
35+
36+
_rmtree("/tmp/test", IMAGE)
37+
38+
mock_run.assert_called_once_with(
39+
["docker", "run", "--rm", "-v", "/tmp/test:/delete", IMAGE, "chmod", "-R", "777", "/delete"],
40+
check=True,
41+
capture_output=True,
42+
)
43+
assert mock_rmtree.call_count == 2
44+
45+
@patch("sagemaker.train.local.local_container.shutil.rmtree")
46+
@patch("sagemaker.train.local.local_container.subprocess.run")
47+
def test_rmtree_studio_adds_network(self, mock_run, mock_rmtree):
48+
"""In Studio, docker run includes --network sagemaker."""
49+
mock_rmtree.side_effect = [PermissionError("Permission denied"), None]
50+
51+
_rmtree("/tmp/test", IMAGE, is_studio=True)
52+
53+
mock_run.assert_called_once_with(
54+
[
55+
"docker", "run", "--rm",
56+
"--network", "sagemaker",
57+
"-v", "/tmp/test:/delete", IMAGE,
58+
"chmod", "-R", "777", "/delete",
59+
],
60+
check=True,
61+
capture_output=True,
62+
)
63+
64+
@patch("sagemaker.train.local.local_container.shutil.rmtree")
65+
@patch("sagemaker.train.local.local_container.subprocess.run")
66+
def test_rmtree_docker_fallback_fails_raises(self, mock_run, mock_rmtree):
67+
"""If docker fallback also fails, the exception propagates."""
68+
mock_rmtree.side_effect = PermissionError("Permission denied")
69+
mock_run.side_effect = Exception("docker failed")
70+
71+
with pytest.raises(Exception, match="docker failed"):
72+
_rmtree("/tmp/test", IMAGE)
73+
74+
@patch("sagemaker.train.local.local_container.shutil.rmtree")
75+
def test_rmtree_no_image_raises(self, mock_rmtree):
76+
"""PermissionError without image raises immediately."""
77+
mock_rmtree.side_effect = PermissionError("Permission denied")
78+
79+
with pytest.raises(PermissionError):
80+
_rmtree("/tmp/test")

0 commit comments

Comments
 (0)