Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sagemaker-core/src/sagemaker/core/workflow/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def get_code_hash(step: Entity) -> str:
source_code = model_trainer.source_code
if source_code:
source_dir = source_code.source_dir
requirements = source_code.requirements
requirements = source_code.requirements or []
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good fix. However, for consistency and defense-in-depth, consider also updating the function signature of get_processing_code_hash to default dependencies to None (if it doesn't already) and document that None is acceptable. Currently the type hint says List[str] — it should be list[str] | None to accurately reflect that None is a valid input.

Also, get_training_code_hash should be checked — the PR description mentions it already has guards, but the requirements or [] fix on this line suggests it may not handle None in all paths either. Please verify.

entry_point = source_code.entry_script
return get_training_code_hash(entry_point, source_dir, requirements)
return None
Expand Down Expand Up @@ -209,6 +209,7 @@ def get_processing_code_hash(code: str, source_dir: str, dependencies: List[str]
Returns:
str: A hash string representing the unique code artifact(s) for the step
"""
dependencies = dependencies or []

# FrameworkProcessor
if source_dir:
Expand Down
69 changes: 69 additions & 0 deletions sagemaker-core/tests/unit/workflow/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,34 @@ def test_get_processing_code_hash_code_only(self):
finally:
os.unlink(temp_file)

Comment thread
aviruthen marked this conversation as resolved.
def test_get_processing_code_hash_with_none_dependencies_and_source_dir(self):
"""Test get_processing_code_hash with None dependencies and source_dir"""
with tempfile.TemporaryDirectory() as temp_dir:
code_file = Path(temp_dir, "script.py")
code_file.write_text("print('hello')")

result = get_processing_code_hash(
code=str(code_file), source_dir=temp_dir, dependencies=None
)

Comment thread
aviruthen marked this conversation as resolved.
assert result is not None
assert len(result) == 64

Comment thread
aviruthen marked this conversation as resolved.
def test_get_processing_code_hash_with_none_dependencies_and_code_only(self):
"""Test get_processing_code_hash with None dependencies and code only"""
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
f.write("print('hello')")
temp_file = f.name

try:
result = get_processing_code_hash(code=temp_file, source_dir=None, dependencies=None)
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: There's an extra blank line here (two blank lines between methods inside a class). PEP 8 convention for methods within a class is one blank line. This is minor but worth noting since CI may flag it.


assert result is not None
assert len(result) == 64
finally:
os.unlink(temp_file)


def test_get_processing_code_hash_s3_uri(self):
"""Test get_processing_code_hash with S3 URI returns None"""
result = get_processing_code_hash(
Expand Down Expand Up @@ -308,6 +336,47 @@ def test_get_training_code_hash_entry_point_only(self):
assert len(result_with_deps) == 64
assert result_no_deps != result_with_deps

def test_get_code_hash_training_step_with_none_requirements(self):
"""Test get_code_hash with TrainingStep whose source_code has requirements=None"""
from sagemaker.core.workflow.utilities import get_code_hash

with tempfile.TemporaryDirectory() as temp_dir:
entry_file = Path(temp_dir, "train.py")
entry_file.write_text("print('training')")

mock_source_code = Mock()
mock_source_code.source_dir = temp_dir
mock_source_code.requirements = None
mock_source_code.entry_script = str(entry_file)

mock_model_trainer = Mock()
mock_model_trainer.source_code = mock_source_code

mock_step_args = Mock()
mock_step_args.func_args = [mock_model_trainer]

mock_step = Mock()
mock_step.step_args = mock_step_args

with patch("sagemaker.core.workflow.utilities.isinstance") as mock_isinstance:
def isinstance_side_effect(obj, cls):
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is extremely fragile and overly complex. Patching builtins.isinstance via sagemaker.core.workflow.utilities.isinstance is a code smell — it's brittle, hard to maintain, and can break in unexpected ways.

Instead, consider constructing a real TrainingStep (or a minimal subclass/mock that passes isinstance checks naturally), or simply test get_code_hash indirectly by testing get_training_code_hash with dependencies=None directly. The requirements or [] fix on line 176 is already covered if you just call get_training_code_hash(entry_point, source_dir, None) in a simpler test.

def test_get_training_code_hash_with_none_dependencies(self):
    """Test get_training_code_hash with None dependencies does not raise TypeError"""
    with tempfile.TemporaryDirectory() as temp_dir:
        entry_file = Path(temp_dir, "train.py")
        entry_file.write_text("print('training')")

        result = get_training_code_hash(
            entry_point=str(entry_file),
            source_dir=temp_dir,
            dependencies=None,
        )

        assert result is not None
        assert len(result) == 64

from sagemaker.mlops.workflow.steps import TrainingStep, ProcessingStep
if cls is ProcessingStep:
return False
if cls is TrainingStep:
return obj is mock_step
return builtins_isinstance(obj, cls)

import builtins
builtins_isinstance = builtins.isinstance
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Importing builtins and storing builtins.isinstance inside a side-effect closure that patches isinstance is very fragile. If the implementation of get_code_hash changes how it checks types, this test will silently break or give false positives. Please simplify this test as suggested above.

mock_isinstance.side_effect = isinstance_side_effect

result = get_code_hash(mock_step)

assert result is not None
assert len(result) == 64


def test_get_training_code_hash_s3_uri(self):
"""Test get_training_code_hash with S3 URI returns None"""
result = get_training_code_hash(
Expand Down
Loading