Skip to content

Commit d246064

Browse files
author
Roja Reddy Sareddy
committed
Nova, LLMFT training support
1 parent 34f53aa commit d246064

File tree

4 files changed

+334
-8
lines changed

4 files changed

+334
-8
lines changed

sagemaker-train/src/sagemaker/train/model_trainer.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@
107107
_get_args_from_recipe,
108108
_determine_device_type,
109109
_is_nova_recipe,
110+
_is_llmft_recipe,
110111
_load_base_recipe,
111112
)
112113

@@ -258,6 +259,7 @@ class ModelTrainer(BaseModel):
258259
_metric_definitions: Optional[List[MetricDefinition]] = PrivateAttr(default=None)
259260

260261
_is_nova_recipe: Optional[bool] = PrivateAttr(default=None)
262+
_is_llmft_recipe: Optional[bool] = PrivateAttr(default=None)
261263
# Private Attributes for Recipes
262264
_temp_recipe_train_dir: Optional[TemporaryDirectory] = PrivateAttr(default=None)
263265

@@ -582,12 +584,12 @@ def _create_training_job_args(
582584

583585
final_input_data_config = list(existing_channels.values()) + new_channels
584586

585-
if self._is_nova_recipe:
587+
if self._is_nova_recipe or self._is_llmft_recipe:
586588
for input_data in final_input_data_config:
587589
if input_data.channel_name == SM_RECIPE:
588590
raise ValueError(
589591
"Cannot use reserved channel name 'recipe' as an input channel name "
590-
" for Nova Recipe"
592+
" for Nova or LLMFT Recipe"
591593
)
592594
recipe_file_path = os.path.join(self._temp_recipe_train_dir.name, SM_RECIPE_YAML)
593595
recipe_channel = self.create_input_data_channel(
@@ -596,7 +598,8 @@ def _create_training_job_args(
596598
key_prefix=input_data_key_prefix,
597599
)
598600
final_input_data_config.append(recipe_channel)
599-
self.hyperparameters.update({"sagemaker_recipe_local_path": SM_RECIPE_CONTAINER_PATH})
601+
if self._is_nova_recipe or self._is_llmft_recipe:
602+
self.hyperparameters.update({"sagemaker_recipe_local_path": SM_RECIPE_CONTAINER_PATH})
600603

601604
if final_input_data_config:
602605
final_input_data_config = self._get_input_data_config(
@@ -1166,14 +1169,15 @@ def from_recipe(
11661169
training_recipe=training_recipe, recipe_overrides=recipe_overrides
11671170
)
11681171
is_nova = _is_nova_recipe(recipe=recipe)
1169-
if device_type == "cpu" and not is_nova:
1172+
is_llmft = _is_llmft_recipe(recipe=recipe)
1173+
if device_type == "cpu" and not (is_nova or is_llmft):
11701174
raise ValueError(
11711175
"Training recipes are not supported for CPU instances. "
11721176
"Please provide a GPU or Trainium instance type."
11731177
)
11741178

1175-
if training_image is None and is_nova:
1176-
raise ValueError("training_image must be provided when using recipe for Nova.")
1179+
if training_image is None and (is_nova or is_llmft):
1180+
raise ValueError("training_image must be provided when using recipe for Nova or LLMFT")
11771181

11781182
if training_image_config and training_image is None:
11791183
raise ValueError("training_image must be provided when using training_image_config.")
@@ -1200,7 +1204,7 @@ def from_recipe(
12001204

12011205
if hyperparameters and not is_nova:
12021206
logger.warning(
1203-
"Hyperparameters are not supported for general training recipes. "
1207+
"Hyperparameters are not supported for general and LLMFT training recipes. "
12041208
+ "Ignoring hyperparameters input."
12051209
)
12061210
if is_nova:
@@ -1227,6 +1231,7 @@ def from_recipe(
12271231
)
12281232

12291233
model_trainer._is_nova_recipe = is_nova
1234+
model_trainer._is_llmft_recipe = is_llmft
12301235
model_trainer._temp_recipe_train_dir = tmp_dir
12311236
return model_trainer
12321237

sagemaker-train/src/sagemaker/train/sm_recipes/utils.py

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,9 @@ def _get_args_from_recipe(
280280
if _is_nova_recipe(recipe):
281281
args, recipe_local_dir = _get_args_from_nova_recipe(recipe, compute, role=role)
282282
return args, recipe_local_dir
283+
if _is_llmft_recipe(recipe):
284+
args, recipe_local_dir = _get_args_from_llmft_recipe(recipe, compute)
285+
return args, recipe_local_dir
283286

284287
if "trainer" not in recipe:
285288
raise ValueError("Supplied recipe does not contain required field trainer.")
@@ -456,4 +459,74 @@ def _get_args_from_nova_recipe(
456459
"distributed": None,
457460
}
458461
)
459-
return args, recipe_local_dir
462+
return args, recipe_local_dir
463+
464+
def _resolve_final_recipe(recipe: dictconfig.DictConfig):
465+
"""Resolve final recipe."""
466+
final_recipe = _try_resolve_recipe(recipe)
467+
if final_recipe is None:
468+
final_recipe = _try_resolve_recipe(recipe, "recipes")
469+
if final_recipe is None:
470+
final_recipe = _try_resolve_recipe(recipe, "training")
471+
if final_recipe is None:
472+
raise RuntimeError("Could not resolve provided recipe.")
473+
474+
return final_recipe
475+
476+
def _is_llmft_recipe(
477+
recipe: dictconfig.DictConfig,
478+
) -> bool:
479+
"""Check if the recipe is an LLMFT recipe.
480+
481+
A recipe is considered an LLMFT recipe if it meets the following conditions:
482+
1. Having a run section
483+
2. The model_type in run is llm_finetuning_aws or verl
484+
3. Having a training_config section
485+
486+
Args:
487+
recipe (DictConfig): The loaded recipe configuration
488+
489+
Returns:
490+
bool: True if the recipe is a LLMFT recipe, False otherwise
491+
"""
492+
run_config = recipe.get("run", {})
493+
model_type = run_config.get("model_type", "").lower()
494+
has_llmft_model = model_type == "llm_finetuning_aws"
495+
has_verl_model = model_type == "verl"
496+
return (bool(has_llmft_model) or bool(has_verl_model)) and bool(recipe.get("training_config"))
497+
498+
def _get_args_from_llmft_recipe(
499+
recipe: dictconfig.DictConfig,
500+
compute: Compute,
501+
) -> Tuple[Dict[str, Any], tempfile.TemporaryDirectory]:
502+
503+
if not compute.instance_count and not recipe.get("trainer", {}).get("num_nodes", None):
504+
raise ValueError(
505+
"Must set ``instance_count`` in compute or ``num_nodes`` in trainer in recipe."
506+
)
507+
if compute.instance_count and recipe.get("trainer", {}).get("num_nodes", None) is not None:
508+
logger.warning(
509+
f"Using Compute to set instance_count:\n{compute}."
510+
"\nIgnoring trainer -> num_nodes in recipe."
511+
)
512+
compute.instance_count = compute.instance_count or recipe.get("trainer", {}).get("num_nodes")
513+
514+
args = dict()
515+
516+
_register_custom_resolvers()
517+
final_recipe = _resolve_final_recipe(recipe)
518+
519+
# Save Final Recipe to tmp dir
520+
recipe_local_dir = tempfile.TemporaryDirectory(prefix="recipe_")
521+
final_recipe_path = os.path.join(recipe_local_dir.name, SM_RECIPE_YAML)
522+
OmegaConf.save(config=final_recipe, f=final_recipe_path)
523+
524+
args.update(
525+
{
526+
"compute": compute,
527+
"training_image": None,
528+
"source_code": None,
529+
"distributed": None,
530+
}
531+
)
532+
return args, recipe_local_dir

sagemaker-train/tests/unit/train/sm_recipes/test_utils.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from unittest.mock import patch, MagicMock
1818

1919
import yaml
20+
from omegaconf import OmegaConf
2021
from urllib.request import urlretrieve
2122
from tempfile import NamedTemporaryFile
2223

@@ -28,7 +29,9 @@
2829
_configure_trainium_args,
2930
_get_trainining_recipe_gpu_model_name_and_script,
3031
_is_nova_recipe,
32+
_is_llmft_recipe,
3133
_get_args_from_nova_recipe,
34+
_get_args_from_llmft_recipe,
3235
)
3336
from sagemaker.train.utils import _run_clone_command_silent
3437
from sagemaker.train.configs import Compute
@@ -272,3 +275,158 @@ def test_get_args_from_recipe_with_evaluation(temporary_recipe):
272275
assert args["hyperparameters"]["lambda_arn"] == "arn:aws:lambda:us-east-1:123456789012:function:MyFunc"
273276
finally:
274277
os.unlink(recipe_path)
278+
279+
@pytest.mark.parametrize(
280+
"test_case",
281+
[
282+
{
283+
"recipe": {
284+
"run": {
285+
"name": "dummy-model",
286+
"model_type": "llm_finetuning_aws",
287+
},
288+
"trainer": {"num_nodes": "12"},
289+
"training_config": {"model_save_name": "xyz"},
290+
},
291+
"is_llmft": True,
292+
},
293+
{
294+
"recipe": {
295+
"run": {
296+
"name": "dummy-model",
297+
"model_type": "llm_finetuning_aws",
298+
},
299+
"training_config": {"model_save_name": "xyz"},
300+
},
301+
"is_llmft": True,
302+
},
303+
{
304+
"recipe": {
305+
"run": {
306+
"name": "dummy-model",
307+
"model_type": "llm_finetuning_aws",
308+
},
309+
},
310+
"is_llmft": False,
311+
},
312+
{
313+
"recipe": {
314+
"run": {
315+
"name": "dummy-model",
316+
"model_type": "xyz",
317+
},
318+
"training_config": {"model_save_name": "xyz"},
319+
},
320+
"is_llmft": False,
321+
},
322+
{
323+
"recipe": {
324+
"run": {
325+
"name": "verl-grpo-llama",
326+
"model_type": "verl",
327+
},
328+
"trainer": {"num_nodes": "1"},
329+
"training_config": {"trainer": {"total_epochs": 2}},
330+
},
331+
"is_llmft": True,
332+
},
333+
{
334+
"recipe": {
335+
"run": {
336+
"name": "verl-grpo-llama",
337+
"model_type": "verl",
338+
},
339+
},
340+
"is_llmft": False,
341+
},
342+
],
343+
ids=[
344+
"llmft_model",
345+
"llmft_model_subtype",
346+
"llmft_missing_training_config",
347+
"non_llmft_model",
348+
"verl_model",
349+
"verl_missing_training_config",
350+
],
351+
)
352+
def test_is_llmft_recipe(test_case):
353+
recipe = OmegaConf.create(test_case["recipe"])
354+
is_llmft = _is_llmft_recipe(recipe)
355+
assert is_llmft == test_case["is_llmft"]
356+
357+
358+
@patch("sagemaker.train.sm_recipes.utils._get_args_from_llmft_recipe")
359+
def test_get_args_from_recipe_with_llmft_and_role(mock_get_args_from_llmft_recipe):
360+
# Set up mock return value
361+
mock_args = {}
362+
mock_dir = MagicMock()
363+
mock_get_args_from_llmft_recipe.return_value = (mock_args, mock_dir)
364+
365+
recipe = {
366+
"run": {
367+
"name": "dummy-model",
368+
"model_type": "llm_finetuning_aws",
369+
},
370+
"trainer": {"num_nodes": "12"},
371+
"training_config": {"model_save_name": "xyz"},
372+
}
373+
compute = Compute(instance_type="ml.g5.xlarge")
374+
role = "arn:aws:iam::123456789012:role/SageMakerRole"
375+
376+
# Mock the LLMFT recipe detection to return True
377+
with patch("sagemaker.train.sm_recipes.utils._is_llmft_recipe", return_value=True):
378+
_get_args_from_recipe(
379+
training_recipe=recipe,
380+
compute=compute,
381+
region_name="us-west-2",
382+
recipe_overrides=None,
383+
requirements=None,
384+
role=role,
385+
)
386+
387+
# Verify _get_args_from_llmft_recipe was called
388+
mock_get_args_from_llmft_recipe.assert_called_once_with(recipe, compute)
389+
390+
391+
@pytest.mark.parametrize(
392+
"test_case",
393+
[
394+
{
395+
"recipe": {
396+
"run": {
397+
"name": "dummy-model",
398+
"model_type": "llm_finetuning_aws",
399+
},
400+
"trainer": {"num_nodes": "12"},
401+
"training_config": {"model_save_name": "xyz"},
402+
},
403+
"compute": Compute(instance_type="ml.m5.xlarge", instance_count=2),
404+
"expected_args": {
405+
"compute": Compute(instance_type="ml.m5.xlarge", instance_count=2),
406+
"training_image": None,
407+
"source_code": None,
408+
"distributed": None,
409+
},
410+
},
411+
{
412+
"recipe": {
413+
"run": {
414+
"name": "dummy-model",
415+
"model_type": "llm_finetuning_aws",
416+
},
417+
"training_config": {"model_save_name": "xyz"},
418+
},
419+
"compute": Compute(instance_type="ml.m5.xlarge", instance_count=2),
420+
"expected_args": {
421+
"compute": Compute(instance_type="ml.m5.xlarge", instance_count=2),
422+
"training_image": None,
423+
"source_code": None,
424+
"distributed": None,
425+
},
426+
},
427+
],
428+
)
429+
def test_get_args_from_llmft_recipe(test_case):
430+
recipe = OmegaConf.create(test_case["recipe"])
431+
args, _ = _get_args_from_llmft_recipe(recipe=recipe, compute=test_case["compute"])
432+
assert args == test_case["expected_args"]

0 commit comments

Comments
 (0)