Skip to content

Commit b6f1365

Browse files
authored
test: unskip gpu intensive tests (aws#5892)
* test: unskip GPU-intensive tests * tests: add a new pytest marker and a new GitHub Action that runs the GPU-intensive tests three times per day
1 parent d18005d commit b6f1365

14 files changed

Lines changed: 113 additions & 50 deletions
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: GPU Integ Tests
2+
on:
3+
schedule:
4+
- cron: "0 */8 * * *"
5+
workflow_dispatch:
6+
7+
permissions:
8+
id-token: write # This is required for requesting the JWT
9+
10+
jobs:
11+
gpu-integ-tests:
12+
runs-on: ubuntu-latest
13+
steps:
14+
- name: Configure AWS Credentials
15+
uses: aws-actions/configure-aws-credentials@v4
16+
with:
17+
role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
18+
aws-region: us-west-2
19+
role-duration-seconds: 10800
20+
- name: Run GPU Integ Tests
21+
uses: aws-actions/aws-codebuild-run-build@v1
22+
with:
23+
project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
24+
source-version: refs/heads/master

sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def _resolve_mlflow_resource_arn(sagemaker_session, mlflow_resource_arn: Optiona
105105
return mlflow_resource_arn
106106

107107
try:
108+
108109
mlflow_apps = MlflowApp.get_all(
109110
session=sagemaker_session.boto_session,
110111
region=sagemaker_session.boto_session.region_name

sagemaker-train/src/sagemaker/train/rlvr_trainer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ def _process_hyperparameters(self):
171171
if hasattr(self.hyperparameters, 'reward_lambda_arn'):
172172
delattr(self.hyperparameters, 'reward_lambda_arn')
173173
self.hyperparameters._specs.pop('reward_lambda_arn', None)
174+
if hasattr(self.hyperparameters, 'preset_reward_function'):
175+
delattr(self.hyperparameters, 'preset_reward_function')
176+
self.hyperparameters._specs.pop('preset_reward_function', None)
174177
if hasattr(self.hyperparameters, 'data_path'):
175178
delattr(self.hyperparameters, 'data_path')
176179
self.hyperparameters._specs.pop('data_path', None)

sagemaker-train/tests/integ/train/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,13 @@ def sagemaker_session():
3838

3939
if region_manual_set and "AWS_DEFAULT_REGION" in os.environ:
4040
del os.environ["AWS_DEFAULT_REGION"]
41+
42+
43+
NOVA_REGION = "us-east-1"
44+
45+
46+
@pytest.fixture(scope="module")
47+
def sagemaker_session_us_east_1():
48+
"""Create a SageMaker session in us-east-1 for Nova model tests."""
49+
boto_session = boto3.Session(region_name=NOVA_REGION)
50+
return Session(boto_session=boto_session)

sagemaker-train/tests/integ/train/test_benchmark_evaluator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
EvaluationPipelineExecution,
2424
)
2525

26+
pytestmark = pytest.mark.gpu_intensive
27+
2628
# Configure logging
2729
logging.basicConfig(
2830
level=logging.INFO,

sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
EvaluationPipelineExecution,
2323
)
2424

25+
pytestmark = pytest.mark.gpu_intensive
26+
2527
# Configure logging
2628
logging.basicConfig(
2729
level=logging.INFO,

sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,21 @@
2121
from sagemaker.train.common import TrainingType
2222
import pytest
2323

24+
pytestmark = pytest.mark.gpu_intensive
25+
2426

25-
@pytest.mark.skip(reason="Skipping GPU resource intensive test")
2627
def test_dpo_trainer_lora_complete_workflow(sagemaker_session):
2728
"""Test complete DPO training workflow with LORA."""
29+
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
2830
# Create DPOTrainer instance with comprehensive configuration
2931
trainer = DPOTrainer(
3032
model="meta-textgeneration-llama-3-2-1b-instruct",
3133
training_type=TrainingType.LORA,
3234
model_package_group="sdk-test-finetuned-models",
33-
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
35+
training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
3436
s3_output_path="s3://mc-flows-sdk-testing/output/",
35-
accept_eula=True
37+
accept_eula=True,
38+
base_job_name=f"dpo-lora-integ-{unique_id}",
3639
)
3740

3841
# Customize hyperparameters for quick training
@@ -61,18 +64,19 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session):
6164
assert training_job.output_model_package_arn is not None
6265

6366

64-
@pytest.mark.skip(reason="Skipping GPU resource intensive test")
6567
def test_dpo_trainer_with_validation_dataset(sagemaker_session):
6668
"""Test DPO trainer with both training and validation datasets."""
69+
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
6770

6871
dpo_trainer = DPOTrainer(
6972
model="meta-textgeneration-llama-3-2-1b-instruct",
7073
training_type=TrainingType.LORA,
7174
model_package_group="sdk-test-finetuned-models",
72-
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
73-
validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
75+
training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
76+
validation_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
7477
s3_output_path="s3://mc-flows-sdk-testing/output/",
75-
accept_eula=True
78+
accept_eula=True,
79+
base_job_name=f"dpo-val-integ-{unique_id}",
7680
)
7781

7882
# Customize hyperparameters for quick training

sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
EvaluationPipelineExecution,
2929
)
3030

31+
pytestmark = pytest.mark.gpu_intensive
32+
3133
# Configure logging
3234
logging.basicConfig(
3335
level=logging.INFO,

sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
EvaluationPipelineExecution,
2323
)
2424

25+
pytestmark = pytest.mark.gpu_intensive
26+
2527
# Configure logging
2628
logging.basicConfig(
2729
level=logging.INFO,

sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,19 @@
1414
from __future__ import absolute_import
1515

1616
import time
17+
import random
1718
import boto3
1819
from sagemaker.core.helper.session_helper import Session
1920
from sagemaker.train.rlaif_trainer import RLAIFTrainer
2021
from sagemaker.train.common import TrainingType
2122
import pytest
2223

24+
pytestmark = pytest.mark.gpu_intensive
25+
2326

24-
@pytest.mark.skip(reason="Skipping GPU resource intensive test")
2527
def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
2628
"""Test complete RLAIF training workflow with LORA."""
29+
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
2730

2831
rlaif_trainer = RLAIFTrainer(
2932
model="meta-textgeneration-llama-3-2-1b-instruct",
@@ -33,9 +36,10 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
3336
reward_prompt='Builtin.Summarize',
3437
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
3538
mlflow_run_name="test-rlaif-finetuned-models-run",
36-
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
39+
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
3740
s3_output_path="s3://mc-flows-sdk-testing/output/",
38-
accept_eula=True
41+
accept_eula=True,
42+
base_job_name=f"rlaif-lora-integ-{unique_id}",
3943
)
4044

4145
# Create training job
@@ -61,9 +65,9 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
6165
assert training_job.output_model_package_arn is not None
6266

6367

64-
@pytest.mark.skip(reason="Skipping GPU resource intensive test")
6568
def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
6669
"""Test RLAIF trainer with different reward model and prompt."""
70+
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
6771

6872
rlaif_trainer = RLAIFTrainer(
6973
model="meta-textgeneration-llama-3-2-1b-instruct",
@@ -73,9 +77,10 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
7377
reward_prompt="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlaif-test-prompt/0.0.1",
7478
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
7579
mlflow_run_name="test-rlaif-finetuned-models-run",
76-
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
80+
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
7781
s3_output_path="s3://mc-flows-sdk-testing/output/",
78-
accept_eula=True
82+
accept_eula=True,
83+
base_job_name=f"rlaif-rwd-integ-{unique_id}",
7984
)
8085

8186
training_job = rlaif_trainer.train(wait=False)
@@ -100,9 +105,9 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
100105
assert training_job.output_model_package_arn is not None
101106

102107

103-
@pytest.mark.skip(reason="Skipping GPU resource intensive test")
104108
def test_rlaif_trainer_continued_finetuning(sagemaker_session):
105109
"""Test complete RLAIF training workflow with LORA."""
110+
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
106111

107112
rlaif_trainer = RLAIFTrainer(
108113
model="arn:aws:sagemaker:us-west-2:729646638167:model-package/sdk-test-finetuned-models/1",
@@ -112,9 +117,10 @@ def test_rlaif_trainer_continued_finetuning(sagemaker_session):
112117
reward_prompt='Builtin.Summarize',
113118
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
114119
mlflow_run_name="test-rlaif-finetuned-models-run",
115-
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
120+
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
116121
s3_output_path="s3://mc-flows-sdk-testing/output/",
117-
accept_eula=True
122+
accept_eula=True,
123+
base_job_name=f"rlaif-cont-integ-{unique_id}",
118124
)
119125

120126
# Create training job

0 commit comments

Comments
 (0)