tests: add gpu_intensive marker and a GitHub Actions workflow that runs GPU-intensive tests three times per day

lucasjia-aws · lucasjia-aws · commit 181df7138b70 · 2026-05-26T14:15:44.000-07:00
diff --git a/.github/workflows/gpu-integ-tests.yml b/.github/workflows/gpu-integ-tests.yml
@@ -0,0 +1,24 @@
+name: GPU Integ Tests
+on:
+  schedule:
+    - cron: "0 */8 * * *" # minute 0 of every 8th hour (00:00, 08:00, 16:00 UTC): three runs per day
+  workflow_dispatch: # also allows the workflow to be started manually
+
+permissions:
+    id-token: write # Required to request the OIDC JWT used by the credentials step to assume the AWS role
+
+jobs:
+  gpu-integ-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
+          aws-region: us-west-2
+          role-duration-seconds: 10800 # 3 hours
+      - name: Run GPU Integ Tests
+        uses: aws-actions/aws-codebuild-run-build@v1
+        with:
+          project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
+          source-version: refs/heads/master # always builds master, independent of the triggering ref
diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py
@@ -23,6 +23,8 @@
     EvaluationPipelineExecution,
 )
 
+pytestmark = pytest.mark.gpu_intensive
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
diff --git a/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py b/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py
@@ -22,6 +22,8 @@
     EvaluationPipelineExecution,
 )
 
+pytestmark = pytest.mark.gpu_intensive
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
diff --git a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py
@@ -21,6 +21,8 @@
 from sagemaker.train.common import TrainingType
 import pytest
 
+pytestmark = pytest.mark.gpu_intensive
+
 
 def test_dpo_trainer_lora_complete_workflow(sagemaker_session):
     """Test complete DPO training workflow with LORA."""
diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
@@ -28,6 +28,8 @@
     EvaluationPipelineExecution,
 )
 
+pytestmark = pytest.mark.gpu_intensive
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py
@@ -22,6 +22,8 @@
     EvaluationPipelineExecution,
 )
 
+pytestmark = pytest.mark.gpu_intensive
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
diff --git a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py
@@ -21,6 +21,8 @@
 from sagemaker.train.common import TrainingType
 import pytest
 
+pytestmark = pytest.mark.gpu_intensive
+
 
 def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
     """Test complete RLAIF training workflow with LORA."""
diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py
@@ -21,6 +21,8 @@
 from sagemaker.train.rlvr_trainer import RLVRTrainer
 from sagemaker.train.common import TrainingType
 
+pytestmark = pytest.mark.gpu_intensive
+
 
 def test_rlvr_trainer_lora_complete_workflow(sagemaker_session):
     """Test complete RLVR training workflow with LORA."""
diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py
@@ -21,6 +21,8 @@
 from sagemaker.train.sft_trainer import SFTTrainer
 from sagemaker.train.common import TrainingType
 
+pytestmark = pytest.mark.gpu_intensive
+
 
 def test_sft_trainer_lora_complete_workflow(sagemaker_session):
     """Test complete SFT training workflow with LORA."""
diff --git a/sagemaker-train/tests/integ/train/test_tuner_distributed.py b/sagemaker-train/tests/integ/train/test_tuner_distributed.py
@@ -28,6 +28,8 @@
 from sagemaker.train.configs import SourceCode, Compute
 from sagemaker.train.distributed import Torchrun
 from sagemaker.train.tuner import HyperparameterTuner
 from sagemaker.core.parameter import ContinuousParameter
+
+pytestmark = pytest.mark.gpu_intensive  # marks every test in this module; kept below all imports (E402)
 
 logger = logging.getLogger(__name__)
diff --git a/sagemaker-train/tox.ini b/sagemaker-train/tox.ini
@@ -62,6 +62,7 @@ markers =
     slow_test
     release
     image_uris_unit_test
+    gpu_intensive: mark a test as GPU resource intensive (runs on scheduled CI, not PR checks).
     timeout: mark a test as a timeout.
     serial: marks tests that must run serially (not in parallel)
 

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,8 @@`
`23`	`23`	`EvaluationPipelineExecution,`
`24`	`24`	`)`
`25`	`25`
	`26`	`+pytestmark = pytest.mark.gpu_intensive`
	`27`	`+`
`26`	`28`	`# Configure logging`
`27`	`29`	`logging.basicConfig(`
`28`	`30`	`level=logging.INFO,`
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,8 @@`
`22`	`22`	`EvaluationPipelineExecution,`
`23`	`23`	`)`
`24`	`24`
	`25`	`+pytestmark = pytest.mark.gpu_intensive`
	`26`	`+`
`25`	`27`	`# Configure logging`
`26`	`28`	`logging.basicConfig(`
`27`	`29`	`level=logging.INFO,`
Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,8 @@`
`28`	`28`	`EvaluationPipelineExecution,`
`29`	`29`	`)`
`30`	`30`
	`31`	`+pytestmark = pytest.mark.gpu_intensive`
	`32`	`+`
`31`	`33`	`# Configure logging`
`32`	`34`	`logging.basicConfig(`
`33`	`35`	`level=logging.INFO,`