Merge branch 'aws:master' into master-mtrl-eval-issue-fix

rsareddy0329 · web-flow · commit 77fb3afc7647 · 2026-06-15T13:42:40.000-07:00
diff --git a/.github/workflows/gpu-integ-tests.yml b/.github/workflows/gpu-integ-tests.yml
@@ -1,14 +1,53 @@
 name: GPU Integ Tests
 on:
   schedule:
-    - cron: "0 */8 * * *"
+    # 06/09/12 UTC = 10:00 PM / 1:00 AM / 4:00 AM PST (UTC-8); one hour later under PDT (UTC-7).
+    # All three fire within the same UTC day so the run-level CloudWatch metric
+    # (GpuIntegRunFailure) aggregates correctly per day.
+    - cron: "0 6 * * *"
+    - cron: "0 9 * * *"
+    - cron: "0 12 * * *"
   workflow_dispatch:
 
 permissions:
-    id-token: write # This is required for requesting the JWT
+  id-token: write # This is required for requesting the JWT
+  actions: read   # required for the gate job to query prior runs of this workflow
 
 jobs:
+  # Gate: if an earlier scheduled run already succeeded today, skip the rest of
+  # today's scheduled runs. Manual (workflow_dispatch) runs always proceed.
+  check-prior-success:
+    runs-on: ubuntu-latest
+    outputs:
+      already_succeeded: ${{ steps.check.outputs.already_succeeded }}
+    steps:
+      - name: Check for a successful scheduled run earlier today
+        id: check
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          if [ "${{ github.event_name }}" != "schedule" ]; then
+            echo "Not a scheduled run; proceeding."
+            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          today=$(date -u +%Y-%m-%d)
+          count=$(gh api -X GET \
+            "/repos/${{ github.repository }}/actions/workflows/gpu-integ-tests.yml/runs" \
+            -f event=schedule \
+            -f status=success \
+            -f "created=>=${today}T00:00:00Z" \
+            --jq '.workflow_runs | length')
+          echo "Successful scheduled runs today: $count"
+          if [ "$count" -gt 0 ]; then
+            echo "already_succeeded=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
+          fi
+
   gpu-integ-tests:
+    needs: check-prior-success
+    if: needs.check-prior-success.outputs.already_succeeded != 'true'
     runs-on: ubuntu-latest
     steps:
       - name: Configure AWS Credentials
@@ -24,6 +63,8 @@ jobs:
           source-version: refs/heads/master
 
   gpu-integ-tests-us-east-1:
+    needs: check-prior-success
+    if: needs.check-prior-success.outputs.already_succeeded != 'true'
     runs-on: ubuntu-latest
     steps:
       - name: Configure AWS Credentials (us-east-1)
@@ -37,3 +78,48 @@ jobs:
         with:
           project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
           source-version: refs/heads/master
+
+  # Run-level result: a run is successful only if BOTH region jobs succeeded.
+  # Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
+  # us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
+  # cuts a daytime sev2 when all of the day's runs failed. Skipped when the gate
+  # short-circuited today's run (an earlier run already succeeded).
+  report-result:
+    needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1]
+    # Only run for workflow runs that actually executed the test jobs (the
+    # schedule-only filter is applied inside the metric step below):
+    #   - check-prior-success.result == 'success': if the gate job itself failed,
+    #     the test jobs are skipped; without this guard always() would still run
+    #     report-result and read those skips as a (false) failure -> emit 1.
+    #   - already_succeeded != 'true': an earlier run today already passed, so the
+    #     gate short-circuited this run; nothing to report.
+    if: always() && needs.check-prior-success.result == 'success' && needs.check-prior-success.outputs.already_succeeded != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }}
+          aws-region: us-west-2
+      - name: Emit run-level pass/fail metric
+        run: |
+          # Manual (workflow_dispatch) runs must not contribute to the daily
+          # GpuIntegRunFailure count that drives GpuIntegRunAlarm; only scheduled
+          # runs count toward the "all of today's scheduled runs failed" alarm.
+          if [ "${{ github.event_name }}" != "schedule" ]; then
+            echo "Not a scheduled run (${{ github.event_name }}); skipping metric emission."
+            exit 0
+          fi
+          if [ "${{ needs.gpu-integ-tests.result }}" == "success" ] && \
+             [ "${{ needs.gpu-integ-tests-us-east-1.result }}" == "success" ]; then
+            value=0
+            echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
+          else
+            value=1
+            echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
+          fi
+          aws cloudwatch put-metric-data \
+            --namespace GpuIntegRunMetrics \
+            --metric-name GpuIntegRunFailure \
+            --value "$value" \
+            --unit Count
diff --git a/.github/workflows/pr-checks-master.yml b/.github/workflows/pr-checks-master.yml
@@ -118,26 +118,40 @@ jobs:
             fi
           }
           
-          # Check which submodules changed and add them plus their dependents
-          if echo "$CHANGES" | grep -q "^sagemaker-core/"; then
-            echo "sagemaker-core changed - will add core and all dependents"
-            add_module_and_dependents "sagemaker-core"
-          fi
+          # Determine whether a module has any non-test changes. A change counts
+          # as a source change if it touches anything under the module other than
+          # its tests/ directory (e.g. src/, pyproject.toml, tox.ini, VERSION).
+          # This is intentionally conservative: only changes confined entirely to
+          # tests/ are treated as test-only.
+          is_source_changed() {
+            local module=$1
+            echo "$CHANGES" | grep "^$module/" | grep -qv "^$module/tests/"
+          }
           
-          if echo "$CHANGES" | grep -q "^sagemaker-train/"; then
-            echo "sagemaker-train changed - will add train and all dependents"
-            add_module_and_dependents "sagemaker-train"
-          fi
+          all_modules=("sagemaker-core" "sagemaker-train" "sagemaker-serve" "sagemaker-mlops")
           
-          if echo "$CHANGES" | grep -q "^sagemaker-serve/"; then
-            echo "sagemaker-serve changed - will add serve and all dependents"
-            add_module_and_dependents "sagemaker-serve"
-          fi
+          # Pass 1: modules with source changes pull in themselves plus every
+          # module that (transitively) depends on them, since a source change can
+          # affect downstream behaviour. (Previously done for any change, tests included.)
+          for module in "${all_modules[@]}"; do
+            if is_source_changed "$module"; then
+              echo "$module has source changes - adding it and all dependents"
+              add_module_and_dependents "$module"
+            fi
+          done
           
-          if echo "$CHANGES" | grep -q "^sagemaker-mlops/"; then
-            echo "sagemaker-mlops changed - will add mlops"
-            add_module_and_dependents "sagemaker-mlops"
-          fi
+          # Pass 2: modules with test-only changes add only themselves and skip
+          # dependency propagation, since changing a module's tests cannot affect
+          # other modules. Run after Pass 1 so source-change propagation is never
+          # short-circuited by a test-only module already being in the set.
+          for module in "${all_modules[@]}"; do
+            if echo "$CHANGES" | grep -q "^$module/" && ! is_source_changed "$module"; then
+              if [ -z "${SUBMODULES_SET[$module]}" ]; then
+                echo "$module has test-only changes - adding only $module"
+                SUBMODULES_SET["$module"]=1
+              fi
+            fi
+          done
           
           # Convert associative array to JSON array
           SUBMODULES='[]'
diff --git a/README.rst b/README.rst
@@ -220,7 +220,6 @@ Supported Python Versions
 
 SageMaker Python SDK is tested on:
 
-- Python 3.9
 - Python 3.10
 - Python 3.11
 - Python 3.12
diff --git a/docs/installation.rst b/docs/installation.rst
@@ -16,7 +16,7 @@ Prerequisites
 ---------------
 
 **Python Version**
-  SageMaker Python SDK V3 supports Python 3.9, 3.10, 3.11, and 3.12
+  SageMaker Python SDK V3 supports Python 3.10, 3.11, and 3.12
 
 **Operating Systems**
   - Linux
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -6,7 +6,7 @@ Get started with SageMaker Python SDK V3 in minutes. This guide walks you throug
 Prerequisites
 -------------
 
-* Python 3.9+ installed
+* Python 3.10+ installed
 * AWS account with appropriate permissions
 * AWS credentials configured
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ name = "sagemaker"
 dynamic = ["version"]
 description = "Open source library for training and deploying models on Amazon SageMaker."
 readme = "README.rst"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 authors = [
   { name = "Amazon Web Services" },
 ]
@@ -27,7 +27,6 @@ classifiers = [
   "License :: OSI Approved :: Apache Software License",
   "Natural Language :: English",
   "Programming Language :: Python",
-  "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
diff --git a/sagemaker-core/pyproject.toml b/sagemaker-core/pyproject.toml
@@ -39,12 +39,11 @@ dependencies = [
     "tblib>=1.7.0",
     "cryptography>=46.0.0",
 ]
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Developers",
     "License :: OSI Approved :: Apache Software License",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
diff --git a/sagemaker-core/src/sagemaker/core/user_agent.py b/sagemaker-core/src/sagemaker/core/user_agent.py
@@ -74,3 +74,5 @@ def get_user_agent_extra_suffix():
         suffix = "{} md/{}#{}".format(suffix, STUDIO_PREFIX, studio_app_type)
 
     return suffix
+
+# Trigger PR check: run full integ test suite.
diff --git a/sagemaker-core/tox.ini b/sagemaker-core/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 isolated_build = true
-envlist = black-format,flake8,pylint,docstyle,sphinx,doc8,twine,py39,py310,py311,py312
+envlist = black-format,flake8,pylint,docstyle,sphinx,doc8,twine,py310,py311,py312
 skip_missing_interpreters = False
 
 [flake8]
@@ -86,7 +86,7 @@ allowlist_externals =
     pytest
 commands =
     python -c "import os; os.system('install-custom-pkgs --install-boto-wheels')"
-    pip install 'apache-airflow==2.10.4' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.9.txt"
+    pip install 'apache-airflow==2.10.4' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.10.txt"
     pip install 'torch==2.3.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html'
     pip install 'torchvision==0.18.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html'
     pip install 'dill>=0.3.9'
@@ -98,7 +98,7 @@ deps =
     .[test]
     mock
 depends =
-    {py39,py310,py311,py312}: clean
+    {py310,py311,py312}: clean
 
 [testenv:py312]
 basepython = python3.12
diff --git a/sagemaker-mlops/pyproject.toml b/sagemaker-mlops/pyproject.toml
@@ -8,7 +8,7 @@ dynamic = ["version"]
 description = "SageMaker MLOps package for workflow orchestration and model building"
 readme = "README.md"
 license = {file = "LICENSE"}
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 authors = [
     {name = "Amazon Web Services"},
 ]
@@ -17,7 +17,6 @@ classifiers = [
     "Intended Audience :: Developers",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
diff --git a/sagemaker-mlops/tox.ini b/sagemaker-mlops/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 isolated_build = true
-envlist = black-format,flake8,pylint,docstyle,sphinx,doc8,twine,py39,py310,py311,py312
+envlist = black-format,flake8,pylint,docstyle,sphinx,doc8,twine,py310,py311,py312
 
 skip_missing_interpreters = False
 
@@ -88,7 +88,7 @@ allowlist_externals =
     pytest
 commands =
     python -c "import os; os.system('install-custom-pkgs --install-boto-wheels')"
-    pip install 'apache-airflow==2.10.4' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.9.txt"
+    pip install 'apache-airflow==2.10.4' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.10.txt"
     pip install 'torch==2.8.0' 'torchvision==0.23.0'
     pip install 'dill>=0.3.9'
 
@@ -101,7 +101,7 @@ deps =
     .[test]
     mock
 depends =
-    {py39,py310,py311,py312}: clean
+    {py310,py311,py312}: clean
 
 [testenv:py312]
 basepython = python3.12
diff --git a/sagemaker-serve/pyproject.toml b/sagemaker-serve/pyproject.toml
@@ -8,7 +8,7 @@ dynamic = ["version"]
 description = "SageMaker Serve package for model serving and deployment"
 readme = "README.md"
 license = {file = "LICENSE"}
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 authors = [
     {name = "Amazon Web Services"},
 ]
@@ -17,7 +17,6 @@ classifiers = [
     "Intended Audience :: Developers",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
diff --git a/sagemaker-serve/tests/integ/test_bedrock_provisioned_throughput.py b/sagemaker-serve/tests/integ/test_bedrock_provisioned_throughput.py
@@ -17,6 +17,7 @@
 import time
 import random
 import logging
+from datetime import datetime, timezone, timedelta
 from urllib.parse import urlparse
 
 import boto3
@@ -43,10 +44,59 @@ def role_arn():
     return get_execution_role()
 
 
+# Prefix used for all provisioned throughputs created by this test module.
+PT_TEST_PREFIX = "test-pt-integ-"
+# Provisioned throughputs older than this are considered leaked and reaped on setup.
+PT_STALE_AGE = timedelta(hours=2)
+
+
 @pytest.fixture(scope="module")
 def bedrock_client():
-    """Create Bedrock client."""
-    return boto3.client("bedrock", region_name=AWS_REGION)
+    """Create Bedrock client and eagerly reap leaked test provisioned throughputs.
+
+    Provisioned throughputs cost money and consume a small, easily-exhausted
+    model-unit quota. A test process killed before its teardown runs (CodeBuild
+    timeout, worker crash, etc.) leaks its PT, and these accumulate across runs
+    until the quota is full and CreateProvisionedModelThroughput starts failing.
+
+    To stay self-healing, on setup we delete any InService/Failed
+    ``test-pt-integ-*`` PT older than PT_STALE_AGE. The age guard avoids racing
+    a PT that another concurrent run just created.
+    """
+    client = boto3.client("bedrock", region_name=AWS_REGION)
+
+    try:
+        cutoff = datetime.now(timezone.utc) - PT_STALE_AGE
+        paginator_token = None
+        while True:
+            params = {"maxResults": 100}
+            if paginator_token:
+                params["nextToken"] = paginator_token
+            response = client.list_provisioned_model_throughputs(**params)
+            for pt in response.get("provisionedModelSummaries", []):
+                name = pt.get("provisionedModelName", "")
+                if not name.startswith(PT_TEST_PREFIX):
+                    continue
+                created = pt.get("creationTime")
+                if created and created >= cutoff:
+                    continue
+                # Only InService/Failed PTs can be deleted.
+                if pt.get("status") not in ("InService", "Failed"):
+                    continue
+                try:
+                    logger.info("Eager cleanup of stale provisioned throughput: %s", name)
+                    client.delete_provisioned_model_throughput(
+                        provisionedModelId=pt["provisionedModelArn"]
+                    )
+                except Exception as e:
+                    logger.warning("Eager cleanup failed for %s: %s", name, e)
+            paginator_token = response.get("nextToken")
+            if not paginator_token:
+                break
+    except Exception as e:
+        logger.warning("Failed to list provisioned throughputs for eager cleanup: %s", e)
+
+    return client
 
 
 @pytest.fixture(scope="module")
diff --git a/sagemaker-serve/tox.ini b/sagemaker-serve/tox.ini
diff --git a/sagemaker-train/README.rst b/sagemaker-train/README.rst
diff --git a/sagemaker-train/pyproject.toml b/sagemaker-train/pyproject.toml
diff --git a/sagemaker-train/tests/integ/train/test_mtrl_evaluator.py b/sagemaker-train/tests/integ/train/test_mtrl_evaluator.py
diff --git a/sagemaker-train/tox.ini b/sagemaker-train/tox.ini