Skip to content

Commit 77fb3af

Browse files
authored
Merge branch 'aws:master' into master-mtrl-eval-issue-fix
2 parents b8558b0 + a15a449 commit 77fb3af

18 files changed

Lines changed: 229 additions & 55 deletions

File tree

.github/workflows/gpu-integ-tests.yml

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,53 @@
11
name: GPU Integ Tests
22
on:
33
schedule:
4-
- cron: "0 */8 * * *"
4+
# US Pacific (PST, UTC-8): 10:00 PM / 1:00 AM / 4:00 AM -> 06:00/09:00/12:00 UTC (cron is evaluated in UTC, so local times are one hour later during PDT, UTC-7).
5+
# All three fire within the same UTC day so the run-level CloudWatch metric
6+
# (GpuIntegRunFailure) aggregates correctly per day.
7+
- cron: "0 6 * * *"
8+
- cron: "0 9 * * *"
9+
- cron: "0 12 * * *"
510
workflow_dispatch:
611

712
permissions:
8-
id-token: write # This is required for requesting the JWT
13+
id-token: write # This is required for requesting the JWT
14+
actions: read # required for the gate job to query prior runs of this workflow
915

1016
jobs:
17+
# Gate: if an earlier scheduled run already succeeded today, skip the rest of
18+
# today's scheduled runs. Manual (workflow_dispatch) runs always proceed.
19+
check-prior-success:
  # Gate job: decides whether today's remaining scheduled runs can be skipped.
  # Exposes `already_succeeded` ('true' / 'false') for downstream `if:` guards.
  runs-on: ubuntu-latest
  outputs:
    already_succeeded: ${{ steps.check.outputs.already_succeeded }}
  steps:
    - name: Check for a successful scheduled run earlier today
      id: check
      env:
        GH_TOKEN: ${{ github.token }}
        # Contexts are passed through the environment rather than being
        # interpolated into the script text.
        EVENT_NAME: ${{ github.event_name }}
        REPOSITORY: ${{ github.repository }}
      run: |
        # Manual (workflow_dispatch) runs always proceed.
        if [ "$EVENT_NAME" != "schedule" ]; then
          echo "Not a scheduled run; proceeding."
          echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # Start of the current UTC day; the schedule keeps all runs inside it.
        today=$(date -u +%Y-%m-%d)

        # Count successful scheduled runs of this workflow created today.
        # Fail open: the gate only exists to save redundant runs, so an API
        # error must not fail this job (which would skip every test job).
        if ! count=$(gh api -X GET \
            "/repos/${REPOSITORY}/actions/workflows/gpu-integ-tests.yml/runs" \
            -f event=schedule \
            -f status=success \
            -f "created=>=${today}T00:00:00Z" \
            --jq '.workflow_runs | length'); then
          echo "::warning::Could not query prior workflow runs; proceeding with tests."
          count=0
        fi

        # Guard the numeric comparison below against empty or malformed output.
        case "$count" in
          '' | *[!0-9]*)
            echo "::warning::Unexpected run count '${count}'; proceeding with tests."
            count=0
            ;;
        esac

        echo "Successful scheduled runs today: $count"
        if [ "$count" -gt 0 ]; then
          echo "already_succeeded=true" >> "$GITHUB_OUTPUT"
        else
          echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
        fi
47+
1148
gpu-integ-tests:
49+
needs: check-prior-success
50+
if: needs.check-prior-success.outputs.already_succeeded != 'true'
1251
runs-on: ubuntu-latest
1352
steps:
1453
- name: Configure AWS Credentials
@@ -24,6 +63,8 @@ jobs:
2463
source-version: refs/heads/master
2564

2665
gpu-integ-tests-us-east-1:
66+
needs: check-prior-success
67+
if: needs.check-prior-success.outputs.already_succeeded != 'true'
2768
runs-on: ubuntu-latest
2869
steps:
2970
- name: Configure AWS Credentials (us-east-1)
@@ -37,3 +78,48 @@ jobs:
3778
with:
3879
project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
3980
source-version: refs/heads/master
81+
82+
# Run-level result: a run is successful only if BOTH region jobs succeeded.
83+
# Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
84+
# us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
85+
# cuts a daytime sev2 when all of the day's runs failed. Skipped when the gate
86+
# short-circuited today's run (an earlier run already succeeded).
87+
report-result:
  needs:
    - check-prior-success
    - gpu-integ-tests
    - gpu-integ-tests-us-east-1
  # Runs even when a test job failed (always()), but only when:
  #   - the gate job itself succeeded: if it failed, the test jobs are skipped
  #     and would otherwise be read here as a (false) failure; and
  #   - the gate did not short-circuit this run because an earlier run today
  #     already passed, in which case there is nothing to report.
  if: >-
    always()
    && needs.check-prior-success.result == 'success'
    && needs.check-prior-success.outputs.already_succeeded != 'true'
  runs-on: ubuntu-latest
  steps:
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }}
        aws-region: us-west-2
    - name: Emit run-level pass/fail metric
      env:
        EVENT_NAME: ${{ github.event_name }}
        DEFAULT_REGION_RESULT: ${{ needs.gpu-integ-tests.result }}
        US_EAST_1_RESULT: ${{ needs.gpu-integ-tests-us-east-1.result }}
      run: |
        # Only scheduled runs feed the daily GpuIntegRunFailure count; manual
        # (workflow_dispatch) runs leave the metric untouched.
        if [ "$EVENT_NAME" != "schedule" ]; then
          echo "Not a scheduled run ($EVENT_NAME); skipping metric emission."
          exit 0
        fi

        # Assume failure; clear it only when both region jobs succeeded.
        value=1
        if [ "$DEFAULT_REGION_RESULT" == "success" ] && [ "$US_EAST_1_RESULT" == "success" ]; then
          value=0
        fi

        if [ "$value" -eq 0 ]; then
          echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
        else
          echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
        fi

        aws cloudwatch put-metric-data \
          --namespace GpuIntegRunMetrics \
          --metric-name GpuIntegRunFailure \
          --value "$value" \
          --unit Count

.github/workflows/pr-checks-master.yml

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -118,26 +118,40 @@ jobs:
118118
fi
119119
}
120120
121-
# Check which submodules changed and add them plus their dependents
122-
if echo "$CHANGES" | grep -q "^sagemaker-core/"; then
123-
echo "sagemaker-core changed - will add core and all dependents"
124-
add_module_and_dependents "sagemaker-core"
125-
fi
121+
# Determine whether a module has any non-test changes. A change counts
122+
# as a source change if it touches anything under the module other than
123+
# its tests/ directory (e.g. src/, pyproject.toml, tox.ini, VERSION).
124+
# This is intentionally conservative: only changes confined entirely to
125+
# tests/ are treated as test-only.
126+
is_source_changed() {
  # Succeeds (exit 0) when $CHANGES lists at least one path under "$1/" that
  # is not under "$1/tests/"; fails (exit 1) otherwise.
  local module=$1
  local changed_path
  while IFS= read -r changed_path; do
    case "$changed_path" in
      "$module"/tests/*) ;;          # test-only path: keep looking
      "$module"/*) return 0 ;;       # any other path under the module
    esac
  done <<< "$CHANGES"
  return 1
}
126130
127-
if echo "$CHANGES" | grep -q "^sagemaker-train/"; then
128-
echo "sagemaker-train changed - will add train and all dependents"
129-
add_module_and_dependents "sagemaker-train"
130-
fi
131+
all_modules=("sagemaker-core" "sagemaker-train" "sagemaker-serve" "sagemaker-mlops")
131132
132-
if echo "$CHANGES" | grep -q "^sagemaker-serve/"; then
133-
echo "sagemaker-serve changed - will add serve and all dependents"
134-
add_module_and_dependents "sagemaker-serve"
135-
fi
133+
# Pass 1: modules with source changes pull in themselves plus every
134+
# module that (transitively) depends on them, since a source change can
135+
# affect downstream behaviour. This preserves the original logic.
136+
for module in "${all_modules[@]}"; do
137+
if is_source_changed "$module"; then
138+
echo "$module has source changes - adding it and all dependents"
139+
add_module_and_dependents "$module"
140+
fi
141+
done
136142
137-
if echo "$CHANGES" | grep -q "^sagemaker-mlops/"; then
138-
echo "sagemaker-mlops changed - will add mlops"
139-
add_module_and_dependents "sagemaker-mlops"
140-
fi
143+
# Pass 2: modules with test-only changes add only themselves and skip
144+
# dependency propagation, since changing a module's tests cannot affect
145+
# other modules. Run after Pass 1 so source-change propagation is never
146+
# short-circuited by a test-only module already being in the set.
147+
for module in "${all_modules[@]}"; do
148+
if echo "$CHANGES" | grep -q "^$module/" && ! is_source_changed "$module"; then
149+
if [ -z "${SUBMODULES_SET[$module]}" ]; then
150+
echo "$module has test-only changes - adding only $module"
151+
SUBMODULES_SET["$module"]=1
152+
fi
153+
fi
154+
done
141155
142156
# Convert associative array to JSON array
143157
SUBMODULES='[]'

README.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,6 @@ Supported Python Versions
220220

221221
SageMaker Python SDK is tested on:
222222

223-
- Python 3.9
224223
- Python 3.10
225224
- Python 3.11
226225
- Python 3.12

docs/installation.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Prerequisites
1616
---------------
1717

1818
**Python Version**
19-
SageMaker Python SDK V3 supports Python 3.9, 3.10, 3.11, and 3.12
19+
SageMaker Python SDK V3 supports Python 3.10, 3.11, and 3.12
2020

2121
**Operating Systems**
2222
- Linux

docs/quickstart.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Get started with SageMaker Python SDK V3 in minutes. This guide walks you throug
66
Prerequisites
77
-------------
88

9-
* Python 3.9+ installed
9+
* Python 3.10+ installed
1010
* AWS account with appropriate permissions
1111
* AWS credentials configured
1212

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ name = "sagemaker"
77
dynamic = ["version"]
88
description = "Open source library for training and deploying models on Amazon SageMaker."
99
readme = "README.rst"
10-
requires-python = ">=3.9"
10+
requires-python = ">=3.10"
1111
authors = [
1212
{ name = "Amazon Web Services" },
1313
]
@@ -27,7 +27,6 @@ classifiers = [
2727
"License :: OSI Approved :: Apache Software License",
2828
"Natural Language :: English",
2929
"Programming Language :: Python",
30-
"Programming Language :: Python :: 3.9",
3130
"Programming Language :: Python :: 3.10",
3231
"Programming Language :: Python :: 3.11",
3332
"Programming Language :: Python :: 3.12",

sagemaker-core/pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,11 @@ dependencies = [
3939
"tblib>=1.7.0",
4040
"cryptography>=46.0.0",
4141
]
42-
requires-python = ">=3.9"
42+
requires-python = ">=3.10"
4343
classifiers = [
4444
"Development Status :: 3 - Alpha",
4545
"Intended Audience :: Developers",
4646
"License :: OSI Approved :: Apache Software License",
47-
"Programming Language :: Python :: 3.9",
4847
"Programming Language :: Python :: 3.10",
4948
"Programming Language :: Python :: 3.11",
5049
"Programming Language :: Python :: 3.12",

sagemaker-core/src/sagemaker/core/user_agent.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,5 @@ def get_user_agent_extra_suffix():
7474
suffix = "{} md/{}#{}".format(suffix, STUDIO_PREFIX, studio_app_type)
7575

7676
return suffix
77+
78+
# Trigger PR check: run full integ test suite.

sagemaker-core/tox.ini

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
[tox]
77
isolated_build = true
8-
envlist = black-format,flake8,pylint,docstyle,sphinx,doc8,twine,py39,py310,py311,py312
8+
envlist = black-format,flake8,pylint,docstyle,sphinx,doc8,twine,py310,py311,py312
99
skip_missing_interpreters = False
1010

1111
[flake8]
@@ -86,7 +86,7 @@ allowlist_externals =
8686
pytest
8787
commands =
8888
python -c "import os; os.system('install-custom-pkgs --install-boto-wheels')"
89-
pip install 'apache-airflow==2.10.4' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.9.txt"
89+
pip install 'apache-airflow==2.10.4' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.10.txt"
9090
pip install 'torch==2.3.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html'
9191
pip install 'torchvision==0.18.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html'
9292
pip install 'dill>=0.3.9'
@@ -98,7 +98,7 @@ deps =
9898
.[test]
9999
mock
100100
depends =
101-
{py39,py310,py311,py312}: clean
101+
{py310,py311,py312}: clean
102102

103103
[testenv:py312]
104104
basepython = python3.12

sagemaker-mlops/pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ dynamic = ["version"]
88
description = "SageMaker MLOps package for workflow orchestration and model building"
99
readme = "README.md"
1010
license = {file = "LICENSE"}
11-
requires-python = ">=3.9"
11+
requires-python = ">=3.10"
1212
authors = [
1313
{name = "Amazon Web Services"},
1414
]
@@ -17,7 +17,6 @@ classifiers = [
1717
"Intended Audience :: Developers",
1818
"License :: OSI Approved :: Apache Software License",
1919
"Programming Language :: Python :: 3",
20-
"Programming Language :: Python :: 3.9",
2120
"Programming Language :: Python :: 3.10",
2221
"Programming Language :: Python :: 3.11",
2322
"Programming Language :: Python :: 3.12",

0 commit comments

Comments
 (0)