Skip to content

Commit bb61cd4

Browse files
author
AWS
committed
Amazon SageMaker Service Update: Added support for placement strategy and consolidation for SageMaker inference component endpoints. Customers can now configure how inference component copies are distributed across instances and availability zones (AZs), and enable automatic consolidation to optimize resource utilization.
1 parent 66ec306 commit bb61cd4

File tree

2 files changed

+103
-0
lines changed

2 files changed

+103
-0
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"type": "feature",
3+
"category": "Amazon SageMaker Service",
4+
"contributor": "",
5+
"description": "Added support for placement strategy and consolidation for SageMaker inference component endpoints. Customers can now configure how inference component copies are distributed across instances and availability zones (AZs), and enable automatic consolidation to optimize resource utilization."
6+
}

services/sagemaker/src/main/resources/codegen-resources/service-2.json

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7014,6 +7014,16 @@
70147014
"min":1,
70157015
"pattern":"[a-z]+\\-[0-9a-z\\-]+"
70167016
},
7017+
"AvailabilityZoneBalanceEnforcementMode":{
7018+
"type":"string",
7019+
"enum":["PERMISSIVE"]
7020+
},
7021+
"AvailabilityZoneBalanceMaxImbalance":{
7022+
"type":"integer",
7023+
"box":true,
7024+
"max":100,
7025+
"min":0
7026+
},
70177027
"AvailabilityZoneId":{
70187028
"type":"string",
70197029
"pattern":"[a-z]{3}\\d-az\\d"
@@ -25960,6 +25970,21 @@
2596025970
"max":2048,
2596125971
"min":20
2596225972
},
25973+
"InferenceComponentAvailabilityZoneBalance":{
25974+
"type":"structure",
25975+
"required":["EnforcementMode"],
25976+
"members":{
25977+
"EnforcementMode":{
25978+
"shape":"AvailabilityZoneBalanceEnforcementMode",
25979+
"documentation":"<p>Determines how strictly the Availability Zone balance constraint is enforced.</p> <dl> <dt>PERMISSIVE</dt> <dd> <p>The endpoint attempts to balance copies across Availability Zones but proceeds with scheduling even if balance can't be achieved due to available capacity or instance distribution across Availability Zones.</p> </dd> </dl>"
25980+
},
25981+
"MaxImbalance":{
25982+
"shape":"AvailabilityZoneBalanceMaxImbalance",
25983+
"documentation":"<p>The maximum allowed difference in the number of inference component copies between any two Availability Zones. This parameter applies only when the endpoint has instances across two or more Availability Zones. A copy placement is allowed if it reduces imbalance or the resulting imbalance is within this value.</p> <p>Default value: <code>0</code>.</p>"
25984+
}
25985+
},
25986+
"documentation":"<p>Configuration for balancing inference component copies across Availability Zones.</p>"
25987+
},
2596325988
"InferenceComponentCapacitySize":{
2596425989
"type":"structure",
2596525990
"required":[
@@ -26104,6 +26129,13 @@
2610426129
"min":0,
2610526130
"pattern":"[a-zA-Z0-9-]+"
2610626131
},
26132+
"InferenceComponentPlacementStrategy":{
26133+
"type":"string",
26134+
"enum":[
26135+
"SPREAD",
26136+
"BINPACK"
26137+
]
26138+
},
2610726139
"InferenceComponentRollingUpdatePolicy":{
2610826140
"type":"structure",
2610926141
"required":[
@@ -26155,6 +26187,21 @@
2615526187
},
2615626188
"documentation":"<p>Details about the runtime settings for the model that is deployed with the inference component.</p>"
2615726189
},
26190+
"InferenceComponentSchedulingConfig":{
26191+
"type":"structure",
26192+
"required":["PlacementStrategy"],
26193+
"members":{
26194+
"PlacementStrategy":{
26195+
"shape":"InferenceComponentPlacementStrategy",
26196+
"documentation":"<p>The strategy for placing inference component copies across available instances. If you also set <code>AvailabilityZoneBalance</code>, this strategy applies to placement within each Availability Zone.</p> <dl> <dt>SPREAD</dt> <dd> <p>Distributes copies evenly across available instances for better resilience.</p> </dd> <dt>BINPACK</dt> <dd> <p>Packs copies onto fewer instances to optimize resource utilization.</p> </dd> </dl>"
26197+
},
26198+
"AvailabilityZoneBalance":{
26199+
"shape":"InferenceComponentAvailabilityZoneBalance",
26200+
"documentation":"<p>Configuration for balancing inference component copies across Availability Zones.</p>"
26201+
}
26202+
},
26203+
"documentation":"<p>The scheduling configuration that determines how inference component copies are placed across available instances when copies are added or removed.</p>"
26204+
},
2615826205
"InferenceComponentSortKey":{
2615926206
"type":"string",
2616026207
"enum":[
@@ -26189,6 +26236,10 @@
2618926236
"DataCacheConfig":{
2619026237
"shape":"InferenceComponentDataCacheConfig",
2619126238
"documentation":"<p>Settings that affect how the inference component caches data.</p>"
26239+
},
26240+
"SchedulingConfig":{
26241+
"shape":"InferenceComponentSchedulingConfig",
26242+
"documentation":"<p>The scheduling configuration that determines how inference component copies are placed across available instances when copies are added or removed.</p>"
2619226243
}
2619326244
},
2619426245
"documentation":"<p>Details about the resources to deploy with this inference component, including the model, container, and compute resources.</p>"
@@ -26219,6 +26270,10 @@
2621926270
"DataCacheConfig":{
2622026271
"shape":"InferenceComponentDataCacheConfigSummary",
2622126272
"documentation":"<p>Settings that affect how the inference component caches data.</p>"
26273+
},
26274+
"SchedulingConfig":{
26275+
"shape":"InferenceComponentSchedulingConfig",
26276+
"documentation":"<p>The scheduling configuration that determines how inference component copies are placed across available instances when copies are added or removed.</p>"
2622226277
}
2622326278
},
2622426279
"documentation":"<p>Details about the resources that are deployed with this inference component.</p>"
@@ -32319,16 +32374,35 @@
3231932374
"min":0,
3232032375
"pattern":"\\d+\\.\\d+"
3232132376
},
32377+
"ManagedInstanceScalingCooldownInMinutes":{
32378+
"type":"integer",
32379+
"box":true,
32380+
"max":1440,
32381+
"min":5
32382+
},
3232232383
"ManagedInstanceScalingMaxInstanceCount":{
3232332384
"type":"integer",
3232432385
"box":true,
3232532386
"min":1
3232632387
},
32388+
"ManagedInstanceScalingMaximumStepSize":{
32389+
"type":"integer",
32390+
"box":true,
32391+
"max":100,
32392+
"min":1
32393+
},
3232732394
"ManagedInstanceScalingMinInstanceCount":{
3232832395
"type":"integer",
3232932396
"box":true,
3233032397
"min":0
3233132398
},
32399+
"ManagedInstanceScalingScaleInStrategy":{
32400+
"type":"string",
32401+
"enum":[
32402+
"IDLE_RELEASE",
32403+
"CONSOLIDATION"
32404+
]
32405+
},
3233232406
"ManagedInstanceScalingStatus":{
3233332407
"type":"string",
3233432408
"enum":[
@@ -38532,10 +38606,33 @@
3853238606
"MaxInstanceCount":{
3853338607
"shape":"ManagedInstanceScalingMaxInstanceCount",
3853438608
"documentation":"<p>The maximum number of instances that the endpoint can provision when it scales up to accommodate an increase in traffic.</p>"
38609+
},
38610+
"ScaleInPolicy":{
38611+
"shape":"ProductionVariantManagedInstanceScalingScaleInPolicy",
38612+
"documentation":"<p>Configures the scale-in behavior for managed instance scaling.</p>"
3853538613
}
3853638614
},
3853738615
"documentation":"<p>Settings that control the range in the number of instances that the endpoint provisions as it scales up or down to accommodate traffic. </p>"
3853838616
},
38617+
"ProductionVariantManagedInstanceScalingScaleInPolicy":{
38618+
"type":"structure",
38619+
"required":["Strategy"],
38620+
"members":{
38621+
"Strategy":{
38622+
"shape":"ManagedInstanceScalingScaleInStrategy",
38623+
"documentation":"<p>The strategy for scaling in instances.</p> <dl> <dt>IDLE_RELEASE</dt> <dd> <p>Releases instances that have no hosted inference component copies.</p> </dd> <dt>CONSOLIDATION</dt> <dd> <p>Consolidates inference component copies onto fewer instances to release more instances. Consolidation honors the scheduling configuration of each inference component. For example, if an inference component specifies Availability Zone balance, consolidation only proceeds when the resulting distribution does not increase the imbalance.</p> </dd> </dl>"
38624+
},
38625+
"MaximumStepSize":{
38626+
"shape":"ManagedInstanceScalingMaximumStepSize",
38627+
"documentation":"<p>The maximum number of instances that the endpoint can terminate at a time during a consolidation scale-in operation.</p> <p>Default value: <code>1</code>.</p>"
38628+
},
38629+
"CooldownInMinutes":{
38630+
"shape":"ManagedInstanceScalingCooldownInMinutes",
38631+
"documentation":"<p>The cooldown period, in minutes, after the last endpoint operation before the endpoint evaluates consolidation scale-in opportunities.</p> <p>Default value: <code>20</code>.</p>"
38632+
}
38633+
},
38634+
"documentation":"<p>Configures the scale-in behavior for managed instance scaling.</p>"
38635+
},
3853938636
"ProductionVariantModelDataDownloadTimeoutInSeconds":{
3854038637
"type":"integer",
3854138638
"box":true,

0 commit comments

Comments
 (0)