Skip to content

Commit 58b9a9f

Browse files
authored
Decouple bandwidth reservations from oversubscription (#238)
1 parent 4acbfdc commit 58b9a9f

5 files changed

Lines changed: 100 additions & 46 deletions

File tree

deploy/aws/cloudformation/template.yaml

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ Metadata:
1717
- InstanceType
1818
- RootVolumeSize
1919
- DataVolumeSize
20+
- DataVolumeIops
21+
- DataVolumeThroughput
2022
- AmiSsmParameter
2123
- Label:
2224
default: Access
@@ -51,6 +53,10 @@ Metadata:
5153
default: Root volume size
5254
DataVolumeSize:
5355
default: Hypeman data volume size
56+
DataVolumeIops:
57+
default: Hypeman data volume IOPS
58+
DataVolumeThroughput:
59+
default: Hypeman data volume throughput
5460
HypemanVersion:
5561
default: Hypeman release
5662
HypemanBranch:
@@ -109,6 +115,16 @@ Parameters:
109115
MinValue: 50
110116
MaxValue: 16384
111117
Description: Hypeman data EBS volume size in GiB. This volume is formatted as XFS and mounted at /var/lib/hypeman.
118+
DataVolumeIops:
119+
Type: String
120+
Default: ""
121+
AllowedPattern: "^$|^[0-9]+$"
122+
Description: Optional provisioned IOPS for the Hypeman data gp3 EBS volume. Leave empty for the EC2 default.
123+
DataVolumeThroughput:
124+
Type: String
125+
Default: ""
126+
AllowedPattern: "^$|^[0-9]+$"
127+
Description: Optional provisioned throughput in MiB/s for the Hypeman data gp3 EBS volume. Leave empty for the EC2 default. When set, the same number is also passed to Hypeman as its disk I/O capacity (CAPACITY__DISK_IO=<value>MB/s).
112128
HypemanVersion:
113129
Type: String
114130
Default: latest
@@ -285,18 +301,34 @@ Resources:
285301
stack_uuid = event["StackId"].rsplit("/", 1)[-1]
286302
return f"{event['ResourceProperties']['NamePrefix']}-{stack_uuid}"
287303
288-
def create_launch_template(name):
304+
def put_if_set(payload, key, value):
305+
if value:
306+
payload[key] = value
307+
308+
def create_launch_template(name, props):
289309
payload = {
290310
"Action": "CreateLaunchTemplate",
291311
"Version": "2016-11-15",
292312
"LaunchTemplateName": name,
293313
"LaunchTemplateData.CpuOptions.NestedVirtualization": "enabled",
314+
"LaunchTemplateData.BlockDeviceMapping.1.DeviceName": "/dev/sda1",
315+
"LaunchTemplateData.BlockDeviceMapping.1.Ebs.VolumeSize": props["RootVolumeSize"],
316+
"LaunchTemplateData.BlockDeviceMapping.1.Ebs.VolumeType": "gp3",
317+
"LaunchTemplateData.BlockDeviceMapping.1.Ebs.Encrypted": "true",
318+
"LaunchTemplateData.BlockDeviceMapping.1.Ebs.DeleteOnTermination": "true",
319+
"LaunchTemplateData.BlockDeviceMapping.2.DeviceName": "/dev/sdf",
320+
"LaunchTemplateData.BlockDeviceMapping.2.Ebs.VolumeSize": props["DataVolumeSize"],
321+
"LaunchTemplateData.BlockDeviceMapping.2.Ebs.VolumeType": "gp3",
322+
"LaunchTemplateData.BlockDeviceMapping.2.Ebs.Encrypted": "true",
323+
"LaunchTemplateData.BlockDeviceMapping.2.Ebs.DeleteOnTermination": "true",
294324
"TagSpecification.1.ResourceType": "launch-template",
295325
"TagSpecification.1.Tag.1.Key": "Name",
296326
"TagSpecification.1.Tag.1.Value": name,
297327
"TagSpecification.1.Tag.2.Key": "hypeman:deployment",
298328
"TagSpecification.1.Tag.2.Value": "aws",
299329
}
330+
put_if_set(payload, "LaunchTemplateData.BlockDeviceMapping.2.Ebs.Iops", props.get("DataVolumeIops", ""))
331+
put_if_set(payload, "LaunchTemplateData.BlockDeviceMapping.2.Ebs.Throughput", props.get("DataVolumeThroughput", ""))
300332
xml = ec2_query(payload)
301333
root = ET.fromstring(xml)
302334
launch_template_id = root.find(".//{*}launchTemplateId")
@@ -326,7 +358,7 @@ Resources:
326358
return
327359
if request_type == "Update":
328360
delete_launch_template(physical_id)
329-
data = create_launch_template(launch_template_name(event))
361+
data = create_launch_template(launch_template_name(event), event["ResourceProperties"])
330362
send(event, context, "SUCCESS", data, physical_id=data["LaunchTemplateId"])
331363
except Exception as exc:
332364
traceback.print_exc()
@@ -337,6 +369,10 @@ Resources:
337369
Properties:
338370
ServiceToken: !GetAtt NestedVirtualizationLaunchTemplateFunction.Arn
339371
NamePrefix: hypeman
372+
RootVolumeSize: !Ref RootVolumeSize
373+
DataVolumeSize: !Ref DataVolumeSize
374+
DataVolumeIops: !Ref DataVolumeIops
375+
DataVolumeThroughput: !Ref DataVolumeThroughput
340376

341377
HypemanHost:
342378
Type: AWS::EC2::Instance
@@ -351,19 +387,6 @@ Resources:
351387
- !Ref HypemanSecurityGroup
352388
IamInstanceProfile: !Ref HypemanInstanceProfile
353389
KeyName: !If [UseSSH, !Ref KeyName, !Ref AWS::NoValue]
354-
BlockDeviceMappings:
355-
- DeviceName: /dev/sda1
356-
Ebs:
357-
VolumeSize: !Ref RootVolumeSize
358-
VolumeType: gp3
359-
Encrypted: true
360-
DeleteOnTermination: true
361-
- DeviceName: /dev/sdf
362-
Ebs:
363-
VolumeSize: !Ref DataVolumeSize
364-
VolumeType: gp3
365-
Encrypted: true
366-
DeleteOnTermination: true
367390
Tags:
368391
- Key: Name
369392
Value: !Sub ${AWS::StackName}-hypeman
@@ -445,6 +468,16 @@ Resources:
445468
fi
446469
curl -fsSL https://raw.githubusercontent.com/kernel/hypeman/main/scripts/install.sh | bash
447470

471+
if [ -n "${DataVolumeThroughput}" ]; then
472+
install -d -m 755 /etc/systemd/system/hypeman.service.d
473+
cat >/etc/systemd/system/hypeman.service.d/disk-io-capacity.conf <<EOF
474+
[Service]
475+
Environment="CAPACITY__DISK_IO=${DataVolumeThroughput}MB/s"
476+
EOF
477+
systemctl daemon-reload
478+
systemctl restart hypeman
479+
fi
480+
448481
install -d -m 755 /opt/hypeman/deploy
449482
cat >/usr/local/bin/hypeman-create-token <<'SCRIPT'
450483
#!/usr/bin/env bash

deploy/aws/cloudformation/template_test.go

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ func TestQuickstartParameters(t *testing.T) {
2020
assertDefault(t, parameters, "AllowedSshCidr", "127.0.0.1/32")
2121
assertDefault(t, parameters, "RootVolumeSize", "30")
2222
assertDefault(t, parameters, "DataVolumeSize", "100")
23+
assertDefault(t, parameters, "DataVolumeIops", "")
24+
assertDefault(t, parameters, "DataVolumeThroughput", "")
2325
assertDefault(t, parameters, "HypemanVersion", "latest")
2426
assertDefault(t, parameters, "HypemanCliVersion", "latest")
2527

@@ -87,6 +89,16 @@ func TestCloudFormationLaunchContract(t *testing.T) {
8789
zipFile := scalar(t, requireField(t, code, "ZipFile"))
8890
assertContains(t, zipFile, `"Action": "CreateLaunchTemplate"`)
8991
assertContains(t, zipFile, `"LaunchTemplateData.CpuOptions.NestedVirtualization": "enabled"`)
92+
assertContains(t, zipFile, `"LaunchTemplateData.BlockDeviceMapping.1.Ebs.VolumeSize": props["RootVolumeSize"]`)
93+
assertContains(t, zipFile, `"LaunchTemplateData.BlockDeviceMapping.2.Ebs.VolumeSize": props["DataVolumeSize"]`)
94+
assertContains(t, zipFile, `"LaunchTemplateData.BlockDeviceMapping.2.Ebs.Iops"`)
95+
assertContains(t, zipFile, `"LaunchTemplateData.BlockDeviceMapping.2.Ebs.Throughput"`)
96+
97+
launchTemplateProperties := requireMapping(t, requireField(t, launchTemplate, "Properties"))
98+
assertRef(t, requireField(t, launchTemplateProperties, "RootVolumeSize"), "RootVolumeSize")
99+
assertRef(t, requireField(t, launchTemplateProperties, "DataVolumeSize"), "DataVolumeSize")
100+
assertRef(t, requireField(t, launchTemplateProperties, "DataVolumeIops"), "DataVolumeIops")
101+
assertRef(t, requireField(t, launchTemplateProperties, "DataVolumeThroughput"), "DataVolumeThroughput")
90102

91103
host := requireMapping(t, requireField(t, resources, "HypemanHost"))
92104
if got := scalar(t, requireField(t, host, "Type")); got != "AWS::EC2::Instance" {
@@ -97,19 +109,10 @@ func TestCloudFormationLaunchContract(t *testing.T) {
97109
assertGetAtt(t, requireField(t, hostLaunchTemplate, "LaunchTemplateId"), "NestedVirtualizationLaunchTemplate.LaunchTemplateId")
98110
assertGetAtt(t, requireField(t, hostLaunchTemplate, "Version"), "NestedVirtualizationLaunchTemplate.VersionNumber")
99111

100-
blockDeviceMappings := requireSequence(t, requireField(t, hostProperties, "BlockDeviceMappings"))
101-
if len(blockDeviceMappings.Content) != 2 {
102-
t.Fatalf("expected root and Hypeman data block device mappings, got %d", len(blockDeviceMappings.Content))
103-
}
104-
dataDevice := requireMapping(t, blockDeviceMappings.Content[1])
105-
if got := scalar(t, requireField(t, dataDevice, "DeviceName")); got != "/dev/sdf" {
106-
t.Fatalf("data device name = %q, want /dev/sdf", got)
107-
}
108-
dataEBS := requireMapping(t, requireField(t, dataDevice, "Ebs"))
109-
assertRef(t, requireField(t, dataEBS, "VolumeSize"), "DataVolumeSize")
110-
111112
userData := nodeText(requireField(t, hostProperties, "UserData"))
112113
assertContains(t, userData, "curl -fsSL https://raw.githubusercontent.com/kernel/hypeman/main/scripts/install.sh | bash")
114+
assertContains(t, userData, `if [ -n "${DataVolumeThroughput}" ]; then`)
115+
assertContains(t, userData, `Environment="CAPACITY__DISK_IO=${DataVolumeThroughput}MB/s"`)
113116
assertContains(t, userData, "xfsprogs")
114117
assertContains(t, userData, "mkfs.xfs -f")
115118
assertContains(t, userData, "/var/lib/hypeman")

lib/resources/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ Per-VM disk I/O rate limiting with burst support:
7373

7474
- **Cloud Hypervisor**: Uses native `RateLimiterConfig` with token bucket
7575
- **QEMU**: Uses drive `throttling.bps-total` options
76-
- **Default**: Proportional to CPU: `(vcpus / cpu_capacity) * disk_io_capacity * 2.0`
76+
- **Default**: Proportional to CPU: `(vcpus / cpu_capacity) * disk_io_capacity`
7777
- **Burst**: 4x sustained rate (allows fast cold starts)
7878

7979
## Example: Default Limits
@@ -84,9 +84,9 @@ Per-VM disk I/O rate limiting with burst support:
8484

8585
| Resource | Calculation | Default Limit |
8686
|----------|-------------|---------------|
87-
| Network (down/up) | 10Gbps × 2.0 × 12.5% | 2.5 Gbps (312 MB/s) |
88-
| Disk I/O (sustained) | 1GB/s × 2.0 × 12.5% | 250 MB/s |
89-
| Disk I/O (burst) | 250 MB/s × 4 | 1 GB/s |
87+
| Network (down/up) | 10Gbps × 12.5% | 1.25 Gbps (156 MB/s) |
88+
| Disk I/O (sustained) | 1GB/s × 12.5% | 125 MB/s |
89+
| Disk I/O (burst) | 125 MB/s × 4 | 500 MB/s |
9090

9191
## Effective Limits
9292

lib/resources/resource.go

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -787,7 +787,7 @@ func (m *Manager) DiskIOCapacity() int64 {
787787

788788
// DefaultNetworkBandwidth calculates the default network bandwidth for an instance
789789
// based on its CPU allocation proportional to host CPU capacity.
790-
// Formula: (instanceVcpus / hostCpuCapacity) * networkCapacity * oversubRatio
790+
// Formula: (instanceVcpus * networkCapacity) / hostCpuCapacity, using integer division; no oversubscription ratio is applied.
791791
// Returns symmetric download/upload limits.
792792
func (m *Manager) DefaultNetworkBandwidth(vcpus int) (downloadBps, uploadBps int64) {
793793
cpuCapacity := m.CPUCapacity()
@@ -800,19 +800,15 @@ func (m *Manager) DefaultNetworkBandwidth(vcpus int) (downloadBps, uploadBps int
800800
return 0, 0
801801
}
802802

803-
ratio := m.GetOversubRatio(ResourceNetwork)
804-
effectiveNet := int64(float64(netCapacity) * ratio)
805-
806-
// Proportional to CPU: (vcpus / cpuCapacity) * effectiveNet
807-
bandwidth := (int64(vcpus) * effectiveNet) / cpuCapacity
803+
bandwidth := (int64(vcpus) * netCapacity) / cpuCapacity
808804

809805
// Symmetric limits by default
810806
return bandwidth, bandwidth
811807
}
812808

813809
// DefaultDiskIOBandwidth calculates the default disk I/O bandwidth for an instance
814810
// based on its CPU allocation proportional to host CPU capacity.
815-
// Formula: (instanceVcpus / hostCpuCapacity) * diskIOCapacity * oversubRatio
811+
// Formula: (instanceVcpus * diskIOCapacity) / hostCpuCapacity, using integer division; no oversubscription ratio is applied.
816812
// Returns sustained rate and burst rate (4x sustained).
817813
func (m *Manager) DefaultDiskIOBandwidth(vcpus int) (ioBps, burstBps int64) {
818814
cpuCapacity := m.CPUCapacity()
@@ -825,14 +821,7 @@ func (m *Manager) DefaultDiskIOBandwidth(vcpus int) (ioBps, burstBps int64) {
825821
return 0, 0
826822
}
827823

828-
ratio := m.cfg.Oversubscription.DiskIO
829-
if ratio <= 0 {
830-
ratio = 2.0 // Default 2x oversubscription for disk I/O
831-
}
832-
effectiveIO := int64(float64(ioCapacity) * ratio)
833-
834-
// Proportional to CPU: (vcpus / cpuCapacity) * effectiveIO
835-
sustained := (int64(vcpus) * effectiveIO) / cpuCapacity
824+
sustained := (int64(vcpus) * ioCapacity) / cpuCapacity
836825

837826
// Burst is 4x sustained (allows fast cold starts)
838827
burst := sustained * 4

lib/resources/resource_test.go

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func TestDefaultNetworkBandwidth(t *testing.T) {
7878
cfg := &config.Config{
7979
DataDir: t.TempDir(),
8080
Oversubscription: config.OversubscriptionConfig{
81-
CPU: 1.0, Memory: 1.0, Disk: 1.0, Network: 1.0,
81+
CPU: 1.0, Memory: 1.0, Disk: 1.0, Network: 4.0,
8282
},
8383
Capacity: config.CapacityConfig{Network: "10Gbps"}, // 1.25 GB/s = 1,250,000,000 bytes/sec
8484
}
@@ -123,6 +123,35 @@ func TestDefaultNetworkBandwidth_ZeroCPU(t *testing.T) {
123123
assert.Equal(t, int64(0), uploadBw, "Should return 0 when CPU capacity is 0")
124124
}
125125

126+
func TestDefaultDiskIOBandwidthIgnoresAdmissionOversubscription(t *testing.T) {
127+
cfg := &config.Config{
128+
DataDir: t.TempDir(),
129+
Oversubscription: config.OversubscriptionConfig{
130+
CPU: 1.0, Memory: 1.0, Disk: 1.0, Network: 1.0, DiskIO: 4.0,
131+
},
132+
Capacity: config.CapacityConfig{DiskIO: "1GB/s"},
133+
}
134+
p := paths.New(cfg.DataDir)
135+
136+
mgr := NewManager(cfg, p)
137+
mgr.SetInstanceLister(&mockInstanceLister{})
138+
mgr.SetImageLister(&mockImageLister{})
139+
mgr.SetVolumeLister(&mockVolumeLister{})
140+
141+
err := mgr.Initialize(context.Background())
142+
require.NoError(t, err)
143+
144+
cpuCapacity := mgr.CPUCapacity()
145+
ioCapacity := mgr.DiskIOCapacity()
146+
147+
if cpuCapacity > 0 && ioCapacity > 0 {
148+
ioBps, burstBps := mgr.DefaultDiskIOBandwidth(2)
149+
expected := (int64(2) * ioCapacity) / cpuCapacity
150+
assert.Equal(t, expected, ioBps)
151+
assert.Equal(t, expected*4, burstBps)
152+
}
153+
}
154+
126155
func TestParseBandwidth(t *testing.T) {
127156
tests := []struct {
128157
input string

0 commit comments

Comments
 (0)