Skip to content

Commit f83f078

Browse files
authored
Merge branch 'GoogleCloudPlatform:master' into swap-encryption-pr2-swap-capability_new
2 parents 805dfbe + ae6156e commit f83f078

111 files changed

Lines changed: 4563 additions & 979 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CHANGES.next.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@
127127
bringing this variant more in-line with others.
128128
- Rename GKE Autopilot & EKS Auto mode's cluster_type within PKB from
129129
'Autopilot' to 'Auto'.
130+
- Duplicate metadata for all samples.
130131

131132
### New features:
132133

@@ -284,10 +285,13 @@
284285
- Re-enable support for Rocky Linux 8, 9, and 10 for the Azure provider.
285286
- Add Ubuntu 26.04 support for GCP, AWS, and Azure Providers.
286287
- Add a Kubernetes-native benchmark for MySQL using sysbench.
288+
- Add `kafka_benchmark` support.
287289

288290
### Enhancements:
289291

290292
- Add numactl and perf support to netperf for analysis
293+
- Add support for multi-NIC setups in Redis Memtier benchmark to distribute
294+
load across multiple network interfaces.
291295
- Additions to MongoDB in Artemis/PKB to facilitate workload analysis
292296
- Updated `sar` switch to efficiently collect all sar metrics during the run,
293297
and download the file for hands-on analysis (no parsing).
@@ -464,9 +468,13 @@
464468
creation can be retried on stock outs.
465469
- Add support for deploying VMs inside managed VM groups with
466470
`--use_managed_vm_groups`.
471+
- Add support for configuring IMDSv2 HTTP tokens on AWS VMs via
472+
`aws_metadata_http_tokens`.
473+
- Add aggregate memtier results to `kubernetes_redis_memtier` benchmark.
467474

468475
### Bug fixes and maintenance updates:
469476

477+
- Update `sysbench_thread_init_timeout` default to 180 seconds.
470478
- Add 'runcpu --update' and 'runcpu --version' commands to install phase.
471479
- Set the command to download preprovisioned data to be robust and have a five
472480
minute timeout.
@@ -676,3 +684,4 @@
676684
- Set firewall rule as PKB-created before deletion when using
677685
--gce_firewall_rules_clean_all.
678686
- Added gke_kubernetes_nginx to default benchmark config.
687+
- Added gke_kubernetes_redis_memtier to default benchmark config.

perfkitbenchmarker/benchmark_spec.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def __init__(
245245
self.data_dir: str
246246
self.ckpt_dir: str
247247
# Used by redis_memtier and keydb_memtier
248-
self.redis_endpoint_ip: str
248+
self.redis_endpoint_ip: list[str]
249249
self.keydb_endpoint_ip: str
250250
# Used by mongodb_ycsb
251251
self.mongodb_url: str
@@ -823,13 +823,17 @@ def ConstructVirtualMachineGroup(
823823
# If the VM group is managed, create a managed VM group and return.
824824
if is_managed:
825825
managed_vm_group_class = managed_vm_group.GetManagedVmGroupClass(cloud)
826-
# TODO(pclay): support multiple zones:
827-
if FLAGS.zone:
828-
assert len(FLAGS.zone) == 1, 'Managed VM groups only support one zone.'
829-
group_spec.vm_spec.zone = FLAGS.zone[0]
830-
vm_config = self._CreateVirtualMachine(group_spec.vm_spec, os_type, cloud)
826+
zones = FLAGS.zone or [group_spec.vm_spec.zone]
827+
vm_configs = []
828+
# VM groups need a VM for each zone to create subnets in each zone.
829+
for zone in zones:
830+
spec = copy.copy(group_spec.vm_spec)
831+
spec.zone = zone
832+
vm_config = self._CreateVirtualMachine(spec, os_type, cloud)
833+
vm_config.zone = zone
834+
vm_configs.append(vm_config)
831835
group = managed_vm_group_class(
832-
group_spec, vm_config
836+
group_spec, vm_configs
833837
) # pytype: disable=not-instantiable
834838
self.managed_vm_groups[group_name] = group
835839
# Report resource provisioning times.
@@ -1244,6 +1248,11 @@ def Delete(self):
12441248
'Got an exception deleting CapacityReservations. '
12451249
'Attempting to continue tearing down.'
12461250
)
1251+
if self.managed_vm_groups:
1252+
background_tasks.RunThreaded(
1253+
lambda vm_group: vm_group.Delete(),
1254+
list(self.managed_vm_groups.values())
1255+
)
12471256

12481257
if self.vms:
12491258
try:

perfkitbenchmarker/configs/default_benchmark_config.yaml

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,21 @@ gke_kubernetes_nginx:
321321
nginx_content_size: 1024
322322
nginx_p99_latency_threshold: 100
323323

324+
gke_kubernetes_redis_memtier:
325+
name: kubernetes_redis_memtier
326+
flags:
327+
redis_server_version: '7.4.7'
328+
memtier_protocol: 'redis'
329+
memtier_clients: 12
330+
memtier_threads: 32
331+
memtier_ratio: '1:4'
332+
memtier_data_size: 1024
333+
memtier_key_maximum: 6400000
334+
memtier_run_duration: 15
335+
redis_server_io_threads: '4'
336+
redis_server_io_threads_do_reads: True
337+
kubernetes_redis_memtier_save: '""'
338+
324339
redis_memtier_session_storage_cluster:
325340
name: redis_memtier
326341
flags:
@@ -461,22 +476,30 @@ mysql_sysbench_oltp_read_only_lssd:
461476
sysbench_testname: oltp_read_only
462477
<<: *sysbench_oltp_base_lssd
463478

464-
mysql_sysbench_oltp_read_only_lssd_with_fio_prefill:
479+
mysql_sysbench_oltp_read_only_lssd_with_7T_fio_prefill:
465480
name: unmanaged_mysql_sysbench_lssd
466481
flags:
467482
sysbench_testname: oltp_read_only
468483
<<: *sysbench_oltp_base_lssd
469484
sysbench_tables: 210
470485
sysbench_load_threads: 210
471486

472-
mysql_sysbench_oltp_read_write_lssd_with_fio_prefill:
487+
mysql_sysbench_oltp_read_write_lssd_with_4T_fio_prefill:
473488
name: unmanaged_mysql_sysbench_lssd
474489
flags:
475490
sysbench_testname: oltp_read_write
476491
<<: *sysbench_oltp_base_lssd
477492
sysbench_tables: 210
478493
sysbench_load_threads: 210
479494

495+
mysql_sysbench_oltp_read_write_lssd_with_7T_fio_prefill:
496+
name: unmanaged_mysql_sysbench_lssd
497+
flags:
498+
sysbench_testname: oltp_read_write
499+
<<: *sysbench_oltp_base_lssd
500+
sysbench_tables: 420
501+
sysbench_load_threads: 420
502+
480503
mysql_sysbench_oltp_read_write_lssd:
481504
name: unmanaged_mysql_sysbench
482505
flags:
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# AWS Neuron Device Plugin DaemonSet.
2+
# Based on upstream aws-neuron-sdk k8s manifests
3+
# (https://github.com/aws-neuron/aws-neuron-sdk).
4+
# Image: neuron_device_plugin_image (ApplyManifest kwarg); default in template.
5+
apiVersion: apps/v1
6+
kind: DaemonSet
7+
metadata:
8+
name: neuron-device-plugin-daemonset
9+
namespace: kube-system
10+
spec:
11+
selector:
12+
matchLabels:
13+
name: neuron-device-plugin-ds
14+
template:
15+
metadata:
16+
labels:
17+
name: neuron-device-plugin-ds
18+
spec:
19+
serviceAccountName: neuron-device-plugin
20+
tolerations:
21+
- key: aws.amazon.com/neuron
22+
operator: Exists
23+
effect: NoSchedule
24+
priorityClassName: system-node-critical
25+
affinity:
26+
nodeAffinity:
27+
requiredDuringSchedulingIgnoredDuringExecution:
28+
nodeSelectorTerms:
29+
- matchExpressions:
30+
# Relies on the eks.amazonaws.com/instance-family label set
31+
# by Karpenter and EKS Auto Mode on Neuron-capable nodes.
32+
- key: eks.amazonaws.com/instance-family
33+
operator: In
34+
values:
35+
- inf1
36+
- inf2
37+
- trn1
38+
- trn1n
39+
- trn2
40+
- trn2u
41+
containers:
42+
- image: {{ neuron_device_plugin_image | default('public.ecr.aws/neuron/neuron-device-plugin:2.22.4.0') }}
43+
imagePullPolicy: Always
44+
name: neuron-device-plugin
45+
env:
46+
- name: KUBECONFIG
47+
value: /etc/kubernetes/kubelet.conf
48+
- name: NODE_NAME
49+
valueFrom:
50+
fieldRef:
51+
fieldPath: spec.nodeName
52+
securityContext:
53+
allowPrivilegeEscalation: false
54+
capabilities:
55+
drop: ["ALL"]
56+
volumeMounts:
57+
- name: device-plugin
58+
mountPath: /var/lib/kubelet/device-plugins
59+
- name: infa-map
60+
mountPath: /run
61+
volumes:
62+
- name: device-plugin
63+
hostPath:
64+
path: /var/lib/kubelet/device-plugins
65+
- name: infa-map
66+
hostPath:
67+
path: /run
68+
---
69+
apiVersion: v1
70+
kind: ServiceAccount
71+
metadata:
72+
name: neuron-device-plugin
73+
namespace: kube-system
74+
---
75+
apiVersion: rbac.authorization.k8s.io/v1
76+
kind: ClusterRole
77+
metadata:
78+
name: neuron-device-plugin
79+
rules:
80+
- apiGroups: [""]
81+
resources: ["nodes"]
82+
verbs: ["get", "list", "watch", "update", "patch"]
83+
# Required for Driver Mode: patch capacity/allocatable (aws.amazon.com/neurondevice).
84+
- apiGroups: [""]
85+
resources: ["nodes/status"]
86+
verbs: ["get", "patch", "update"]
87+
- apiGroups: [""]
88+
resources: ["events"]
89+
verbs: ["create", "patch"]
90+
- apiGroups: [""]
91+
resources: ["pods"]
92+
verbs: ["get", "list", "watch", "update", "patch"]
93+
- apiGroups: [""]
94+
resources: ["pods/status"]
95+
verbs: ["patch", "update"]
96+
---
97+
apiVersion: rbac.authorization.k8s.io/v1
98+
kind: ClusterRoleBinding
99+
metadata:
100+
name: neuron-device-plugin
101+
roleRef:
102+
apiGroup: rbac.authorization.k8s.io
103+
kind: ClusterRole
104+
name: neuron-device-plugin
105+
subjects:
106+
- kind: ServiceAccount
107+
name: neuron-device-plugin
108+
namespace: kube-system
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
apiVersion: v1
2+
kind: Secret
3+
metadata:
4+
name: azure-blob-csi-static-secret
5+
namespace: default
6+
type: Opaque
7+
data:
8+
azurestorageaccountname: {{encoded_account_name}}
9+
azurestorageaccountkey: {{encoded_account_key}}
10+
---
11+
apiVersion: v1
12+
kind: PersistentVolume
13+
metadata:
14+
name: azure-blob-csi-pv
15+
annotations:
16+
pv.kubernetes.io/provisioned-by: blob.csi.azure.com
17+
spec:
18+
capacity:
19+
storage: 5Gi
20+
accessModes:
21+
- ReadWriteMany
22+
persistentVolumeReclaimPolicy: Retain
23+
storageClassName: empty-storage-class
24+
mountOptions:
25+
- -o allow_other
26+
- --file-cache-timeout-in-seconds=120
27+
- --use-attr-cache=true
28+
- --cancel-list-on-mount-seconds=10
29+
- --log-level=LOG_WARNING
30+
csi:
31+
driver: blob.csi.azure.com
32+
volumeHandle: {{storage_account}}_{{blob_container}}
33+
volumeAttributes:
34+
resourceGroup: {{resource_group}}
35+
storageAccount: {{storage_account}}
36+
containerName: {{blob_container}}
37+
protocol: fuse2
38+
nodeStageSecretRef:
39+
name: azure-blob-csi-static-secret
40+
namespace: default
41+
claimRef:
42+
name: {{pvc_name}}
43+
namespace: default
44+
---
45+
apiVersion: v1
46+
kind: PersistentVolumeClaim
47+
metadata:
48+
name: {{pvc_name}}
49+
namespace: default
50+
spec:
51+
accessModes:
52+
- ReadWriteMany
53+
resources:
54+
requests:
55+
storage: 5Gi
56+
storageClassName: empty-storage-class
57+
volumeName: azure-blob-csi-pv
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
apiVersion: v1
2+
kind: PersistentVolume
3+
metadata:
4+
name: s3-csi-pv
5+
spec:
6+
accessModes:
7+
- ReadOnlyMany
8+
capacity:
9+
# S3 does not enforce capacity limits and does not charge based on this
10+
# value. The field is required by the Kubernetes API but ignored by the
11+
# S3 CSI driver. Set to any value; it has no effect on cost or usage.
12+
storage: 1Ti
13+
storageClassName: ""
14+
mountOptions:
15+
- allow-other
16+
- region {{ s3_region }}
17+
csi:
18+
driver: s3.csi.aws.com
19+
volumeAttributes:
20+
bucketName: {{ s3_bucket }}
21+
volumeHandle: s3-csi-volume-handle
22+
claimRef:
23+
name: s3-csi-static-pvc
24+
namespace: default
25+
---
26+
apiVersion: v1
27+
kind: PersistentVolumeClaim
28+
metadata:
29+
name: s3-csi-static-pvc
30+
namespace: default
31+
spec:
32+
accessModes:
33+
- ReadOnlyMany
34+
resources:
35+
requests:
36+
storage: 1Ti
37+
storageClassName: ""
38+
volumeName: s3-csi-pv

perfkitbenchmarker/data/container/kubernetes_mysql_sysbench/kubernetes_mysql_sysbench.yaml.j2

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@ spec:
146146
app: mysql-app
147147
app.kubernetes.io/managed-by: pkb
148148
spec:
149+
tolerations:
150+
- key: "kubernetes.io/arch"
151+
operator: "Equal"
152+
value: "arm64"
153+
effect: "NoSchedule"
149154
securityContext:
150155
sysctls:
151156
- name: net.ipv4.tcp_fin_timeout
File renamed without changes.
File renamed without changes.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
FROM nvcr.io/nvidia/nemo:25.09.00
2+
ENV NCCL_TUNER_CONFIG_PATH="/usr/local/gib/configs/tuner_config_a4.txtpb"
3+
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /etc/apt/trusted.gpg.d/cloud.google.gpg
4+
RUN echo 'deb https://packages.cloud.google.com/apt gpudirect-gib-apt main' | tee /etc/apt/sources.list.d/nccl-gib.list
5+
RUN apt update
6+
# -y answers apt's confirmation prompt; docker build has no interactive stdin,
# so an unanswered prompt would abort the install.
RUN apt install -y nccl-gib
7+
# Prepend the gIB libraries to the library search path. The variable reference
# must be unescaped so Docker substitutes the value inherited from the base
# image; with "\$" the literal text "$LD_LIBRARY_PATH" would be stored instead.
ENV LD_LIBRARY_PATH=/usr/local/gib/lib64:${LD_LIBRARY_PATH}
8+
ENV NCCL_DEBUG=INFO \
9+
NCCL_LIB_DIR=/usr/local/gib/scripts \
10+
OMPI_MCA_btl=tcp,self \
11+
OMPI_MCA_mtl=^ofi \
12+
OMPI_MCA_pml=^ucx \
13+
OMPI_MCA_btl_tcp_if_include=enp0s19 \
14+
PMIX_MCA_gds=^ds12 \
15+
NCCL_SOCKET_IFNAME=enp0s19,enp192s20
16+
ENV NCCL_NET=gIB
17+
ENV NCCL_CROSS_NIC=0
18+
ENV NCCL_NET_GDR_LEVEL=PIX
19+
ENV NCCL_P2P_NET_CHUNKSIZE=131072
20+
ENV NCCL_NVLS_CHUNKSIZE=524288
21+
ENV NCCL_IB_ADAPTIVE_ROUTING=1
22+
ENV NCCL_IB_QPS_PER_CONNECTION=4
23+
ENV NCCL_IB_TC=52
24+
ENV NCCL_IB_FIFO_TC=84
25+
ENV NCCL_TUNER_CONFIG_PATH="/usr/local/gib/configs/tuner_config_a4.txtpb"

0 commit comments

Comments
 (0)