GoogleCloudPlatform
diff --git a/‎CHANGES.next.md‎
Lines changed: 9 additions & 0 deletions b/‎CHANGES.next.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎perfkitbenchmarker/benchmark_spec.py‎
Lines changed: 16 additions & 7 deletions b/‎perfkitbenchmarker/benchmark_spec.py‎
Lines changed: 16 additions & 7 deletions
diff --git a/‎perfkitbenchmarker/configs/default_benchmark_config.yaml‎
Lines changed: 25 additions & 2 deletions b/‎perfkitbenchmarker/configs/default_benchmark_config.yaml‎
Lines changed: 25 additions & 2 deletions
diff --git a/‎perfkitbenchmarker/data/container/aws/neuron-device-plugin.yaml.j2‎
Lines changed: 108 additions & 0 deletions b/‎perfkitbenchmarker/data/container/aws/neuron-device-plugin.yaml.j2‎
Lines changed: 108 additions & 0 deletions
diff --git a/‎perfkitbenchmarker/data/container/azure/blobfuse-pv-pvc.yaml.j2‎
Lines changed: 57 additions & 0 deletions b/‎perfkitbenchmarker/data/container/azure/blobfuse-pv-pvc.yaml.j2‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎perfkitbenchmarker/data/container/kubernetes_ai_inference/s3_pv_pvc.yaml.j2‎
Lines changed: 38 additions & 0 deletions b/‎perfkitbenchmarker/data/container/kubernetes_ai_inference/s3_pv_pvc.yaml.j2‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎perfkitbenchmarker/data/container/kubernetes_mysql_sysbench/kubernetes_mysql_sysbench.yaml.j2‎
Lines changed: 5 additions & 0 deletions b/‎perfkitbenchmarker/data/container/kubernetes_mysql_sysbench/kubernetes_mysql_sysbench.yaml.j2‎
Lines changed: 5 additions & 0 deletions
diff --git a/perfkitbenchmarker/data/dgxc/a4.dockerfile b/perfkitbenchmarker/data/dgxc/nvidia+nemo+25.07.01.a4.dockerfile
rename from perfkitbenchmarker/data/dgxc/a4.dockerfile
rename to perfkitbenchmarker/data/dgxc/nvidia+nemo+25.07.01.a4.dockerfile
diff --git a/perfkitbenchmarker/data/dgxc/p6.dockerfile b/perfkitbenchmarker/data/dgxc/nvidia+nemo+25.07.01.p6.dockerfile
rename from perfkitbenchmarker/data/dgxc/p6.dockerfile
rename to perfkitbenchmarker/data/dgxc/nvidia+nemo+25.07.01.p6.dockerfile
diff --git a/‎perfkitbenchmarker/data/dgxc/nvidia+nemo+25.09.00.a4.dockerfile‎
Lines changed: 25 additions & 0 deletions b/‎perfkitbenchmarker/data/dgxc/nvidia+nemo+25.09.00.a4.dockerfile‎
Lines changed: 25 additions & 0 deletions
@@ -127,6 +127,7 @@
     bringing this variant more in-line with others.
 -   Rename GKE Autopilot & EKS Auto mode's cluster_type within PKB from
     'Autopilot' to 'Auto'.
+-   Duplicate metadata for all samples.
 
 ### New features:
 
@@ -284,10 +285,13 @@
 -   Re-enable support for Rocky Linux 8, 9, and 10 for the Azure provider.
 -   Add Ubuntu 26.04 support for GCP, AWS, and Azure Providers.
 -   Add a kubernetes-native benchmark for MySQL using sysbench
+-   Add `kafka_benchmark` support.
 
 ### Enhancements:
 
 -   Add numactl and perf support to netperf for analysis
+-   Add support for multi-NIC setups in Redis Memtier benchmark to distribute
+    load across multiple network interfaces.
 -   Additions to MongoDB in Artemis/PKB to facilitate workload analysis
 -   Updated `sar` switch to efficiently collect all sar metrics during the run,
     and download the file for hands-on analysis (no parsing).
@@ -464,9 +468,13 @@
     creation can be retried on stock outs.
 -   Add support for deploying VMs inside managed VM groups with
     `--use_managed_vm_groups`.
+-   Add support for configuring IMDSv2 HTTP tokens on AWS VMs via
+    `aws_metadata_http_tokens`.
+-   Add aggregate memtier results to `kubernetes_redis_memtier` benchmark.
 
 ### Bug fixes and maintenance updates:
 
+-   Update `sysbench_thread_init_timeout` default to 180 seconds.
 -   Add 'runcpu --update' and 'runcpu --version' commands to install phase.
 -   Set the command to download preprovisioned data to be robust and have a five
     minute timeout.
@@ -676,3 +684,4 @@
 -   Set firewall rule as PKB-created before deletion when using
     --gce_firewall_rules_clean_all.
 -   Added gke_kubernetes_nginx to default benchmark config.
+-   Added gke_kubernetes_redis_memtier to default benchmark config.
@@ -245,7 +245,7 @@ def __init__(
     self.data_dir: str
     self.ckpt_dir: str
     # Used by redis_memtier and keydb_memtier
-    self.redis_endpoint_ip: str
+    self.redis_endpoint_ip: list[str]
     self.keydb_endpoint_ip: str
     # Used by mongodb_ycsb
     self.mongodb_url: str
@@ -823,13 +823,17 @@ def ConstructVirtualMachineGroup(
     # If the VM group is managed, create a managed VM group and return.
     if is_managed:
       managed_vm_group_class = managed_vm_group.GetManagedVmGroupClass(cloud)
-      # TODO(pclay): support multiple zones:
-      if FLAGS.zone:
-        assert len(FLAGS.zone) == 1, 'Managed VM groups only support one zone.'
-        group_spec.vm_spec.zone = FLAGS.zone[0]
-      vm_config = self._CreateVirtualMachine(group_spec.vm_spec, os_type, cloud)
+      zones = FLAGS.zone or [group_spec.vm_spec.zone]
+      vm_configs = []
+      # VM groups need a VM for each zone to create subnets in each zone.
+      for zone in zones:
+        spec = copy.copy(group_spec.vm_spec)
+        spec.zone = zone
+        vm_config = self._CreateVirtualMachine(spec, os_type, cloud)
+        vm_config.zone = zone
+        vm_configs.append(vm_config)
       group = managed_vm_group_class(
-          group_spec, vm_config
+          group_spec, vm_configs
       )  # pytype: disable=not-instantiable
       self.managed_vm_groups[group_name] = group
       # Report resource provisioning times.
@@ -1244,6 +1248,11 @@ def Delete(self):
             'Got an exception deleting CapacityReservations. '
             'Attempting to continue tearing down.'
         )
+    if self.managed_vm_groups:
+      background_tasks.RunThreaded(
+          lambda vm_group: vm_group.Delete(),
+          list(self.managed_vm_groups.values())
+      )
 
     if self.vms:
       try:
 
@@ -321,6 +321,21 @@ gke_kubernetes_nginx:
     nginx_content_size: 1024
     nginx_p99_latency_threshold: 100
 
+gke_kubernetes_redis_memtier:
+  name: kubernetes_redis_memtier
+  flags:
+    redis_server_version: '7.4.7'
+    memtier_protocol: 'redis'
+    memtier_clients: 12
+    memtier_threads: 32
+    memtier_ratio: '1:4'
+    memtier_data_size: 1024
+    memtier_key_maximum: 6400000
+    memtier_run_duration: 15
+    redis_server_io_threads: '4'
+    redis_server_io_threads_do_reads: True
+    kubernetes_redis_memtier_save: '""'
+
 redis_memtier_session_storage_cluster:
   name: redis_memtier
   flags:
@@ -461,22 +476,30 @@ mysql_sysbench_oltp_read_only_lssd:
     sysbench_testname: oltp_read_only
     <<: *sysbench_oltp_base_lssd
 
-mysql_sysbench_oltp_read_only_lssd_with_fio_prefill:
+mysql_sysbench_oltp_read_only_lssd_with_7T_fio_prefill:
   name: unmanaged_mysql_sysbench_lssd
   flags:
     sysbench_testname: oltp_read_only
     <<: *sysbench_oltp_base_lssd
     sysbench_tables: 210
     sysbench_load_threads: 210
 
-mysql_sysbench_oltp_read_write_lssd_with_fio_prefill:
+mysql_sysbench_oltp_read_write_lssd_with_4T_fio_prefill:
   name: unmanaged_mysql_sysbench_lssd
   flags:
     sysbench_testname: oltp_read_write
     <<: *sysbench_oltp_base_lssd
     sysbench_tables: 210
     sysbench_load_threads: 210
 
+mysql_sysbench_oltp_read_write_lssd_with_7T_fio_prefill:
+  name: unmanaged_mysql_sysbench_lssd
+  flags:
+    sysbench_testname: oltp_read_write
+    <<: *sysbench_oltp_base_lssd
+    sysbench_tables: 420
+    sysbench_load_threads: 420
+
 mysql_sysbench_oltp_read_write_lssd:
   name: unmanaged_mysql_sysbench
   flags:
 
@@ -0,0 +1,108 @@
+# AWS Neuron Device Plugin DaemonSet.
+# Based on upstream aws-neuron-sdk k8s manifests
+# (https://github.com/aws-neuron/aws-neuron-sdk).
+# Image: neuron_device_plugin_image (ApplyManifest kwarg); default in template.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: neuron-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: neuron-device-plugin-ds  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        name: neuron-device-plugin-ds
+    spec:
+      serviceAccountName: neuron-device-plugin  # ServiceAccount declared later in this manifest
+      tolerations:
+        - key: aws.amazon.com/neuron  # tolerate the Neuron taint, whatever its value
+          operator: Exists
+          effect: NoSchedule
+      priorityClassName: system-node-critical
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:  # hard requirement at scheduling time only
+            nodeSelectorTerms:
+              - matchExpressions:
+                  # Relies on the eks.amazonaws.com/instance-family label set
+                  # by Karpenter and EKS Auto Mode on Neuron-capable nodes.
+                  - key: eks.amazonaws.com/instance-family
+                    operator: In
+                    values:  # instance families the plugin is scheduled onto
+                      - inf1
+                      - inf2
+                      - trn1
+                      - trn1n
+                      - trn2
+                      - trn2u
+      containers:
+        - image: {{ neuron_device_plugin_image | default('public.ecr.aws/neuron/neuron-device-plugin:2.22.4.0') }}
+          imagePullPolicy: Always
+          name: neuron-device-plugin
+          env:
+            - name: KUBECONFIG
+              value: /etc/kubernetes/kubelet.conf
+            - name: NODE_NAME  # downward API: name of the node this pod runs on
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+            - name: infa-map
+              mountPath: /run
+      volumes:
+        - name: device-plugin  # host directory shared with the container at the same path
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
+        - name: infa-map  # host /run exposed to the container at /run
+          hostPath:
+            path: /run
+---
+apiVersion: v1
+kind: ServiceAccount  # identity referenced by the DaemonSet's serviceAccountName
+metadata:
+  name: neuron-device-plugin
+  namespace: kube-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # cluster-scoped permissions; bound by the ClusterRoleBinding of the same name
+metadata:
+  name: neuron-device-plugin
+rules:
+  - apiGroups: [""]  # "" is the core API group
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch", "update", "patch"]
+  # Required for Driver Mode: patch capacity/allocatable (aws.amazon.com/neurondevice).
+  - apiGroups: [""]
+    resources: ["nodes/status"]
+    verbs: ["get", "patch", "update"]
+  - apiGroups: [""]
+    resources: ["events"]  # create/patch only; events are never read
+    verbs: ["create", "patch"]
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch", "update", "patch"]
+  - apiGroups: [""]
+    resources: ["pods/status"]  # write-only access to the status subresource
+    verbs: ["patch", "update"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # grants the neuron-device-plugin ClusterRole to the ServiceAccount below
+metadata:
+  name: neuron-device-plugin
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: neuron-device-plugin
+subjects:
+  - kind: ServiceAccount
+    name: neuron-device-plugin
+    namespace: kube-system  # same namespace the ServiceAccount is created in
@@ -0,0 +1,57 @@
+apiVersion: v1
+kind: Secret  # storage-account credentials referenced by the PV's nodeStageSecretRef
+metadata:
+  name: azure-blob-csi-static-secret
+  namespace: default
+type: Opaque
+data:  # values under `data` must be base64; presumably the caller pre-encodes them — TODO confirm
+  azurestorageaccountname: {{encoded_account_name}}
+  azurestorageaccountkey: {{encoded_account_key}}
+---
+apiVersion: v1
+kind: PersistentVolume  # statically provisioned volume backed by the blob.csi.azure.com driver
+metadata:
+  name: azure-blob-csi-pv
+  annotations:
+    pv.kubernetes.io/provisioned-by: blob.csi.azure.com
+spec:
+  capacity:
+    storage: 5Gi  # matches the request in the PVC below
+  accessModes:
+    - ReadWriteMany
+  persistentVolumeReclaimPolicy: Retain  # keep the volume when the claim is deleted
+  storageClassName: empty-storage-class  # same class name as the PVC below
+  mountOptions:  # NOTE(review): presumably passed through to blobfuse by the CSI driver — confirm
+    - -o allow_other
+    - --file-cache-timeout-in-seconds=120
+    - --use-attr-cache=true
+    - --cancel-list-on-mount-seconds=10
+    - --log-level=LOG_WARNING
+  csi:
+    driver: blob.csi.azure.com
+    volumeHandle: {{storage_account}}_{{blob_container}}  # built from account and container names
+    volumeAttributes:
+      resourceGroup: {{resource_group}}
+      storageAccount: {{storage_account}}
+      containerName: {{blob_container}}
+      protocol: fuse2
+    nodeStageSecretRef:  # the Secret declared in the first document of this manifest
+      name: azure-blob-csi-static-secret
+      namespace: default
+  claimRef:  # reserve this PV for the PVC declared below
+    name: {{pvc_name}}
+    namespace: default
+---
+apiVersion: v1
+kind: PersistentVolumeClaim  # claim statically bound to azure-blob-csi-pv
+metadata:
+  name: {{pvc_name}}
+  namespace: default
+spec:
+  accessModes:
+    - ReadWriteMany  # same access mode as the PV
+  resources:
+    requests:
+      storage: 5Gi
+  storageClassName: empty-storage-class  # same class name as the PV
+  volumeName: azure-blob-csi-pv  # bind to this specific PV by name
@@ -0,0 +1,38 @@
+apiVersion: v1
+kind: PersistentVolume  # statically provisioned, read-only volume backed by the s3.csi.aws.com driver
+metadata:
+  name: s3-csi-pv
+spec:
+  accessModes:
+    - ReadOnlyMany
+  capacity:
+    # S3 does not enforce capacity limits and does not charge based on this
+    # value. The field is required by the Kubernetes API but ignored by the
+    # S3 CSI driver. Set to any value; it has no effect on cost or usage.
+    storage: 1Ti
+  storageClassName: ""  # empty class: no dynamic provisioning, bind statically
+  mountOptions:
+    - allow-other
+    - region {{ s3_region }}
+  csi:
+    driver: s3.csi.aws.com
+    volumeAttributes:
+      bucketName: {{ s3_bucket }}
+    volumeHandle: s3-csi-volume-handle
+  claimRef:  # reserve this PV for the PVC declared below
+    name: s3-csi-static-pvc
+    namespace: default
+---
+apiVersion: v1
+kind: PersistentVolumeClaim  # claim statically bound to s3-csi-pv
+metadata:
+  name: s3-csi-static-pvc
+  namespace: default
+spec:
+  accessModes:
+    - ReadOnlyMany  # same access mode as the PV
+  resources:
+    requests:
+      storage: 1Ti  # same nominal size as the PV's capacity
+  storageClassName: ""  # empty class, same as the PV
+  volumeName: s3-csi-pv  # bind to this specific PV by name
@@ -146,6 +146,11 @@ spec:
         app: mysql-app
         app.kubernetes.io/managed-by: pkb
     spec:
+      tolerations:
+      - key: "kubernetes.io/arch"
+        operator: "Equal"
+        value: "arm64"
+        effect: "NoSchedule"
       securityContext:
         sysctls:
         - name: net.ipv4.tcp_fin_timeout
 
@@ -0,0 +1,34 @@
+# NeMo 25.09.00 image that adds the nccl-gib package from Google's
+# gpudirect-gib-apt repository and configures NCCL to use it.
+FROM nvcr.io/nvidia/nemo:25.09.00
+# Trust the Google Cloud apt signing key and register the repository.
+# -f makes curl fail on an HTTP error instead of piping an error page to gpg.
+RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /etc/apt/trusted.gpg.d/cloud.google.gpg
+RUN echo 'deb https://packages.cloud.google.com/apt gpudirect-gib-apt main' | tee /etc/apt/sources.list.d/nccl-gib.list
+# Update and install in a single layer so a stale cached package index is
+# never reused; -y is needed because the build cannot answer apt's prompt.
+RUN apt-get update && \
+    apt-get install -y nccl-gib
+# Prepend the gIB libraries to the inherited search path. The "$" must not be
+# backslash-escaped: Docker would then store the text literally and drop the
+# base image's LD_LIBRARY_PATH.
+ENV LD_LIBRARY_PATH=/usr/local/gib/lib64:$LD_LIBRARY_PATH
+ENV NCCL_DEBUG=INFO \
+    NCCL_LIB_DIR=/usr/local/gib/scripts \
+    OMPI_MCA_btl=tcp,self \
+    OMPI_MCA_mtl=^ofi \
+    OMPI_MCA_pml=^ucx \
+    OMPI_MCA_btl_tcp_if_include=enp0s19 \
+    PMIX_MCA_gds=^ds12 \
+    NCCL_SOCKET_IFNAME=enp0s19,enp192s20
+# NCCL settings for the gIB network plugin.
+ENV NCCL_NET=gIB
+ENV NCCL_CROSS_NIC=0
+ENV NCCL_NET_GDR_LEVEL=PIX
+ENV NCCL_P2P_NET_CHUNKSIZE=131072
+ENV NCCL_NVLS_CHUNKSIZE=524288
+ENV NCCL_IB_ADAPTIVE_ROUTING=1
+ENV NCCL_IB_QPS_PER_CONNECTION=4
+ENV NCCL_IB_TC=52
+ENV NCCL_IB_FIFO_TC=84
+ENV NCCL_TUNER_CONFIG_PATH="/usr/local/gib/configs/tuner_config_a4.txtpb"