Skip to content

Commit b46e687

Browse files
TheRayquazaclaude
andcommitted
[local] runtime-rs: port Go patches (CPU shares, block device annotation mounts)
CPU shares fallback: when no quota or cpuset is set, calculate vCPUs from CPU shares (shares/1024) mirroring Go's CalculateCPUsF(). Block device annotation mounts: parse io.katacontainers.volume.block-mounts annotation and convert matching volumeDevices into agent Storage objects. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent f09414e commit b46e687

4 files changed

Lines changed: 227 additions & 4 deletions

File tree

src/libs/kata-types/src/annotations/mod.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ use crate::config::TomlConfig;
1717
use crate::initdata::add_hypervisor_initdata_overrides;
1818
use crate::sl;
1919

20-
use self::cri_containerd::{SANDBOX_CPU_PERIOD_KEY, SANDBOX_CPU_QUOTA_KEY, SANDBOX_MEM_KEY};
20+
use self::cri_containerd::{
21+
SANDBOX_CPU_PERIOD_KEY, SANDBOX_CPU_QUOTA_KEY, SANDBOX_CPU_SHARE_KEY, SANDBOX_MEM_KEY,
22+
};
2123

2224
/// CRI-containerd specific annotations.
2325
pub mod cri_containerd;
@@ -443,6 +445,14 @@ impl Annotation {
443445
value.unwrap_or(0)
444446
}
445447

448+
/// Get the annotation of cpu shares for sandbox
449+
pub fn get_sandbox_cpu_shares(&self) -> u64 {
450+
let value = self
451+
.get_value::<u64>(SANDBOX_CPU_SHARE_KEY)
452+
.unwrap_or(Some(0));
453+
value.unwrap_or(0)
454+
}
455+
446456
/// Get the annotation of memory for sandbox
447457
pub fn get_sandbox_mem(&self) -> i64 {
448458
let value = self.get_value::<i64>(SANDBOX_MEM_KEY).unwrap_or(Some(0));

src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,24 @@ impl CpuResource {
168168
return Ok(cpuset_vcpu.len() as f32);
169169
}
170170

171+
// Fallback to CPU shares when quota and cpuset are both absent.
172+
// This handles pods with no resource limits but with CPU shares set.
173+
// Mirrors the Go runtime's CalculateCPUsF() fallback logic.
174+
if total_quota == 0.0 && cpuset_vcpu.is_empty() {
175+
let total_shares: u64 = resources
176+
.values()
177+
.map(|cpu_resource| cpu_resource.shares())
178+
.sum();
179+
if total_shares > 0 {
180+
let shares_vcpu = total_shares as f32 / 1024.0;
181+
info!(
182+
sl!(),
183+
"(from cpu_shares) get vcpus # {}", shares_vcpu
184+
);
185+
return Ok(shares_vcpu.max(1.0));
186+
}
187+
}
188+
171189
// When quota is set: calculate vCPUs as quota/period after normalization
172190
if total_quota > 0.0 && max_period > 0.0 {
173191
let quota_vcpu = total_quota / max_period;

src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ impl TryFrom<&HashMap<String, String>> for InitialSize {
3232

3333
let annotation = Annotation::new(an.clone());
3434
let (period, quota, memory) =
35-
get_sizing_info(annotation).context("failed to get sizing info")?;
35+
get_sizing_info(annotation.clone()).context("failed to get sizing info")?;
3636
let mut cpu = oci::LinuxCpu::default();
3737
cpu.set_period(Some(period));
3838
cpu.set_quota(Some(quota));
@@ -42,6 +42,17 @@ impl TryFrom<&HashMap<String, String>> for InitialSize {
4242
if let Ok(cpu_resource) = LinuxContainerCpuResources::try_from(&cpu) {
4343
vcpu = get_nr_vcpu(&cpu_resource);
4444
}
45+
46+
// When neither quota/period nor cpuset constrain the CPU, fall back to
47+
// the sandbox CPU-shares annotation (mirrors Go's CalculateSandboxSizing
48+
// calling CalculateCPUsF(quota, period, shares)).
49+
if vcpu == 0.0 {
50+
let shares = annotation.get_sandbox_cpu_shares();
51+
if shares > 0 {
52+
vcpu = shares as f32 / 1024.0;
53+
}
54+
}
55+
4556
let mem_mb = convert_memory_to_mb(memory);
4657

4758
Ok(Self {
@@ -259,6 +270,71 @@ mod tests {
259270
.to_vec()
260271
}
261272

273+
// Test that sandbox CPU-shares annotation is used as vcpu fallback when
274+
// quota and period are absent. Mirrors Go's CalculateCPUsF(0, 0, shares).
275+
#[test]
276+
fn test_initial_size_sandbox_cpu_shares_fallback() {
277+
let cases: &[(&str, u64, f32)] = &[
278+
("1024 shares = 1.0 vcpu", 1024, 1.0),
279+
("2048 shares = 2.0 vcpu", 2048, 2.0),
280+
("512 shares = 0.5 vcpu", 512, 0.5),
281+
("0 shares = 0.0 vcpu (no fallback)", 0, 0.0),
282+
];
283+
284+
for (desc, shares, expected_vcpu) in cases {
285+
let annotations = HashMap::from([
286+
(
287+
cri_containerd::CONTAINER_TYPE_LABEL_KEY.to_string(),
288+
cri_containerd::SANDBOX.to_string(),
289+
),
290+
(
291+
cri_containerd::SANDBOX_CPU_SHARE_KEY.to_string(),
292+
format!("{}", shares),
293+
),
294+
]);
295+
296+
let initial_size = InitialSize::try_from(&annotations).unwrap();
297+
assert_eq!(
298+
initial_size.vcpu, *expected_vcpu,
299+
"{}: got vcpu={}, expected {}",
300+
desc, initial_size.vcpu, expected_vcpu
301+
);
302+
}
303+
}
304+
305+
// Verify quota/period take precedence over shares when both are present.
306+
#[test]
307+
fn test_initial_size_sandbox_quota_wins_over_shares() {
308+
let annotations = HashMap::from([
309+
(
310+
cri_containerd::CONTAINER_TYPE_LABEL_KEY.to_string(),
311+
cri_containerd::SANDBOX.to_string(),
312+
),
313+
(
314+
cri_containerd::SANDBOX_CPU_PERIOD_KEY.to_string(),
315+
"100000".to_string(),
316+
),
317+
(
318+
cri_containerd::SANDBOX_CPU_QUOTA_KEY.to_string(),
319+
"220000".to_string(),
320+
),
321+
// shares set too — quota should win
322+
(
323+
cri_containerd::SANDBOX_CPU_SHARE_KEY.to_string(),
324+
"4096".to_string(), // would be 4.0 if shares were used
325+
),
326+
]);
327+
328+
let initial_size = InitialSize::try_from(&annotations).unwrap();
329+
// 220_000/100_000 = 2.2 → ceil = 3.0, not 4.0 from shares
330+
assert_eq!(
331+
initial_size.vcpu.ceil(),
332+
3.0,
333+
"quota should take precedence over shares, got vcpu={}",
334+
initial_size.vcpu
335+
);
336+
}
337+
262338
#[test]
263339
fn test_initial_size_sandbox() {
264340
let tests = get_test_data();

src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs

Lines changed: 121 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
use std::collections::HashMap;
88
use std::sync::Arc;
99

10-
use agent::Agent;
10+
use agent::{Agent, Storage};
1111
use anyhow::{anyhow, Context, Result};
1212
use common::{
1313
error::Error,
@@ -26,7 +26,8 @@ use oci_spec::runtime as oci;
2626

2727
use oci::{LinuxResources, Process as OCIProcess};
2828
use resource::{
29-
cdi_devices::container_device::annotate_container_devices, ResourceManager, ResourceUpdateOp,
29+
cdi_devices::{container_device::annotate_container_devices, ContainerDevice},
30+
ResourceManager, ResourceUpdateOp,
3031
};
3132
use tokio::sync::RwLock;
3233

@@ -202,6 +203,15 @@ impl Container {
202203
.resource_manager
203204
.handler_devices(&config.container_id, linux)
204205
.await?;
206+
207+
// Handle annotation-based block device mounts.
208+
// Devices listed in the annotation are mounted as filesystems by the agent
209+
// rather than passed as raw block devices, so filter them out first.
210+
let (container_devices, annotation_storages) =
211+
handle_block_mount_annotation(&updated_annotations, container_devices, &mut spec)
212+
.context("handle block mount annotation")?;
213+
storages.extend(annotation_storages);
214+
205215
let devices_agent = annotate_container_devices(&mut spec, container_devices)
206216
.context("annotate container devices failed")?;
207217

@@ -643,6 +653,115 @@ impl Container {
643653
}
644654
}
645655

656+
const BLOCK_DEVICE_MOUNTS_ANNOTATION: &str = "io.katacontainers.volume.block-mounts";
657+
658+
#[derive(Debug, serde::Deserialize)]
659+
struct BlockMountConfig {
660+
mount: String,
661+
#[serde(default)]
662+
fstype: String,
663+
#[serde(default)]
664+
options: Vec<String>,
665+
}
666+
667+
// Parses the block mount annotation and processes any matching container devices.
668+
//
669+
// Devices listed in the annotation are meant to be mounted as filesystems by the agent
670+
// (not passed as raw block devices). This function:
671+
// 1. Splits container_devices into annotated and remaining sets.
672+
// 2. Creates a Storage object for each annotated device.
673+
// 3. Adds a bind mount in the OCI spec from the guest storage path to the destination.
674+
// 4. Removes matching devices from spec.linux.devices.
675+
//
676+
// Returns (remaining_devices, new_storages).
677+
fn handle_block_mount_annotation(
678+
annotations: &HashMap<String, String>,
679+
container_devices: Vec<ContainerDevice>,
680+
spec: &mut oci::Spec,
681+
) -> Result<(Vec<ContainerDevice>, Vec<Storage>)> {
682+
let raw = match annotations.get(BLOCK_DEVICE_MOUNTS_ANNOTATION) {
683+
Some(v) if !v.is_empty() => v.as_str(),
684+
_ => return Ok((container_devices, vec![])),
685+
};
686+
687+
let block_mounts: HashMap<String, BlockMountConfig> =
688+
serde_json::from_str(raw).context("failed to parse block mount annotation")?;
689+
690+
if block_mounts.is_empty() {
691+
return Ok((container_devices, vec![]));
692+
}
693+
694+
let mut storages = Vec::new();
695+
let mut mounted_paths: HashMap<String, bool> = HashMap::new();
696+
697+
let (annotated, remaining): (Vec<_>, Vec<_>) = container_devices
698+
.into_iter()
699+
.partition(|cd| block_mounts.contains_key(&cd.device.container_path));
700+
701+
for cd in annotated {
702+
let config = block_mounts
703+
.get(&cd.device.container_path)
704+
.expect("partition guarantees key exists");
705+
706+
let source = cd.device.id.clone();
707+
let driver = cd.device.field_type.clone();
708+
let fstype = if config.fstype.is_empty() {
709+
"ext4".to_string()
710+
} else {
711+
config.fstype.clone()
712+
};
713+
let options = if config.options.is_empty() {
714+
vec!["rw".to_string()]
715+
} else {
716+
config.options.clone()
717+
};
718+
719+
// Build a unique, valid path component from the PCI source string.
720+
let sanitized: String = source
721+
.chars()
722+
.map(|c| if c.is_alphanumeric() || c == '-' { c } else { '_' })
723+
.collect();
724+
let guest_mount_point = format!("/run/kata-containers/storage/{}", sanitized);
725+
726+
storages.push(Storage {
727+
driver,
728+
source,
729+
fs_type: fstype,
730+
options,
731+
mount_point: guest_mount_point.clone(),
732+
..Default::default()
733+
});
734+
735+
// Add bind mount: agent mounts the block device at guest_mount_point, then
736+
// the bind mount exposes it at the container destination path.
737+
let mut bind_mount = oci::Mount::default();
738+
bind_mount.set_destination(std::path::PathBuf::from(&config.mount));
739+
bind_mount.set_typ(Some("bind".to_string()));
740+
bind_mount.set_source(Some(std::path::PathBuf::from(&guest_mount_point)));
741+
bind_mount.set_options(Some(vec!["bind".to_string()]));
742+
743+
let mut mounts = spec.mounts().clone().unwrap_or_default();
744+
mounts.push(bind_mount);
745+
spec.set_mounts(Some(mounts));
746+
747+
mounted_paths.insert(cd.device.container_path, true);
748+
}
749+
750+
// Remove matched devices from spec.linux.devices so the agent doesn't
751+
// see them as character/block devices in the container namespace.
752+
if !mounted_paths.is_empty() {
753+
if let Some(linux) = spec.linux_mut() {
754+
if let Some(devices) = linux.devices_mut() {
755+
devices.retain(|d| {
756+
!mounted_paths.contains_key(&d.path().display().to_string())
757+
});
758+
}
759+
}
760+
}
761+
762+
Ok((remaining, storages))
763+
}
764+
646765
fn amend_spec(
647766
spec: &mut oci::Spec,
648767
disable_guest_seccomp: bool,

0 commit comments

Comments
 (0)