Skip to content

Commit 72a5b18

Browse files
committed
mantle/system/nproc: account for page cache in cgroup available memory
The cgroup available memory calculation used memory.current (total cgroup usage) directly, which includes page cache (file-backed memory). Since inactive page cache is reclaimable by the kernel under memory pressure, it should not count as unavailable. This caused GetCurrentMemAvailableMiB() to significantly underestimate available memory, making QEMU instance scheduling overly conservative. Read the "inactive_file" field from /sys/fs/cgroup/memory.stat, which reports, in bytes, the amount of page cache that can be easily reclaimed, and subtract it from current usage before computing available memory. The effective formula becomes: available = limit - (current - inactive_file). This mirrors how /proc/meminfo computes MemAvailable by considering reclaimable caches. A new helper getCgroupMemoryStatField() is added for parsing individual fields from memory.stat, returning 0 gracefully if the file or field is absent. Written-by: <anthropic/claude-opus-4.6>
1 parent 486c64b commit 72a5b18

1 file changed

Lines changed: 60 additions & 3 deletions

File tree

mantle/system/nproc.go

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,15 @@ func getCgroupMemoryLimitMiB() (uint, error) {
176176
}
177177

178178
// getCgroupMemoryAvailableMiB returns the available memory within the
179-
// cgroup v2 in MiB (limit - current usage), or math.MaxUint if no limit.
179+
// cgroup v2 in MiB, or math.MaxUint if no limit is set. It computes
180+
// available memory as: limit - (current - inactive_file) where inactive_file
181+
// is the file cache that is not actively used and can be evicted if needed.
182+
// (current - inactive_file) is similar to the "workingSet" calculation over in [1].
183+
// More context on this is also in [2]. This is similar to how /proc/meminfo computes
184+
// MemAvailable by considering reclaimable caches.
185+
//
186+
// [1] https://github.com/kubernetes/kubernetes/blob/ac10370ad2aebde82c2d268dd80d08df0ffc2532/test/e2e/node/node_problem_detector.go#L290-L344
187+
// [2] https://github.com/kata-containers/kata-containers/issues/10280
180188
func getCgroupMemoryAvailableMiB() (uint, error) {
181189
maxBuf, err := os.ReadFile("/sys/fs/cgroup/memory.max")
182190
if os.IsNotExist(err) {
@@ -200,8 +208,57 @@ func getCgroupMemoryAvailableMiB() (uint, error) {
200208
if err != nil {
201209
return 0, fmt.Errorf("invalid memory.current value: %w", err)
202210
}
203-
if current >= limit {
211+
212+
// Read inactive_file size from memory.stat to exclude reclaimable
213+
// file-backed memory from the usage calculation.
214+
inactiveFile, err := getCgroupMemoryStatField("inactive_file")
215+
if err != nil {
216+
return 0, err
217+
}
218+
219+
// Subtract the inactive_file size from the memory.current. This
220+
// cache should always be smaller than memory.current, but add
221+
// a check and skip the subtraction just in case.
222+
usage := current
223+
if inactiveFile < usage {
224+
usage -= inactiveFile
225+
}
226+
227+
// This also shouldn't happen, but in case the usage is larger
228+
// than the limit let's just return that there's 0 available memory.
229+
if usage >= limit {
230+
return 0, nil
231+
}
232+
return uint((limit - usage) / (1024 * 1024)), nil
233+
}
234+
235+
// getCgroupMemoryStatField reads a specific field from
236+
// /sys/fs/cgroup/memory.stat and returns its value (bytes for size fields).
237+
// The file contains key-value pairs like "file 123456789".
238+
// Returns 0 if the file does not exist or the field is not found.
239+
func getCgroupMemoryStatField(field string) (uint64, error) {
240+
f, err := os.Open("/sys/fs/cgroup/memory.stat")
241+
if os.IsNotExist(err) {
204242
return 0, nil
243+
} else if err != nil {
244+
return 0, fmt.Errorf("reading memory.stat: %w", err)
245+
}
246+
defer f.Close()
247+
248+
scanner := bufio.NewScanner(f)
249+
for scanner.Scan() {
250+
parts := strings.Fields(scanner.Text())
251+
if len(parts) == 2 && parts[0] == field {
252+
val, err := strconv.ParseUint(parts[1], 10, 64)
253+
if err != nil {
254+
return 0, fmt.Errorf("parsing memory.stat field %s: %w", field, err)
255+
}
256+
return val, nil
257+
}
258+
}
259+
if err := scanner.Err(); err != nil {
260+
return 0, fmt.Errorf("scanning memory.stat: %w", err)
205261
}
206-
return uint((limit - current) / (1024 * 1024)), nil
262+
// Field not found; return 0 so callers degrade gracefully.
263+
return 0, nil
207264
}

0 commit comments

Comments
 (0)