fix container metrics for hosting envs with nested cgroup hierarchies | add missing uptime monitor metric

mvogttech · mvogttech · commit e44ba4a708ad · 2026-03-02T12:43:48.000-06:00
diff --git a/.versions b/.versions
@@ -46,7 +46,7 @@ reactive-var@1.0.13
 reload@1.3.2
 retry@1.1.1
 routepolicy@1.1.2
-skysignal:agent@1.0.19
+skysignal:agent@1.0.20
 socket-stream-client@0.6.1
 tracker@1.3.4
 typescript@5.9.3
diff --git a/README.md b/README.md
@@ -854,6 +854,12 @@ Main agent singleton instance.
 
 ## Changelog
 
+### v1.0.21 (Nested Cgroup Fix & Uptime Metric)
+
+- **Fix container metrics on Galaxy and nested cgroup hierarchies** - The cgroup detection in v1.0.18/v1.0.20 hardcoded root paths (`/sys/fs/cgroup/memory.max`, `/sys/fs/cgroup/cpu.max`). On Galaxy and other platforms that use nested cgroup hierarchies (e.g., `/sys/fs/cgroup/kubepods.slice/kubepods-pod123.slice/...`), the root files return the parent slice limit (often 512 MB or unlimited) instead of the per-container limit (e.g., 2 GB on Galaxy "Double" plan). This caused SkySignal to report 512 MB / 92% (Critical) when the real container had 2 GB at ~23% usage. Added `_getCgroupBase()` which parses `/proc/self/cgroup` to resolve the actual cgroup path for the current process, handling both cgroup v2 (`0::/` lines) and cgroup v1 (`:memory:` controller lines). All four cgroup detection methods (`_detectMemoryLimit`, `_detectCpuQuota`, `_getContainerMemoryUsage`, `_detectCgroupMemUsagePath`) now try the resolved nested path first, then fall back to root paths for simple container setups. This is the same technique used by cAdvisor, Kubernetes metrics-server, and Galaxy's own dashboard.
+- **New `uptime` metric field** - The agent now collects `Math.floor(process.uptime())` (whole seconds since the Node.js process started) on each collection cycle. Previously the System tab showed "Uptime: 0m" because the agent never sent this field.
+- **`process.constrainedMemory()` safety check** - Added `limit < Number.MAX_SAFE_INTEGER` guard to the Node 19+ `constrainedMemory()` strategy, preventing false positives when the function returns a sentinel value indicating no cgroup limit.
+
 ### v1.0.20 (Publication Context & Observer Leak Detection & Container-Aware Metrics)
 
 - **Container memory detection** - When the agent runs inside a Docker container (e.g., Meteor Galaxy), `os.totalmem()` / `os.freemem()` report host machine values, not container limits. The agent now detects cgroup memory limits via a 3-strategy fallback: `process.constrainedMemory()` (Node 19+), cgroup v2 (`/sys/fs/cgroup/memory.max`), cgroup v1 (`/sys/fs/cgroup/memory/memory.limit_in_bytes`). When a limit is found, `memoryTotal`, `memoryUsed`, `memoryFree`, and `memoryUsage` report container-level values instead of host-level values.
diff --git a/lib/collectors/SystemMetricsCollector.js b/lib/collectors/SystemMetricsCollector.js
@@ -10,7 +10,7 @@ import v8 from "v8";
 const execAsync = promisify(exec);
 
 // Agent version - must be updated alongside package.js on each release
-const AGENT_VERSION = '1.0.20';
+const AGENT_VERSION = '1.0.21';
 
 // cgroup v1 "unlimited" sentinel: values >= 2^62 mean no limit is set
 const CGROUP_V1_UNLIMITED = 2 ** 62;
@@ -273,6 +273,7 @@ export default class SystemMetricsCollector {
         ...diskStats,
         ...networkStats,
         processCount,
+        uptime: Math.floor(process.uptime()), // Process uptime in seconds
         appVersion: this.appVersion,
         buildHash: this.buildHash,
         meteorVersion,
@@ -669,6 +670,43 @@ export default class SystemMetricsCollector {
     }
   }
 
+  /**
+   * Resolve the cgroup directory that holds this process's memory accounting
+   * files by parsing /proc/self/cgroup. Handles cgroup v1, cgroup v2 and hybrid
+   * hosts, including nested hierarchies (e.g. Galaxy).
+   *
+   * Each line of /proc/self/cgroup has the form
+   * "hierarchy-ID:controller-list:cgroup-path". A v1 hierarchy names its
+   * controllers in the second field; the v2 (unified) hierarchy uses ID 0 and
+   * an empty controller list, i.e. the line starts with "0::/".
+   *
+   * @returns {Promise<{version: number, basePath: string}|null>} The cgroup
+   *   version and resolved directory, or null when not running on Linux, when
+   *   the file cannot be read, or when no usable entry is present.
+   * @private
+   */
+  async _getCgroupBase() {
+    if (this.platform !== 'linux') return null;
+
+    try {
+      const data = await fs.readFile('/proc/self/cgroup', 'utf8');
+      const lines = data.split('\n');
+
+      // cgroup v1 is checked first: hybrid hosts expose a "0::/" line as well,
+      // but the memory controller (and its limit files) still lives in the v1
+      // hierarchy, so the v2 path would point at a directory without memory.max.
+      for (const line of lines) {
+        // Only the first two ':' are field separators; the path itself may
+        // contain ':', so the remainder is re-joined instead of truncated.
+        const [, controllers = '', ...pathParts] = line.split(':');
+        if (pathParts.length === 0) continue;
+        // Compare against the parsed list so co-mounted controllers
+        // (e.g. "memory,hugetlb") are recognised too.
+        if (!controllers.split(',').includes('memory')) continue;
+
+        const relPath = pathParts.join(':').trim() || '/';
+        const base = relPath === '/'
+          ? '/sys/fs/cgroup/memory'
+          : `/sys/fs/cgroup/memory${relPath}`;
+        return { version: 1, basePath: base };
+      }
+
+      // cgroup v2: unified hierarchy entry "0::/<path>"
+      const v2Line = lines.find((l) => l.startsWith('0::/'));
+      if (v2Line) {
+        const relPath = v2Line.slice('0::'.length).trim();
+        const base = relPath === '/'
+          ? '/sys/fs/cgroup'
+          : `/sys/fs/cgroup${relPath}`;
+        return { version: 2, basePath: base };
+      }
+    } catch (e) {
+      // Best-effort: callers fall back to the root cgroup paths when this is null.
+      this._log(`Failed to read /proc/self/cgroup: ${e.message}`);
+    }
+    return null;
+  }
+
   /**
    * Detect container limits at startup (run once, cache results).
    * Sets this._containerLimits = { isContainerized, memoryLimit, cpuQuota }
@@ -681,6 +719,9 @@ export default class SystemMetricsCollector {
     if (this.platform !== 'linux') return;
 
     try {
+      // Resolve the real cgroup base path first (handles nested hierarchies like Galaxy)
+      this._cgroupBase = await this._getCgroupBase();
+
       const memoryLimit = await this._detectMemoryLimit();
       const cpuQuota = await this._detectCpuQuota();
 
@@ -690,31 +731,54 @@ export default class SystemMetricsCollector {
           memoryLimit,
           cpuQuota, // may be null if no CPU quota is set
         };
-        this._log(`Container detected: memory=${memoryLimit}, cpuQuota=${cpuQuota}`);
+        this._log(`Container detected: memory=${memoryLimit}, cpuQuota=${cpuQuota}` +
+          (this._cgroupBase ? `, cgroupBase=${this._cgroupBase.basePath}` : ''));
       }
     } catch (e) {
       this._log('Container detection failed:', e.message);
     }
   }
 
   /**
-   * Detect container memory limit using 3-strategy fallback:
-   * 1. process.constrainedMemory() (Node 19+)
-   * 2. cgroup v2: /sys/fs/cgroup/memory.max
-   * 3. cgroup v1: /sys/fs/cgroup/memory/memory.limit_in_bytes
+   * Detect container memory limit using 4-strategy fallback:
+   * 1. process.constrainedMemory() (Node 19+, reads the real cgroup internally)
+   * 2. Nested cgroup path from /proc/self/cgroup (Galaxy, nested Docker/K8s)
+   * 3. Root cgroup v2: /sys/fs/cgroup/memory.max
+   * 4. Root cgroup v1: /sys/fs/cgroup/memory/memory.limit_in_bytes
    * @returns {Promise<number|null>} Memory limit in bytes, or null if not constrained
    * @private
    */
   async _detectMemoryLimit() {
     // Strategy 1: process.constrainedMemory() (Node 19+)
+    // Node internally resolves the correct cgroup path, so this is the best source
     try {
       if (typeof process.constrainedMemory === 'function') {
         const limit = process.constrainedMemory();
-        if (limit && limit > 0) return limit;
+        if (limit && limit > 0 && limit < Number.MAX_SAFE_INTEGER) return limit;
       }
     } catch (e) { /* ignore */ }
 
-    // Strategy 2: cgroup v2
+    // Strategy 2: Use resolved cgroup base path (handles nested hierarchies)
+    if (this._cgroupBase) {
+      try {
+        if (this._cgroupBase.version === 2) {
+          const data = await fs.readFile(`${this._cgroupBase.basePath}/memory.max`, 'utf8');
+          const trimmed = data.trim();
+          if (trimmed !== 'max') {
+            const limit = parseInt(trimmed, 10);
+            if (!isNaN(limit) && limit > 0) return limit;
+          }
+        } else {
+          const data = await fs.readFile(`${this._cgroupBase.basePath}/memory.limit_in_bytes`, 'utf8');
+          const limit = parseInt(data.trim(), 10);
+          if (!isNaN(limit) && limit > 0 && limit < CGROUP_V1_UNLIMITED) return limit;
+        }
+      } catch (e) {
+        this._log(`Nested cgroup memory limit not readable: ${e.message}`);
+      }
+    }
+
+    // Strategy 3: Root cgroup v2 fallback
     try {
       const data = await fs.readFile('/sys/fs/cgroup/memory.max', 'utf8');
       const trimmed = data.trim();
@@ -724,26 +788,40 @@ export default class SystemMetricsCollector {
       }
     } catch (e) { /* file doesn't exist — not cgroup v2 */ }
 
-    // Strategy 3: cgroup v1
+    // Strategy 4: Root cgroup v1 fallback
     try {
       const data = await fs.readFile('/sys/fs/cgroup/memory/memory.limit_in_bytes', 'utf8');
       const limit = parseInt(data.trim(), 10);
-      // cgroup v1 sets a sentinel >= 2^62 when no limit is configured
       if (!isNaN(limit) && limit > 0 && limit < CGROUP_V1_UNLIMITED) return limit;
     } catch (e) { /* file doesn't exist — not cgroup v1 */ }
 
     return null;
   }
 
   /**
-   * Detect container CPU quota using 2-strategy fallback:
-   * 1. cgroup v2: /sys/fs/cgroup/cpu.max ("quota period")
-   * 2. cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us
+   * Detect container CPU quota using 4-strategy fallback:
+   * 1. Nested cgroup v2: {basePath}/cpu.max
+   * 2. Root cgroup v2: /sys/fs/cgroup/cpu.max
+   * 3. Nested cgroup v1: /sys/fs/cgroup/cpu/{relPath}/cpu.cfs_quota_us (v1 has per-controller dirs)
+   * 4. Root cgroup v1: /sys/fs/cgroup/cpu/cpu.cfs_quota_us
    * @returns {Promise<number|null>} Effective CPU count (float), or null if no quota
    * @private
    */
   async _detectCpuQuota() {
-    // Strategy 1: cgroup v2
+    // Strategy 1: Nested cgroup v2 path
+    if (this._cgroupBase && this._cgroupBase.version === 2) {
+      try {
+        const data = await fs.readFile(`${this._cgroupBase.basePath}/cpu.max`, 'utf8');
+        const [quotaStr, periodStr] = data.trim().split(/\s+/);
+        if (quotaStr !== 'max') {
+          const quota = parseInt(quotaStr, 10);
+          const period = parseInt(periodStr, 10);
+          if (quota > 0 && period > 0) return quota / period;
+        }
+      } catch (e) { /* not available at nested path */ }
+    }
+
+    // Strategy 2: Root cgroup v2
     try {
       const data = await fs.readFile('/sys/fs/cgroup/cpu.max', 'utf8');
       const parts = data.trim().split(/\s+/);
@@ -754,7 +832,23 @@ export default class SystemMetricsCollector {
       }
     } catch (e) { /* not cgroup v2 */ }
 
-    // Strategy 2: cgroup v1
+    // Strategy 3: Nested cgroup v1 (cpu controller has its own mount)
+    if (this._cgroupBase && this._cgroupBase.version === 1) {
+      try {
+        // v1 memory basePath is /sys/fs/cgroup/memory/{relPath}
+        // cpu controller is at /sys/fs/cgroup/cpu/{relPath}
+        const cpuBase = this._cgroupBase.basePath.replace('/cgroup/memory', '/cgroup/cpu');
+        const [quotaData, periodData] = await Promise.all([
+          fs.readFile(`${cpuBase}/cpu.cfs_quota_us`, 'utf8'),
+          fs.readFile(`${cpuBase}/cpu.cfs_period_us`, 'utf8'),
+        ]);
+        const quota = parseInt(quotaData.trim(), 10);
+        const period = parseInt(periodData.trim(), 10);
+        if (quota > 0 && period > 0) return quota / period;
+      } catch (e) { /* not available at nested path */ }
+    }
+
+    // Strategy 4: Root cgroup v1
     try {
       const [quotaData, periodData] = await Promise.all([
         fs.readFile('/sys/fs/cgroup/cpu/cpu.cfs_quota_us', 'utf8'),
@@ -811,17 +905,32 @@ export default class SystemMetricsCollector {
 
   /**
    * Detect which cgroup memory usage file exists (called once, result cached).
+   * Prioritizes the nested cgroup path (for Galaxy/nested Docker) over root paths.
    * @returns {string|null}
    * @private
    */
   _detectCgroupMemUsagePath() {
-    const paths = [
-      '/sys/fs/cgroup/memory.current',          // cgroup v2
-      '/sys/fs/cgroup/memory/memory.usage_in_bytes', // cgroup v1
-    ];
+    const paths = [];
+
+    // Prioritize nested cgroup path if resolved
+    if (this._cgroupBase) {
+      if (this._cgroupBase.version === 2) {
+        paths.push(`${this._cgroupBase.basePath}/memory.current`);
+      } else {
+        paths.push(`${this._cgroupBase.basePath}/memory.usage_in_bytes`);
+      }
+    }
+
+    // Fallback to root cgroup paths
+    paths.push(
+      '/sys/fs/cgroup/memory.current',               // root cgroup v2
+      '/sys/fs/cgroup/memory/memory.usage_in_bytes',  // root cgroup v1
+    );
+
+    const fsSync = require('fs');
     for (const p of paths) {
       try {
-        require('fs').accessSync(p);
+        fsSync.accessSync(p);
         return p;
       } catch (e) { /* not available */ }
     }