Skip to content

Commit e44ba4a

Browse files
committed
fix container metrics for hosting envs with nested cgroup hierarchies | add missing uptime monitor metric
1 parent 471e99b commit e44ba4a

3 files changed

Lines changed: 136 additions & 21 deletions

File tree

.versions

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ reactive-var@1.0.13
4646
reload@1.3.2
4747
retry@1.1.1
4848
routepolicy@1.1.2
49-
skysignal:agent@1.0.19
49+
skysignal:agent@1.0.20
5050
socket-stream-client@0.6.1
5151
tracker@1.3.4
5252
typescript@5.9.3

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,12 @@ Main agent singleton instance.
854854

855855
## Changelog
856856

857+
### v1.0.21 (Nested Cgroup Fix & Uptime Metric)
858+
859+
- **Fix container metrics on Galaxy and nested cgroup hierarchies** - The cgroup detection in v1.0.18/v1.0.20 hardcoded root paths (`/sys/fs/cgroup/memory.max`, `/sys/fs/cgroup/cpu.max`). On Galaxy and other platforms that use nested cgroup hierarchies (e.g., `/sys/fs/cgroup/kubepods.slice/kubepods-pod123.slice/...`), the root files return the parent slice limit (often 512 MB or unlimited) instead of the per-container limit (e.g., 2 GB on Galaxy "Double" plan). This caused SkySignal to report 512 MB / 92% (Critical) when the real container had 2 GB at ~23% usage. Added `_getCgroupBase()` which parses `/proc/self/cgroup` to resolve the actual cgroup path for the current process, handling both cgroup v2 (`0::/` lines) and cgroup v1 (`:memory:` controller lines). All four cgroup detection methods (`_detectMemoryLimit`, `_detectCpuQuota`, `_getContainerMemoryUsage`, `_detectCgroupMemUsagePath`) now try the resolved nested path first, then fall back to root paths for simple container setups. This is the same technique used by cAdvisor, Kubernetes metrics-server, and Galaxy's own dashboard.
860+
- **New `uptime` metric field** - Now collects `process.uptime()` (seconds since the Node.js process started) each collection cycle. Previously the System tab showed "Uptime: 0m" because this field was never sent by the agent.
861+
- **`process.constrainedMemory()` safety check** - Added `limit < Number.MAX_SAFE_INTEGER` guard to the Node 19+ `constrainedMemory()` strategy, preventing false positives when the function returns a sentinel value indicating no cgroup limit.
862+
857863
### v1.0.20 (Publication Context & Observer Leak Detection & Container-Aware Metrics)
858864

859865
- **Container memory detection** - When the agent runs inside a Docker container (e.g., Meteor Galaxy), `os.totalmem()` / `os.freemem()` report host machine values, not container limits. The agent now detects cgroup memory limits via a 3-strategy fallback: `process.constrainedMemory()` (Node 19+), cgroup v2 (`/sys/fs/cgroup/memory.max`), cgroup v1 (`/sys/fs/cgroup/memory/memory.limit_in_bytes`). When a limit is found, `memoryTotal`, `memoryUsed`, `memoryFree`, and `memoryUsage` report container-level values instead of host-level values.

lib/collectors/SystemMetricsCollector.js

Lines changed: 129 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import v8 from "v8";
1010
const execAsync = promisify(exec);
1111

1212
// Agent version - must be updated alongside package.js on each release
13-
const AGENT_VERSION = '1.0.20';
13+
const AGENT_VERSION = '1.0.21';
1414

1515
// cgroup v1 "unlimited" sentinel: values >= 2^62 mean no limit is set
1616
const CGROUP_V1_UNLIMITED = 2 ** 62;
@@ -273,6 +273,7 @@ export default class SystemMetricsCollector {
273273
...diskStats,
274274
...networkStats,
275275
processCount,
276+
uptime: Math.floor(process.uptime()), // Process uptime in seconds
276277
appVersion: this.appVersion,
277278
buildHash: this.buildHash,
278279
meteorVersion,
@@ -669,6 +670,43 @@ export default class SystemMetricsCollector {
669670
}
670671
}
671672

673+
/**
674+
* Get the container's cgroup base path by parsing /proc/self/cgroup.
675+
* Works on both cgroup v1 and v2, including nested setups (e.g. Galaxy).
676+
* @returns {Promise<{version: number, basePath: string}|null>}
677+
* @private
678+
*/
679+
async _getCgroupBase() {
680+
if (this.platform !== 'linux') return null;
681+
682+
try {
683+
const data = await fs.readFile('/proc/self/cgroup', 'utf8');
684+
685+
// cgroup v2: line starts with "0::/"
686+
const v2Line = data.split('\n').find(l => l.startsWith('0::/'));
687+
if (v2Line) {
688+
const relPath = v2Line.split('::')[1].trim() || '/';
689+
const base = relPath === '/'
690+
? '/sys/fs/cgroup'
691+
: `/sys/fs/cgroup${relPath}`;
692+
return { version: 2, basePath: base };
693+
}
694+
695+
// cgroup v1: look for memory controller line
696+
const memLine = data.split('\n').find(l => l.includes(':memory:'));
697+
if (memLine) {
698+
const relPath = memLine.split(':')[2].trim() || '/';
699+
const base = relPath === '/'
700+
? '/sys/fs/cgroup/memory'
701+
: `/sys/fs/cgroup/memory${relPath}`;
702+
return { version: 1, basePath: base };
703+
}
704+
} catch (e) {
705+
this._log('Failed to read /proc/self/cgroup');
706+
}
707+
return null;
708+
}
709+
672710
/**
673711
* Detect container limits at startup (run once, cache results).
674712
* Sets this._containerLimits = { isContainerized, memoryLimit, cpuQuota }
@@ -681,6 +719,9 @@ export default class SystemMetricsCollector {
681719
if (this.platform !== 'linux') return;
682720

683721
try {
722+
// Resolve the real cgroup base path first (handles nested hierarchies like Galaxy)
723+
this._cgroupBase = await this._getCgroupBase();
724+
684725
const memoryLimit = await this._detectMemoryLimit();
685726
const cpuQuota = await this._detectCpuQuota();
686727

@@ -690,31 +731,54 @@ export default class SystemMetricsCollector {
690731
memoryLimit,
691732
cpuQuota, // may be null if no CPU quota is set
692733
};
693-
this._log(`Container detected: memory=${memoryLimit}, cpuQuota=${cpuQuota}`);
734+
this._log(`Container detected: memory=${memoryLimit}, cpuQuota=${cpuQuota}` +
735+
(this._cgroupBase ? `, cgroupBase=${this._cgroupBase.basePath}` : ''));
694736
}
695737
} catch (e) {
696738
this._log('Container detection failed:', e.message);
697739
}
698740
}
699741

700742
/**
701-
* Detect container memory limit using 3-strategy fallback:
702-
* 1. process.constrainedMemory() (Node 19+)
703-
* 2. cgroup v2: /sys/fs/cgroup/memory.max
704-
* 3. cgroup v1: /sys/fs/cgroup/memory/memory.limit_in_bytes
743+
* Detect container memory limit using 4-strategy fallback:
744+
* 1. process.constrainedMemory() (Node 19+, reads the real cgroup internally)
745+
* 2. Nested cgroup path from /proc/self/cgroup (Galaxy, nested Docker/K8s)
746+
* 3. Root cgroup v2: /sys/fs/cgroup/memory.max
747+
* 4. Root cgroup v1: /sys/fs/cgroup/memory/memory.limit_in_bytes
705748
* @returns {Promise<number|null>} Memory limit in bytes, or null if not constrained
706749
* @private
707750
*/
708751
async _detectMemoryLimit() {
709752
// Strategy 1: process.constrainedMemory() (Node 19+)
753+
// Node internally resolves the correct cgroup path, so this is the best source
710754
try {
711755
if (typeof process.constrainedMemory === 'function') {
712756
const limit = process.constrainedMemory();
713-
if (limit && limit > 0) return limit;
757+
if (limit && limit > 0 && limit < Number.MAX_SAFE_INTEGER) return limit;
714758
}
715759
} catch (e) { /* ignore */ }
716760

717-
// Strategy 2: cgroup v2
761+
// Strategy 2: Use resolved cgroup base path (handles nested hierarchies)
762+
if (this._cgroupBase) {
763+
try {
764+
if (this._cgroupBase.version === 2) {
765+
const data = await fs.readFile(`${this._cgroupBase.basePath}/memory.max`, 'utf8');
766+
const trimmed = data.trim();
767+
if (trimmed !== 'max') {
768+
const limit = parseInt(trimmed, 10);
769+
if (!isNaN(limit) && limit > 0) return limit;
770+
}
771+
} else {
772+
const data = await fs.readFile(`${this._cgroupBase.basePath}/memory.limit_in_bytes`, 'utf8');
773+
const limit = parseInt(data.trim(), 10);
774+
if (!isNaN(limit) && limit > 0 && limit < CGROUP_V1_UNLIMITED) return limit;
775+
}
776+
} catch (e) {
777+
this._log(`Nested cgroup memory limit not readable: ${e.message}`);
778+
}
779+
}
780+
781+
// Strategy 3: Root cgroup v2 fallback
718782
try {
719783
const data = await fs.readFile('/sys/fs/cgroup/memory.max', 'utf8');
720784
const trimmed = data.trim();
@@ -724,26 +788,40 @@ export default class SystemMetricsCollector {
724788
}
725789
} catch (e) { /* file doesn't exist — not cgroup v2 */ }
726790

727-
// Strategy 3: cgroup v1
791+
// Strategy 4: Root cgroup v1 fallback
728792
try {
729793
const data = await fs.readFile('/sys/fs/cgroup/memory/memory.limit_in_bytes', 'utf8');
730794
const limit = parseInt(data.trim(), 10);
731-
// cgroup v1 sets a sentinel >= 2^62 when no limit is configured
732795
if (!isNaN(limit) && limit > 0 && limit < CGROUP_V1_UNLIMITED) return limit;
733796
} catch (e) { /* file doesn't exist — not cgroup v1 */ }
734797

735798
return null;
736799
}
737800

738801
/**
739-
* Detect container CPU quota using 2-strategy fallback:
740-
* 1. cgroup v2: /sys/fs/cgroup/cpu.max ("quota period")
741-
* 2. cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us
802+
* Detect container CPU quota using 4-strategy fallback:
803+
* 1. Nested cgroup v2: {basePath}/cpu.max
804+
* 2. Root cgroup v2: /sys/fs/cgroup/cpu.max
805+
* 3. Nested cgroup v1: {basePath}/../cpu/cpu.cfs_quota_us (v1 has per-controller dirs)
806+
* 4. Root cgroup v1: /sys/fs/cgroup/cpu/cpu.cfs_quota_us
742807
* @returns {Promise<number|null>} Effective CPU count (float), or null if no quota
743808
* @private
744809
*/
745810
async _detectCpuQuota() {
746-
// Strategy 1: cgroup v2
811+
// Strategy 1: Nested cgroup v2 path
812+
if (this._cgroupBase && this._cgroupBase.version === 2) {
813+
try {
814+
const data = await fs.readFile(`${this._cgroupBase.basePath}/cpu.max`, 'utf8');
815+
const [quotaStr, periodStr] = data.trim().split(/\s+/);
816+
if (quotaStr !== 'max') {
817+
const quota = parseInt(quotaStr, 10);
818+
const period = parseInt(periodStr, 10);
819+
if (quota > 0 && period > 0) return quota / period;
820+
}
821+
} catch (e) { /* not available at nested path */ }
822+
}
823+
824+
// Strategy 2: Root cgroup v2
747825
try {
748826
const data = await fs.readFile('/sys/fs/cgroup/cpu.max', 'utf8');
749827
const parts = data.trim().split(/\s+/);
@@ -754,7 +832,23 @@ export default class SystemMetricsCollector {
754832
}
755833
} catch (e) { /* not cgroup v2 */ }
756834

757-
// Strategy 2: cgroup v1
835+
// Strategy 3: Nested cgroup v1 (cpu controller has its own mount)
836+
if (this._cgroupBase && this._cgroupBase.version === 1) {
837+
try {
838+
// v1 memory basePath is /sys/fs/cgroup/memory/{relPath}
839+
// cpu controller is at /sys/fs/cgroup/cpu/{relPath}
840+
const cpuBase = this._cgroupBase.basePath.replace('/cgroup/memory', '/cgroup/cpu');
841+
const [quotaData, periodData] = await Promise.all([
842+
fs.readFile(`${cpuBase}/cpu.cfs_quota_us`, 'utf8'),
843+
fs.readFile(`${cpuBase}/cpu.cfs_period_us`, 'utf8'),
844+
]);
845+
const quota = parseInt(quotaData.trim(), 10);
846+
const period = parseInt(periodData.trim(), 10);
847+
if (quota > 0 && period > 0) return quota / period;
848+
} catch (e) { /* not available at nested path */ }
849+
}
850+
851+
// Strategy 4: Root cgroup v1
758852
try {
759853
const [quotaData, periodData] = await Promise.all([
760854
fs.readFile('/sys/fs/cgroup/cpu/cpu.cfs_quota_us', 'utf8'),
@@ -811,17 +905,32 @@ export default class SystemMetricsCollector {
811905

812906
/**
813907
* Detect which cgroup memory usage file exists (called once, result cached).
908+
* Prioritizes the nested cgroup path (for Galaxy/nested Docker) over root paths.
814909
* @returns {string|null}
815910
* @private
816911
*/
817912
_detectCgroupMemUsagePath() {
818-
const paths = [
819-
'/sys/fs/cgroup/memory.current', // cgroup v2
820-
'/sys/fs/cgroup/memory/memory.usage_in_bytes', // cgroup v1
821-
];
913+
const paths = [];
914+
915+
// Prioritize nested cgroup path if resolved
916+
if (this._cgroupBase) {
917+
if (this._cgroupBase.version === 2) {
918+
paths.push(`${this._cgroupBase.basePath}/memory.current`);
919+
} else {
920+
paths.push(`${this._cgroupBase.basePath}/memory.usage_in_bytes`);
921+
}
922+
}
923+
924+
// Fallback to root cgroup paths
925+
paths.push(
926+
'/sys/fs/cgroup/memory.current', // root cgroup v2
927+
'/sys/fs/cgroup/memory/memory.usage_in_bytes', // root cgroup v1
928+
);
929+
930+
const fsSync = require('fs');
822931
for (const p of paths) {
823932
try {
824-
require('fs').accessSync(p);
933+
fsSync.accessSync(p);
825934
return p;
826935
} catch (e) { /* not available */ }
827936
}

0 commit comments

Comments
 (0)