fix(docker-stats,podman-stats): restore per-container CPU and memory perfdata (fix #1104)

markuslf · markuslf · commit 64aa4f73ae0d · 2026-05-12T09:51:26.000+02:00
v2026041002 replaced the per-container cpu_usage / mem_usage perfdata
with aggregates ('containers_running', 'cpu' for docker; plus
'block_input', 'block_output', 'images', 'net_rx', 'net_tx', 'ram'
for podman) on the rationale that container names come and go and
bloat the time-series backend. That broke the long-term trending of
individual workloads, which is the primary use case for these checks.

Re-emit <container>_cpu_usage and <container>_mem_usage per running
container alongside the aggregates. Names are still shortened via
shorten() unless --full-name is passed, matching v2025022501
semantics.

Extend the unit-test assertions to pin the per-container perfdata
labels so this can't regress silently again.

Bump __version__ to 2026051201.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -85,6 +85,11 @@ Grafana:
 * `schemaVersion` fixed to `42`; Grafana 12 was failing to import the date-encoded value
 
 
+Monitoring Plugins:
+
+* docker-stats, podman-stats: per-container CPU and memory perfdata restored. The previous release reported only aggregate values, breaking long-term trending of individual containers ([#1104](https://github.com/Linuxfabrik/monitoring-plugins/issues/1104))
+
+
 ### Removed
 
 Monitoring Plugins:
diff --git a/check-plugins/docker-stats/README.md b/check-plugins/docker-stats/README.md
@@ -108,8 +108,12 @@ myconti_ds_1              ! 0.0   ! 11.42
 
 ## Perfdata / Metrics
 
+The plugin emits one CPU and one memory metric per container so individual workloads can be plotted long-term. Because container names appear and disappear as workloads come and go, the time-series backend (Graphite, InfluxDB, ...) will keep stale entries until they are pruned.
+
 | Name | Type | Description |
 |----|----|----|
+| `<container>_cpu_usage` | Percentage | Per-container CPU usage, normalized by host CPU count. |
+| `<container>_mem_usage` | Percentage | Per-container memory usage, relative to the container memory limit or host memory. |
 | containers_running | Number | Number of running containers. |
 | cpu | Number | Number of host CPUs. |
 
diff --git a/check-plugins/docker-stats/docker-stats b/check-plugins/docker-stats/docker-stats
@@ -22,7 +22,7 @@ import lib.txt
 from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN
 
 __author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
-__version__ = '2026041002'
+__version__ = '2026051201'
 
 DESCRIPTION = """Reports CPU and memory usage for all running Docker containers. CPU usage is
 normalized by dividing by the number of available host CPU cores. CPU alerts only
@@ -220,6 +220,26 @@ def main():
         cpu_usage = round(float(cpu_percent.replace('%', '').strip()) / host_cpus, 1)
         mem_usage = round(float(mem_percent.replace('%', '').strip()), 1)
 
+        # per-container perfdata for long-term trending of individual workloads
+        perfdata += lib.base.get_perfdata(
+            f'{name}_cpu_usage',
+            cpu_usage,
+            uom='%',
+            warn=args.WARN_CPU,
+            crit=args.CRIT_CPU,
+            _min=0,
+            _max=100,
+        )
+        perfdata += lib.base.get_perfdata(
+            f'{name}_mem_usage',
+            mem_usage,
+            uom='%',
+            warn=args.WARN_MEM,
+            crit=args.CRIT_MEM,
+            _min=0,
+            _max=100,
+        )
+
         # save trend data to local sqlite database, limited to "count" rows max.
         lib.base.coe(
             lib.db_sqlite.insert(
diff --git a/check-plugins/docker-stats/unit-test/run b/check-plugins/docker-stats/unit-test/run
@@ -33,6 +33,9 @@ TESTS = [
             'Container         ! CPU % ! Mem %',
             '------------------+-------+------',
             'traefik_traefik.2 ! 0.0   ! 0.0',
+            # per-container perfdata (https://github.com/Linuxfabrik/monitoring-plugins/issues/1104)
+            "'traefik_traefik.2_cpu_usage'=0.0%",
+            "'traefik_traefik.2_mem_usage'=0.0%",
         ],
     },
     {
@@ -45,6 +48,8 @@ TESTS = [
             'Container                                   ! CPU % ! Mem %',
             '--------------------------------------------+-------+------',
             'traefik_traefik.2.1idw12p2yqpxutlzkcwign4at ! 0.0   ! 0.0',
+            "'traefik_traefik.2.1idw12p2yqpxutlzkcwign4at_cpu_usage'=0.0%",
+            "'traefik_traefik.2.1idw12p2yqpxutlzkcwign4at_mem_usage'=0.0%",
         ],
     },
     {
@@ -58,6 +63,12 @@ TESTS = [
             'elasticsearch ! 188.8 ! 16.7',
             'graylog       ! 204.2 ! 5.7',
             'mongo         ! 0.3   ! 1.9',
+            "'elasticsearch_cpu_usage'=188.8%",
+            "'elasticsearch_mem_usage'=16.7%",
+            "'graylog_cpu_usage'=204.2%",
+            "'graylog_mem_usage'=5.7%",
+            "'mongo_cpu_usage'=0.3%",
+            "'mongo_mem_usage'=1.9%",
         ],
     },
     {
@@ -71,6 +82,8 @@ TESTS = [
             'runner-7ayh6h5f-project-107-concurrent-0-37b2c7aee9359db9-build     ! 95.0  ! 1.2',
             'runner-7ayh6h5f-project-19-concurrent-0-99f0211c36d59d01-build      ! 59.5  ! 1.0',
             'runner-7ayh6h5f-project-49-concurrent-0-e180afe41fc754dc-predefined ! 79.5  ! 0.1',
+            "'runner-7ayh6h5f-project-107-concurrent-0-37b2c7aee9359db9-build_cpu_usage'=95.0%",
+            "'runner-7ayh6h5f-project-49-concurrent-0-e180afe41fc754dc-predefined_mem_usage'=0.1%",
         ],
     },
 ]
diff --git a/check-plugins/podman-stats/README.md b/check-plugins/podman-stats/README.md
@@ -109,16 +109,20 @@ myconti_ds_1              ! 0.0   ! 11.42
 
 ## Perfdata / Metrics
 
+The plugin emits one CPU and one memory metric per container so individual workloads can be plotted long-term. Because container names appear and disappear as workloads come and go, the time-series backend (Graphite, InfluxDB, ...) will keep stale entries until they are pruned.
+
 | Name | Type | Description |
 |----|----|----|
-| block_input        | Bytes  | Total data read from block device across all containers.  |
-| block_output       | Bytes  | Total data written to block device across all containers. |
-| containers_running | Number | Number of running containers.                             |
-| cpu                | Number | Number of host CPUs.                                      |
-| images             | Number | Number of images.                                         |
-| net_rx             | Bytes  | Total network bytes received across all containers.       |
-| net_tx             | Bytes  | Total network bytes transmitted across all containers.    |
-| ram                | Bytes  | Total host memory.                                        |
+| `<container>_cpu_usage` | Percentage | Per-container CPU usage, normalized by host CPU count.                            |
+| `<container>_mem_usage` | Percentage | Per-container memory usage, relative to the container memory limit or host memory. |
+| block_input             | Bytes      | Total data read from block device across all containers.                          |
+| block_output            | Bytes      | Total data written to block device across all containers.                         |
+| containers_running      | Number     | Number of running containers.                                                     |
+| cpu                     | Number     | Number of host CPUs.                                                              |
+| images                  | Number     | Number of images.                                                                 |
+| net_rx                  | Bytes      | Total network bytes received across all containers.                               |
+| net_tx                  | Bytes      | Total network bytes transmitted across all containers.                            |
+| ram                     | Bytes      | Total host memory.                                                                |
 
 
 ## Credits, License
diff --git a/check-plugins/podman-stats/podman-stats b/check-plugins/podman-stats/podman-stats
@@ -22,7 +22,7 @@ import lib.shell
 from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN
 
 __author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
-__version__ = '2026041002'
+__version__ = '2026051201'
 
 DESCRIPTION = """Reports CPU and memory usage for all running Podman containers. CPU usage is
 normalized by dividing by the number of available host CPU cores. CPU alerts only
@@ -232,6 +232,26 @@ def main():
         cpu_usage = round(float(container.get('CPU', 0)) / host_cpus, 1)
         mem_usage = round(float(container.get('MemPerc', 0)), 1)
 
+        # per-container perfdata for long-term trending of individual workloads
+        perfdata += lib.base.get_perfdata(
+            f'{name}_cpu_usage',
+            cpu_usage,
+            uom='%',
+            warn=args.WARN_CPU,
+            crit=args.CRIT_CPU,
+            _min=0,
+            _max=100,
+        )
+        perfdata += lib.base.get_perfdata(
+            f'{name}_mem_usage',
+            mem_usage,
+            uom='%',
+            warn=args.WARN_MEM,
+            crit=args.CRIT_MEM,
+            _min=0,
+            _max=100,
+        )
+
         # accumulate totals for aggregate perfdata
         total_block_input += int(container.get('BlockInput', 0))
         total_block_output += int(container.get('BlockOutput', 0))
diff --git a/check-plugins/podman-stats/unit-test/run b/check-plugins/podman-stats/unit-test/run
@@ -33,6 +33,9 @@ TESTS = [
             'Container         ! CPU % ! Mem %',
             '------------------+-------+------',
             'traefik_traefik.2 ! 0.0   ! 0.0',
+            # per-container perfdata (https://github.com/Linuxfabrik/monitoring-plugins/issues/1104)
+            "'traefik_traefik.2_cpu_usage'=0.0%",
+            "'traefik_traefik.2_mem_usage'=0.0%",
         ],
     },
     {
@@ -45,6 +48,8 @@ TESTS = [
             'Container                                   ! CPU % ! Mem %',
             '--------------------------------------------+-------+------',
             'traefik_traefik.2.1idw12p2yqpxutlzkcwign4at ! 0.0   ! 0.0',
+            "'traefik_traefik.2.1idw12p2yqpxutlzkcwign4at_cpu_usage'=0.0%",
+            "'traefik_traefik.2.1idw12p2yqpxutlzkcwign4at_mem_usage'=0.0%",
         ],
     },
     {
@@ -58,6 +63,12 @@ TESTS = [
             'elasticsearch ! 188.8 ! 16.7',
             'graylog       ! 204.2 ! 5.7',
             'mongo         ! 0.3   ! 1.9',
+            "'elasticsearch_cpu_usage'=188.8%",
+            "'elasticsearch_mem_usage'=16.7%",
+            "'graylog_cpu_usage'=204.2%",
+            "'graylog_mem_usage'=5.7%",
+            "'mongo_cpu_usage'=0.3%",
+            "'mongo_mem_usage'=1.9%",
         ],
     },
     {
@@ -71,6 +82,8 @@ TESTS = [
             'runner-7ayh6h5f-project-107-concurrent-0-37b2c7aee9359db9-build     ! 95.0  ! 1.2',
             'runner-7ayh6h5f-project-19-concurrent-0-99f0211c36d59d01-build      ! 59.5  ! 1.0',
             'runner-7ayh6h5f-project-49-concurrent-0-e180afe41fc754dc-predefined ! 79.5  ! 0.1',
+            "'runner-7ayh6h5f-project-107-concurrent-0-37b2c7aee9359db9-build_cpu_usage'=95.0%",
+            "'runner-7ayh6h5f-project-49-concurrent-0-e180afe41fc754dc-predefined_mem_usage'=0.1%",
         ],
     },
 ]