[fix] Address PR #83 review feedback for metrics exporter

RobotGF · claude · juliusgao · commit be61454e7828 · 2026-04-28T12:53:39.000+08:00
- Use process RSS memory instead of manual tensor size estimation
- Remove redundant _op_counts/_sample_counts (IntervalPerfMonitor already tracks)
- Remove TQ_METRICS_ENABLED env var override; metrics controlled by config only
- Trim non-essential metrics (partition_fields, allocated_samples, storage_fields,
  data_memory, op_total, samples_total) to keep only essential ones
- Fix thread-safety: iterate over dict snapshots in collect methods
- Fix lifecycle docstring (start_metrics, not __init__)
- Use storage unit's self-reported ID as canonical Prometheus label
- Fix test skip reason to reflect actual dependencies
- Remove dashboard panels referencing deleted metrics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/scripts/grafana_dashboard.json b/scripts/grafana_dashboard.json
@@ -209,23 +209,6 @@
       ],
       "type": "timeseries"
     },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "palette-classic" },
-          "custom": { "fillOpacity": 10, "lineWidth": 2 }
-        }
-      },
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 23 },
-      "id": 21,
-      "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
-      "title": "Allocated Sample Slots per Partition",
-      "targets": [
-        { "expr": "tq_partition_allocated_samples", "legendFormat": "{{ partition_id }}" }
-      ],
-      "type": "timeseries"
-    },
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
@@ -349,152 +332,6 @@
         { "expr": "tq_storage_memory_rss_bytes", "legendFormat": "{{ storage_unit_id }}" }
       ],
       "type": "timeseries"
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "palette-classic" },
-          "custom": { "fillOpacity": 10, "lineWidth": 2 },
-          "unit": "bytes"
-        }
-      },
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 54 },
-      "id": 34,
-      "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
-      "title": "Storage Data Memory (Estimated)",
-      "targets": [
-        { "expr": "tq_storage_data_memory_bytes", "legendFormat": "{{ storage_unit_id }}" }
-      ],
-      "type": "timeseries"
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "palette-classic" },
-          "custom": { "fillOpacity": 20, "lineWidth": 2, "stacking": { "mode": "none" } },
-          "unit": "ops"
-        }
-      },
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 62 },
-      "id": 35,
-      "options": { "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
-      "title": "Storage Put/Get/Clear Rate (per second)",
-      "targets": [
-        { "expr": "rate(tq_storage_op_total{op_type=\"PUT_DATA\"}[$__rate_interval])", "legendFormat": "PUT {{ storage_unit_id }}" },
-        { "expr": "rate(tq_storage_op_total{op_type=\"GET_DATA\"}[$__rate_interval])", "legendFormat": "GET {{ storage_unit_id }}" },
-        { "expr": "rate(tq_storage_op_total{op_type=\"CLEAR_DATA\"}[$__rate_interval])", "legendFormat": "CLEAR {{ storage_unit_id }}" }
-      ],
-      "type": "timeseries"
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "palette-classic" },
-          "custom": { "fillOpacity": 10, "lineWidth": 2 }
-        }
-      },
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 62 },
-      "id": 36,
-      "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
-      "title": "Storage Cumulative Operations",
-      "targets": [
-        { "expr": "tq_storage_op_total", "legendFormat": "{{ op_type }} {{ storage_unit_id }}" }
-      ],
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 70 },
-      "id": 104,
-      "title": "Data Lifecycle (Leak Detection)",
-      "type": "row"
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "palette-classic" },
-          "custom": { "fillOpacity": 10, "lineWidth": 2 },
-          "unit": "short"
-        }
-      },
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 71 },
-      "id": 40,
-      "options": { "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
-      "title": "Sample Put / Get / Clear Rate (per second)",
-      "targets": [
-        { "expr": "sum(rate(tq_storage_samples_total{op_type=\"PUT_DATA\"}[$__rate_interval]))", "legendFormat": "put" },
-        { "expr": "sum(rate(tq_storage_samples_total{op_type=\"GET_DATA\"}[$__rate_interval]))", "legendFormat": "get" },
-        { "expr": "sum(rate(tq_storage_samples_total{op_type=\"CLEAR_DATA\"}[$__rate_interval]))", "legendFormat": "clear" }
-      ],
-      "type": "timeseries"
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "description": "put - cleared across all storage units. A steadily rising line indicates samples are being written but never cleared — potential leak. Should stay bounded in healthy RL training.",
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "thresholds" },
-          "thresholds": { "steps": [
-            { "color": "green", "value": null },
-            { "color": "yellow", "value": 5000 },
-            { "color": "red", "value": 10000 }
-          ]},
-          "custom": { "fillOpacity": 15, "lineWidth": 2 },
-          "unit": "short"
-        }
-      },
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 71 },
-      "id": 41,
-      "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
-      "title": "Samples In-Flight (put - cleared) [Leak Indicator]",
-      "targets": [
-        { "expr": "sum(tq_storage_samples_total{op_type=\"PUT_DATA\"}) - sum(tq_storage_samples_total{op_type=\"CLEAR_DATA\"}) or vector(0)", "legendFormat": "in-flight samples" }
-      ],
-      "type": "timeseries"
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "palette-classic" },
-          "custom": { "fillOpacity": 10, "lineWidth": 2 },
-          "unit": "short"
-        }
-      },
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 79 },
-      "id": 42,
-      "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
-      "title": "Cumulative Samples Put vs Cleared (per Storage Unit)",
-      "targets": [
-        { "expr": "tq_storage_samples_total{op_type=\"PUT_DATA\"}", "legendFormat": "put {{ storage_unit_id }}" },
-        { "expr": "tq_storage_samples_total{op_type=\"CLEAR_DATA\"}", "legendFormat": "cleared {{ storage_unit_id }}" }
-      ],
-      "type": "timeseries"
-    },
-    {
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "description": "Ratio of cleared samples to put samples. Healthy value approaches 1.0 over time. A value stuck well below 1.0 means samples accumulate without being reclaimed.",
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "continuous-GrYlRd" },
-          "custom": { "fillOpacity": 15, "lineWidth": 2 },
-          "unit": "percentunit",
-          "min": 0,
-          "max": 1
-        }
-      },
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 79 },
-      "id": 43,
-      "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } },
-      "title": "Clear / Put Ratio (per Storage Unit) [Reclaim Health]",
-      "targets": [
-        { "expr": "tq_storage_samples_total{op_type=\"CLEAR_DATA\"} / on(storage_unit_id) clamp_min(tq_storage_samples_total{op_type=\"PUT_DATA\"}, 1)", "legendFormat": "{{ storage_unit_id }}" }
-      ],
-      "type": "timeseries"
     }
   ],
   "refresh": "10s",
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -28,7 +28,7 @@
 except (ImportError, OSError):
     _HAS_DEPS = False
 
-pytestmark = pytest.mark.skipif(not _HAS_DEPS, reason="torch / CUDA dependencies unavailable")
+pytestmark = pytest.mark.skipif(not _HAS_DEPS, reason="prometheus_client / psutil / pyzmq dependencies unavailable")
 
 
 # ---------------------------------------------------------------------------
@@ -98,8 +98,6 @@ def test_all_metrics_are_registered(self):
             "tq_controller_memory_rss_bytes",
             "tq_partitions_total",
             "tq_partition_samples_total",
-            "tq_partition_fields_total",
-            "tq_partition_allocated_samples",
             "tq_partition_production_progress",
             "tq_partition_consumption_progress",
             "tq_global_index_allocated_total",
@@ -110,9 +108,7 @@ def test_all_metrics_are_registered(self):
             "tq_storage_capacity_total",
             "tq_storage_active_keys_total",
             "tq_storage_utilization_ratio",
-            "tq_storage_fields_total",
             "tq_storage_memory_rss_bytes",
-            "tq_storage_data_memory_bytes",
         ]
 
         registered = {m.name for m in exporter.registry.collect()}
@@ -250,9 +246,7 @@ def test_storage_metrics_populated_on_success(self):
                 "storage_unit_id": "SU_001",
                 "capacity": 1000,
                 "active_keys": 250,
-                "fields_count": 3,
                 "process_rss_bytes": 512 * 1024 * 1024,
-                "data_memory_bytes": 256 * 1024 * 1024,
             }
         )
 
@@ -261,9 +255,7 @@ def test_storage_metrics_populated_on_success(self):
         assert exporter.storage_capacity.labels(storage_unit_id="SU_001")._value.get() == 1000
         assert exporter.storage_active_keys.labels(storage_unit_id="SU_001")._value.get() == 250
         assert exporter.storage_utilization.labels(storage_unit_id="SU_001")._value.get() == 0.25
-        assert exporter.storage_fields.labels(storage_unit_id="SU_001")._value.get() == 3
         assert exporter.storage_memory_rss.labels(storage_unit_id="SU_001")._value.get() == 512 * 1024 * 1024
-        assert exporter.storage_data_memory.labels(storage_unit_id="SU_001")._value.get() == 256 * 1024 * 1024
 
     def test_storage_metrics_handles_query_failure(self):
         """If a storage unit query fails, other units should still be collected."""
@@ -289,7 +281,6 @@ def mock_query(su_info, su_id):
                 "active_keys": 100,
                 "fields_count": 2,
                 "process_rss_bytes": 100 * 1024 * 1024,
-                "data_memory_bytes": 50 * 1024 * 1024,
             }
 
         exporter._query_storage_unit = mock_query
diff --git a/transfer_queue/config.yaml b/transfer_queue/config.yaml
@@ -2,7 +2,6 @@
 # and use transfer_queue.init(conf) to overwrite the config entries.
 
 # Prometheus metrics exporter.
-# Can also be enabled via the TQ_METRICS_ENABLED=true environment variable.
 metrics:
   enabled: false
 
diff --git a/transfer_queue/interface.py b/transfer_queue/interface.py
@@ -431,9 +431,6 @@ def init(conf: Optional[DictConfig] = None) -> Optional[DictConfig]:
 
     # start Prometheus metrics exporter if enabled
     metrics_enabled = final_conf.get("metrics", {}).get("enabled", False)
-    if not metrics_enabled:
-        # Also check environment variable as a convenience override
-        metrics_enabled = os.environ.get("TQ_METRICS_ENABLED", "false").lower() == "true"
     if metrics_enabled:
         metrics_endpoint = ray.get(_TRANSFER_QUEUE_CONTROLLER.start_metrics.remote())
         final_conf.metrics.endpoint = metrics_endpoint
diff --git a/transfer_queue/metrics.py b/transfer_queue/metrics.py
@@ -57,7 +57,7 @@ class TQMetricsExporter:
     SimpleStorageUnit instances (via ZMQ ``GET_METRICS`` requests).
 
     Lifecycle:
-        1. Created by ``TransferQueueController.__init__`` when metrics are enabled.
+        1. Created by ``TransferQueueController.start_metrics()`` when metrics are enabled.
         2. ``start()`` launches the HTTP server and a background collection thread.
         3. The collection thread calls ``collect_controller_metrics`` and
            ``collect_storage_metrics`` every ``TQ_METRICS_COLLECT_INTERVAL`` seconds.
@@ -95,15 +95,6 @@ def _define_metrics(self) -> None:
         self.partition_samples = Gauge(
             "tq_partition_samples_total", "Number of active samples in a partition", ["partition_id"], registry=r
         )
-        self.partition_fields = Gauge(
-            "tq_partition_fields_total", "Number of fields in a partition", ["partition_id"], registry=r
-        )
-        self.partition_allocated_samples = Gauge(
-            "tq_partition_allocated_samples",
-            "Number of allocated sample slots in a partition",
-            ["partition_id"],
-            registry=r,
-        )
         self.partition_production_progress = Gauge(
             "tq_partition_production_progress",
             "Production progress ratio (0.0-1.0)",
@@ -159,30 +150,9 @@ def _define_metrics(self) -> None:
             ["storage_unit_id"],
             registry=r,
         )
-        self.storage_fields = Gauge(
-            "tq_storage_fields_total", "Number of fields in storage unit", ["storage_unit_id"], registry=r
-        )
         self.storage_memory_rss = Gauge(
             "tq_storage_memory_rss_bytes", "Storage unit process RSS memory", ["storage_unit_id"], registry=r
         )
-        self.storage_data_memory = Gauge(
-            "tq_storage_data_memory_bytes",
-            "Estimated data memory in storage unit",
-            ["storage_unit_id"],
-            registry=r,
-        )
-        self.storage_op_total = Gauge(
-            "tq_storage_op_total",
-            "Cumulative operation count on storage unit",
-            ["storage_unit_id", "op_type"],
-            registry=r,
-        )
-        self.storage_samples_total = Gauge(
-            "tq_storage_samples_total",
-            "Cumulative number of samples processed by storage unit",
-            ["storage_unit_id", "op_type"],
-            registry=r,
-        )
 
     @contextmanager
     def measure(self, op_type: str):
@@ -220,16 +190,16 @@ def collect_controller_metrics(self) -> None:
         except Exception:
             pass
 
-        # Partition-level
-        current_pids = set(ctrl.partitions.keys())
+        # Partition-level — iterate over a snapshot to avoid RuntimeError
+        # if partitions dict is mutated concurrently.
+        partitions_snapshot = list(ctrl.partitions.items())
+        current_pids = {pid for pid, _ in partitions_snapshot}
         current_consumption_labels: set[tuple[str, str]] = set()
         self.partitions_total.set(len(current_pids))
 
-        for pid, partition in ctrl.partitions.items():
+        for pid, partition in partitions_snapshot:
             stats = partition.get_statistics()
             self.partition_samples.labels(partition_id=pid).set(stats["total_samples_num"])
-            self.partition_fields.labels(partition_id=pid).set(stats["total_fields_num"])
-            self.partition_allocated_samples.labels(partition_id=pid).set(stats["allocated_samples_num"])
             self.partition_production_progress.labels(partition_id=pid).set(stats.get("production_progress", 0))
 
             for task_name, cstats in stats.get("consumption_statistics", {}).items():
@@ -242,8 +212,6 @@ def collect_controller_metrics(self) -> None:
         for stale_pid in self._known_partition_ids - current_pids:
             for metric in (
                 self.partition_samples,
-                self.partition_fields,
-                self.partition_allocated_samples,
                 self.partition_production_progress,
             ):
                 try:
@@ -267,25 +235,24 @@ def collect_storage_metrics(self) -> None:
         if not self._storage_unit_infos:
             return
 
-        for su_id, su_info in self._storage_unit_infos.items():
+        # Iterate over a snapshot to avoid RuntimeError from concurrent mutation.
+        storage_snapshot = list(self._storage_unit_infos.items())
+        for su_id, su_info in storage_snapshot:
             try:
                 metrics = self._query_storage_unit(su_info, su_id)
                 if metrics is None:
                     continue
+                # Use the storage unit's own ID from the response as the
+                # canonical label to keep dashboard labels consistent with logs.
+                label = metrics.get("storage_unit_id", su_id)
                 capacity = metrics.get("capacity", 0)
                 active = metrics.get("active_keys", 0)
-                self.storage_capacity.labels(storage_unit_id=su_id).set(capacity)
-                self.storage_active_keys.labels(storage_unit_id=su_id).set(active)
-                self.storage_utilization.labels(storage_unit_id=su_id).set(
+                self.storage_capacity.labels(storage_unit_id=label).set(capacity)
+                self.storage_active_keys.labels(storage_unit_id=label).set(active)
+                self.storage_utilization.labels(storage_unit_id=label).set(
                     active / capacity if capacity > 0 else 0.0
                 )
-                self.storage_fields.labels(storage_unit_id=su_id).set(metrics.get("fields_count", 0))
-                self.storage_memory_rss.labels(storage_unit_id=su_id).set(metrics.get("process_rss_bytes", 0))
-                self.storage_data_memory.labels(storage_unit_id=su_id).set(metrics.get("data_memory_bytes", 0))
-                for op_type, count in metrics.get("op_counts", {}).items():
-                    self.storage_op_total.labels(storage_unit_id=su_id, op_type=op_type).set(count)
-                for op_type, count in metrics.get("sample_counts", {}).items():
-                    self.storage_samples_total.labels(storage_unit_id=su_id, op_type=op_type).set(count)
+                self.storage_memory_rss.labels(storage_unit_id=label).set(metrics.get("process_rss_bytes", 0))
             except Exception as e:
                 logger.warning(f"Failed to collect metrics from storage unit {su_id}: {e}")
 
diff --git a/transfer_queue/storage/simple_backend.py b/transfer_queue/storage/simple_backend.py