Merge pull request #23 from Redislabs-Solution-Architects/GTI-608/fix-replica-only-node-role

ymendez-redis · web-flow · commit 76aa8069ea2d · 2026-04-24T09:16:29.000-06:00
fix: use replication/role metric for authoritative node role detection (GTI-608)
diff --git a/memorystore.py b/memorystore.py
@@ -20,6 +20,7 @@
   --step 60           # alignment step in seconds for rate metrics (default 60)
 
 """
+
 import argparse
 import csv
 import os
@@ -47,6 +48,7 @@
     "commands": "redis.googleapis.com/commands/calls",
     "memory_usage": "redis.googleapis.com/stats/memory/usage",
     "max_memory": "redis.googleapis.com/stats/memory/maxmemory",
+    "replication_role": "redis.googleapis.com/replication/role",
 }
 # Valkey (Memorystore for Valkey) - use node-level for commands & usage; instance-level for size.
 VALKEY_METRICS = {
@@ -81,6 +83,14 @@ def _pick(labels: Dict[str, str], keys) -> Optional[str]:
     return None
 
 
+def _point_value(point, default=0):
+    """Extract a numeric value from a GCP monitoring point, handling both int64 and double types."""
+    try:
+        return point.value.int64_value or point.value.double_value
+    except Exception:
+        return default
+
+
 def _time_interval(duration_sec: int) -> monitoring_v3.TimeInterval:
     now = time.time()
     seconds = int(now)
@@ -199,16 +209,7 @@ def _accumulate_commands(results, table, product_name: str, project_id: str):
             t = point.interval.start_time.timestamp()
             if t not in entry["points"]:
                 entry["points"][t] = {}
-            # Support both int/double values
-            pv = 0.0
-            try:
-                pv = point.value.double_value
-            except Exception:
-                try:
-                    pv = float(point.value.int64_value)
-                except Exception:
-                    pv = 0.0
-            entry["points"][t][cmd] = pv
+            entry["points"][t][cmd] = float(_point_value(point, default=0.0))
 
 
 def _apply_processed_categories(table):
@@ -244,13 +245,7 @@ def _attach_memory_usage(results, table, key_name="BytesUsedForCache"):
         # take the max usage observed
         maxv = 0
         for point in ts.points:
-            try:
-                v = int(point.value.int64_value)
-            except Exception:
-                try:
-                    v = int(point.value.double_value)
-                except Exception:
-                    v = 0
+            v = int(_point_value(point))
             if v > maxv:
                 maxv = v
         prev = entry.get(key_name, 0)
@@ -270,13 +265,7 @@ def _attach_capacity_scalar(results, table, key_name="MaxMemory"):
         )
         v_max = 0
         for point in ts.points:
-            try:
-                v = int(point.value.int64_value)
-            except Exception:
-                try:
-                    v = int(point.value.double_value)
-                except Exception:
-                    v = 0
+            v = int(_point_value(point))
             if v > v_max:
                 v_max = v
         if v_max > cap_by_inst[inst_key]:
@@ -288,6 +277,30 @@ def _attach_capacity_scalar(results, table, key_name="MaxMemory"):
                 nodes[node_id][key_name] = cap_by_inst[inst_key]
 
 
+def _attach_node_role(results, table):
+    """Set NodeRole using the dedicated replication/role metric.
+
+    The 'role' label on commands/calls is incidental metadata, not an
+    authoritative report of node role, and has been observed returning 'replica'
+    for both nodes on Standard Tier instances (GTI-608, ~93 affected clusters).
+
+    replication/role is the GCP-designated metric for this: 1 = primary, 0 = replica.
+    See: https://cloud.google.com/memorystore/docs/redis/supported-monitoring-metrics
+    """
+    for ts in results:
+        rlabels = dict(ts.resource.labels)
+        inst_key = rlabels.get("instance_id") or "unknown"
+        node_id = rlabels.get("node_id") or "unknown"
+        if inst_key not in table or node_id not in table[inst_key]:
+            continue
+
+        if not ts.points:
+            continue
+
+        role_val = int(_point_value(ts.points[0]))
+        table[inst_key][node_id]["NodeRole"] = "Master" if role_val == 1 else "Replica"
+
+
 def _flatten_rows(table, project_id: str, instance_type: str) -> List[Dict[str, Any]]:
     rows = []
     for inst_key, nodes in table.items():
@@ -359,6 +372,16 @@ def collect_for_product(
     except Exception:
         pass
 
+    # Node role (Redis only - uses authoritative replication/role metric)
+    if "replication_role" in metric_map:
+        try:
+            role_results = _list_ts(
+                client, project_name, metric_map["replication_role"], interval
+            )
+            _attach_node_role(role_results, table)
+        except Exception:
+            pass
+
     # Compute command categories
     _apply_processed_categories(table)
 
diff --git a/msstats.py b/msstats.py
@@ -867,7 +867,7 @@ def main():
         help="Duration of the metric window in seconds. Default is 604800 (7 days).",
     )
 
-    (options, _) = parser.parse_args()
+    options, _ = parser.parse_args()
 
     if not os.path.isdir(options.outDir):
         os.makedirs(options.outDir)

Original file line number	Diff line number	Diff line change
`@@ -867,7 +867,7 @@ def main():`
`867`	`867`	`help="Duration of the metric window in seconds. Default is 604800 (7 days).",`
`868`	`868`	`)`
`869`	`869`
`870`		`- (options, _) = parser.parse_args()`
	`870`	`+ options, _ = parser.parse_args()`
`871`	`871`
`872`	`872`	`if not os.path.isdir(options.outDir):`
`873`	`873`	`os.makedirs(options.outDir)`