Skip to content

Commit 76aa806

Browse files
Merge pull request #23 from Redislabs-Solution-Architects/GTI-608/fix-replica-only-node-role
fix: use replication/role metric for authoritative node role detection (GTI-608)
2 parents 071e5b5 + 30aa16d commit 76aa806

2 files changed

Lines changed: 48 additions & 25 deletions

File tree

memorystore.py

Lines changed: 47 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
--step 60 # alignment step in seconds for rate metrics (default 60)
2121
2222
"""
23+
2324
import argparse
2425
import csv
2526
import os
@@ -47,6 +48,7 @@
4748
"commands": "redis.googleapis.com/commands/calls",
4849
"memory_usage": "redis.googleapis.com/stats/memory/usage",
4950
"max_memory": "redis.googleapis.com/stats/memory/maxmemory",
51+
"replication_role": "redis.googleapis.com/replication/role",
5052
}
5153
# Valkey (Memorystore for Valkey) - use node-level for commands & usage; instance-level for size.
5254
VALKEY_METRICS = {
@@ -81,6 +83,14 @@ def _pick(labels: Dict[str, str], keys) -> Optional[str]:
8183
return None
8284

8385

86+
def _point_value(point, default=0):
87+
"""Extract a numeric value from a GCP monitoring point, handling both int64 and double types."""
88+
try:
89+
return point.value.int64_value or point.value.double_value
90+
except Exception:
91+
return default
92+
93+
8494
def _time_interval(duration_sec: int) -> monitoring_v3.TimeInterval:
8595
now = time.time()
8696
seconds = int(now)
@@ -199,16 +209,7 @@ def _accumulate_commands(results, table, product_name: str, project_id: str):
199209
t = point.interval.start_time.timestamp()
200210
if t not in entry["points"]:
201211
entry["points"][t] = {}
202-
# Support both int/double values
203-
pv = 0.0
204-
try:
205-
pv = point.value.double_value
206-
except Exception:
207-
try:
208-
pv = float(point.value.int64_value)
209-
except Exception:
210-
pv = 0.0
211-
entry["points"][t][cmd] = pv
212+
entry["points"][t][cmd] = float(_point_value(point, default=0.0))
212213

213214

214215
def _apply_processed_categories(table):
@@ -244,13 +245,7 @@ def _attach_memory_usage(results, table, key_name="BytesUsedForCache"):
244245
# take the max usage observed
245246
maxv = 0
246247
for point in ts.points:
247-
try:
248-
v = int(point.value.int64_value)
249-
except Exception:
250-
try:
251-
v = int(point.value.double_value)
252-
except Exception:
253-
v = 0
248+
v = int(_point_value(point))
254249
if v > maxv:
255250
maxv = v
256251
prev = entry.get(key_name, 0)
@@ -270,13 +265,7 @@ def _attach_capacity_scalar(results, table, key_name="MaxMemory"):
270265
)
271266
v_max = 0
272267
for point in ts.points:
273-
try:
274-
v = int(point.value.int64_value)
275-
except Exception:
276-
try:
277-
v = int(point.value.double_value)
278-
except Exception:
279-
v = 0
268+
v = int(_point_value(point))
280269
if v > v_max:
281270
v_max = v
282271
if v_max > cap_by_inst[inst_key]:
@@ -288,6 +277,30 @@ def _attach_capacity_scalar(results, table, key_name="MaxMemory"):
288277
nodes[node_id][key_name] = cap_by_inst[inst_key]
289278

290279

280+
def _attach_node_role(results, table):
281+
"""Set NodeRole using the dedicated replication/role metric.
282+
283+
The 'role' label on commands/calls is metadata — not its purpose to report
284+
node role — and has been observed returning 'replica' for both nodes on
285+
Standard Tier instances (GTI-608, ~93 affected clusters).
286+
287+
replication/role is the GCP-designated metric for this: 1 = primary, 0 = replica.
288+
See: https://cloud.google.com/memorystore/docs/redis/supported-monitoring-metrics
289+
"""
290+
for ts in results:
291+
rlabels = dict(ts.resource.labels)
292+
inst_key = rlabels.get("instance_id") or "unknown"
293+
node_id = rlabels.get("node_id") or "unknown"
294+
if inst_key not in table or node_id not in table[inst_key]:
295+
continue
296+
297+
if not ts.points:
298+
continue
299+
300+
role_val = int(_point_value(ts.points[0]))
301+
table[inst_key][node_id]["NodeRole"] = "Master" if role_val == 1 else "Replica"
302+
303+
291304
def _flatten_rows(table, project_id: str, instance_type: str) -> List[Dict[str, Any]]:
292305
rows = []
293306
for inst_key, nodes in table.items():
@@ -359,6 +372,16 @@ def collect_for_product(
359372
except Exception:
360373
pass
361374

375+
# Node role (Redis only - uses authoritative replication/role metric)
376+
if "replication_role" in metric_map:
377+
try:
378+
role_results = _list_ts(
379+
client, project_name, metric_map["replication_role"], interval
380+
)
381+
_attach_node_role(role_results, table)
382+
except Exception:
383+
pass
384+
362385
# Compute command categories
363386
_apply_processed_categories(table)
364387

msstats.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -867,7 +867,7 @@ def main():
867867
help="Duration of the metric window in seconds. Default is 604800 (7 days).",
868868
)
869869

870-
(options, _) = parser.parse_args()
870+
options, _ = parser.parse_args()
871871

872872
if not os.path.isdir(options.outDir):
873873
os.makedirs(options.outDir)

0 commit comments

Comments
 (0)