Skip to content

Commit b99bb44

Browse files
joelmarcotte and claude authored
Switch replica_sync_state to use per-replica DMV (DataDog#23310)
* Switch replica_sync_state to use per-replica DMV

  The previous query used sys.dm_hadr_database_replica_states which returns one row per database per replica, but lacked a database_name tag. This caused metric collisions for AGs with multiple databases. Switch to sys.dm_hadr_availability_replica_states which returns one row per replica, matching the metric's intent.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Apply ruff formatting changes

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* changelog update

* changelog update

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Keep synchronization_state_desc and synchronization_state tags for backward compatibility

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Remove synchronization_health_desc tag, keep original tag names

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Update changelog to reflect preserved tag names

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Alias synchronization_health columns in SQL query for clarity

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Alias synchronization_health columns in SQL query for clarity

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Fix changelog wording

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Drop synchronization_state tag (was not present before)

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0ef7a10 commit b99bb44

7 files changed

Lines changed: 42 additions & 32 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Switch `sqlserver.ao.replica_sync_state` from `sys.dm_hadr_database_replica_states` to `sys.dm_hadr_availability_replica_states` to report per-replica synchronization health instead of per-database synchronization state. The `synchronization_state_desc` tag name is preserved for backward compatibility, but its values now reflect the replica-level health rollup (e.g. `HEALTHY`, `PARTIALLY_HEALTHY`, `NOT_HEALTHY`). The previous implementation was broken for availability groups containing multiple databases, as rows would collide on the same tag set and only the last value was reported. To get per-database synchronization state, use `sqlserver.ao.replica_status` which includes a `synchronization_state` tag with full database-level granularity.

sqlserver/datadog_checks/sqlserver/const.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -187,7 +187,7 @@
187187
('sqlserver.ao.ag_sync_health', 'sys.dm_hadr_availability_group_states', 'synchronization_health'),
188188
]
189189
AO_REPLICA_SYNC_METRICS = [
190-
('sqlserver.ao.replica_sync_state', 'sys.dm_hadr_database_replica_states', 'synchronization_state'),
190+
('sqlserver.ao.replica_sync_state', 'sys.dm_hadr_availability_replica_states', 'synchronization_health'),
191191
]
192192
AO_REPLICA_FAILOVER_METRICS = [
193193
('sqlserver.ao.replica_failover_mode', 'sys.availability_replicas', 'failover_mode'),

sqlserver/datadog_checks/sqlserver/database_metrics/database_replication_stats_metrics.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5,18 +5,18 @@
55
from .base import SqlserverDatabaseMetricsBase
66

77
DATABASE_REPLICATION_STATS_METRICS_QUERY = {
8-
"name": "sys.dm_hadr_database_replica_states",
8+
"name": "sys.dm_hadr_availability_replica_states",
99
"query": """SELECT
1010
resource_group_id,
1111
name,
1212
replica_server_name,
13-
synchronization_state_desc,
14-
synchronization_state
15-
from sys.dm_hadr_database_replica_states as dhdrs
13+
synchronization_health_desc as synchronization_state_desc,
14+
synchronization_health as replica_sync_state
15+
from sys.dm_hadr_availability_replica_states as dhars
1616
inner join sys.availability_groups as ag
17-
on ag.group_id = dhdrs.group_id
17+
on ag.group_id = dhars.group_id
1818
inner join sys.availability_replicas as ar
19-
on dhdrs.replica_id = ar.replica_id
19+
on dhars.replica_id = ar.replica_id
2020
""".strip(),
2121
"columns": [
2222
{"name": "availability_group", "type": "tag"},

sqlserver/datadog_checks/sqlserver/schemas.py

Lines changed: 22 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -197,26 +197,28 @@ def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject:
197197
"name": cursor_row.get("schema_name"),
198198
"id": str(cursor_row.get("schema_id")), # Backend expects a string
199199
"owner_name": cursor_row.get("owner_name"),
200-
"tables": [
201-
{
202-
k: v
203-
for k, v in {
204-
"id": str(cursor_row.get("table_id")), # Backend expects a string
205-
"name": cursor_row.get("table_name"),
206-
"columns": [column for column in columns if column.get("name") is not None],
207-
"indexes": [index for index in indexes if index.get("name") is not None],
208-
"foreign_keys": [
209-
foreign_key
210-
for foreign_key in foreign_keys
211-
if foreign_key.get("foreign_key_name") is not None
212-
],
213-
"partitions": {"partition_count": partition_count},
214-
}.items()
215-
if v is not None
216-
}
217-
]
218-
if cursor_row.get("table_name") is not None
219-
else [],
200+
"tables": (
201+
[
202+
{
203+
k: v
204+
for k, v in {
205+
"id": str(cursor_row.get("table_id")), # Backend expects a string
206+
"name": cursor_row.get("table_name"),
207+
"columns": [column for column in columns if column.get("name") is not None],
208+
"indexes": [index for index in indexes if index.get("name") is not None],
209+
"foreign_keys": [
210+
foreign_key
211+
for foreign_key in foreign_keys
212+
if foreign_key.get("foreign_key_name") is not None
213+
],
214+
"partitions": {"partition_count": partition_count},
215+
}.items()
216+
if v is not None
217+
}
218+
]
219+
if cursor_row.get("table_name") is not None
220+
else []
221+
),
220222
}
221223
]
222224
return object

sqlserver/metadata.csv

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ sqlserver.ao.redo_rate,gauge,,byte,second,"Average rate at which the log records
2525
sqlserver.ao.replica_failover_mode,gauge,,,,"Replica failover mode: 0 = Automatic failover, 1 = Manual failover. Tags: `replica_server_name`, `availability_group`, `availability_group_name`, `failover_mode_desc`",0,sql_server,replica failover mode,,
2626
sqlserver.ao.replica_failover_readiness,gauge,,,,"Replica failover readiness: 0 = Not ready for failover, 1 = Ready for failover. Tags: `replica_server_name`, `availability_group`, `availability_group_name`, `failover_mode_desc`",0,sql_server,replica failover readiness,,
2727
sqlserver.ao.replica_status,gauge,,,,"Denotes an Availability Group replica's status. Tags: `availability_group`, `availability_group_name`, `replica_server_name`, `failover_mode`, `availability_mode`, `db`, `replica_id`, `database_id`, `database_state`, `synchronization_state`, `failover_cluster`, `replica_role`",0,sql_server,is primary replica,,
28-
sqlserver.ao.replica_sync_state,gauge,,,,"Replica synchronization state: 0 = Not synchronizing, 1 = Synchronizing, 2 = Synchronized, 3 = Reverting, 4 = Initializing. Tags: `availability_group`, `availability_group_name`, `synchronization_health_desc`",0,sql_server,replica sync state,,
28+
sqlserver.ao.replica_sync_state,gauge,,,,"Replica synchronization health: 0 = Not healthy, 1 = Partially healthy, 2 = Healthy. Tags: `availability_group`, `availability_group_name`, `replica_server_name`, `synchronization_state_desc`",0,sql_server,replica sync state,,
2929
sqlserver.ao.secondary_lag_seconds,gauge,,second,,"The number of seconds that the secondary replica is behind the primary replica during synchronization. Tags: `availability_group`, `availability_group_name`, `replica_server_name`, `failover_mode`, `availability_mode`, `db`, `replica_id`, `database_id`, `database_state`, `synchronization_state`, `failover_cluster`, `replica_role`",0,sql_server,secondary lag seconds,,
3030
sqlserver.ao.secondary_replica_health,gauge,,,,"Recovery health of secondary replica: 0 = In progress, 1 = Online. The metric is not emitted if on a primary replica. Tags: `availability_group`, `availability_group_name`, `synchronization_health_desc`",0,sql_server,secondary replica health,,
3131
sqlserver.buffer.cache_hit_ratio,gauge,,fraction,,The ratio of data pages found and read from the buffer cache over all data page requests. (Perf. Counter: `Buffer Manager - Buffer cache hit ratio`),1,sql_server,buff hit ratio,,

sqlserver/tests/test_database_metrics.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -339,25 +339,31 @@ def execute_query_handler_mocked(query, db=None):
339339
pytest.param(
340340
None,
341341
None,
342-
[('AG1', 'AG1', 'aoag_secondary', 'SYNCHRONIZED', 2), ('AG1', 'AG1', 'aoag_primary', 'SYNCHRONIZED', 2)],
342+
[
343+
('AG1', 'AG1', 'aoag_secondary', 'HEALTHY', 2),
344+
('AG1', 'AG1', 'aoag_primary', 'HEALTHY', 2),
345+
],
343346
id='no availability_group, no only_emit_local',
344347
),
345348
pytest.param(
346349
'AG1',
347350
None,
348-
[('AG1', 'AG1', 'aoag_secondary', 'SYNCHRONIZED', 2), ('AG1', 'AG1', 'aoag_primary', 'SYNCHRONIZED', 2)],
351+
[
352+
('AG1', 'AG1', 'aoag_secondary', 'HEALTHY', 2),
353+
('AG1', 'AG1', 'aoag_primary', 'HEALTHY', 2),
354+
],
349355
id='availability_group set, no only_emit_local',
350356
),
351357
pytest.param(
352358
None,
353359
True,
354-
[('AG1', 'AG1', 'aoag_primary', 'SYNCHRONIZED', 2)],
360+
[('AG1', 'AG1', 'aoag_primary', 'HEALTHY', 2)],
355361
id='no availability_group, only_emit_local is True',
356362
),
357363
pytest.param(
358364
'AG1',
359365
True,
360-
[('AG1', 'AG1', 'aoag_primary', 'SYNCHRONIZED', 2)],
366+
[('AG1', 'AG1', 'aoag_primary', 'HEALTHY', 2)],
361367
id='availability_group set, only_emit_local is True',
362368
),
363369
],

sqlserver/tests/test_metrics.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -604,6 +604,7 @@ def test_check_ao_secondary_replica(aggregator, dd_run_check, init_config, insta
604604
'availability_group',
605605
'availability_group_name',
606606
'synchronization_state_desc',
607+
'synchronization_state',
607608
'replica_server_name',
608609
):
609610
aggregator.assert_metric_has_tag_prefix(metric_name, tag_prefix=tag_prefix)

0 commit comments

Comments
 (0)