Skip to content

Commit 5d19587

Browse files
[DBMON-6602] Avoid cleanup when cancel called while check running (#23728) (#23729)
* Avoid cleanup when cancel called while check running * Add changelog * check if cancelled before running jobs * Add some debug lines for cancel flow (cherry picked from commit 4c7e8bb) Co-authored-by: Eric Weaver <eweaver755@gmail.com>
1 parent 83acc5a commit 5d19587

3 files changed

Lines changed: 158 additions & 29 deletions

File tree

postgres/changelog.d/23728.fixed

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a crash caused by cancel closing database connections while the check is still running.

postgres/datadog_checks/postgres/postgres.py

Lines changed: 83 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import copy
66
import functools
77
import os
8+
import threading
89
from string import Template
910
from time import time
1011

@@ -17,7 +18,6 @@
1718
from datadog_checks.base.utils.db.core import QueryManager
1819
from datadog_checks.base.utils.db.health import HealthEvent, HealthStatus
1920
from datadog_checks.base.utils.db.utils import (
20-
DBMAsyncJob,
2121
default_json_event_encoding,
2222
tracked_query,
2323
)
@@ -194,6 +194,10 @@ def __init__(self, name, init_config, instances):
194194

195195
self.diagnosis.register(functools.partial(run_diagnostics, self))
196196

197+
self._cancel_lock = threading.Lock()
198+
self._is_running = False
199+
self._cancelled = False
200+
197201
def database_monitoring_column_statistics(self, raw_event: str):
198202
self.event_platform_event(raw_event, "dbm-column-statistics")
199203

@@ -476,38 +480,87 @@ def dynamic_queries(self):
476480

477481
return self._dynamic_queries
478482

479-
@staticmethod
480-
def _cancel_async_job(job: DBMAsyncJob):
481-
job.cancel()
482-
if job._job_loop_future:
483-
job._job_loop_future.result()
484-
job._job_loop_future = None
485-
job._shutdown()
483+
def run(self):
484+
# TODO: move this lock into the base class
485+
with self._cancel_lock:
486+
if self._cancelled:
487+
self.log.debug("run() skipped, check already cancelled")
488+
return ''
489+
self._is_running = True
490+
try:
491+
return super().run()
492+
finally:
493+
needs_finalize = False
494+
with self._cancel_lock:
495+
self._is_running = False
496+
if self._cancelled:
497+
needs_finalize = True
498+
if needs_finalize:
499+
self.log.debug("Check cancel has been signaled, finalizing now that run() is complete")
500+
self._finalize()
486501

487502
def cancel(self):
503+
"""Signal that the check is being unscheduled.
504+
505+
This method can be called while check() is running on another thread
506+
(the GIL is released during psycopg I/O). It must not perform any
507+
destructive operations — closing connections or nulling attributes that
508+
check() depends on — because that causes a SIGSEGV in libpq when
509+
check() resumes.
510+
511+
Destructive cleanup is deferred to _finalize(), which is called either
512+
here (if the check is idle) or by run()'s finally block (if the check
513+
is in-flight). The Agent guarantees it will not call run() again after
514+
cancel().
488515
"""
489-
Cancels and sends cancel signal to all threads.
490-
"""
516+
self.log.debug("Marking check as cancelled")
517+
self._cancel_async_jobs()
518+
needs_finalize = False
519+
with self._cancel_lock:
520+
self._cancelled = True
521+
if not self._is_running:
522+
needs_finalize = True
523+
if needs_finalize:
524+
self.log.debug("cancel() finalizing immediately, check is idle")
525+
self._finalize()
526+
else:
527+
self.log.debug("cancel() deferred finalize, check is still running")
528+
529+
@property
530+
def _async_jobs(self):
531+
"""Return the async jobs active for this check's configuration."""
532+
jobs = []
491533
if self._config.dbm:
492-
self._cancel_async_job(self.statement_metrics)
493-
self._cancel_async_job(self.statement_samples)
494-
self._cancel_async_job(self.metadata_samples)
534+
jobs.extend([self.statement_metrics, self.statement_samples, self.metadata_samples])
495535
elif self._config.data_observability.enabled:
496-
self._cancel_async_job(self.metadata_samples)
536+
jobs.append(self.metadata_samples)
497537
if self._config.data_observability.enabled:
498-
self._cancel_async_job(self.data_observability)
538+
jobs.append(self.data_observability)
539+
return jobs
540+
541+
def _cancel_async_jobs(self):
542+
"""Signal async jobs to stop. Safe to call while check() is running."""
543+
for job in self._async_jobs:
544+
job.cancel()
545+
546+
def _finalize(self):
547+
"""Tear down check state. Must not run while check() is executing."""
548+
self.log.debug("Finalizing check: closing connections and clearing state")
549+
for job in self._async_jobs:
550+
if job._job_loop_future:
551+
job._job_loop_future.result()
552+
job._job_loop_future = None
553+
job._shutdown()
499554
self._clean_state()
500-
self._query_manager = None
501-
self.health = None
502555
self.check_initializations.clear()
503556
# TODO: move diagnosis cleanup into AgentCheck.cancel() in the base class
504557
self._diagnosis = None
558+
self.log.check = None
559+
self._query_manager = None
560+
self.health = None
505561
self._close_db()
506562
self._close_db_pool()
507-
# CheckLoggingAdapter holds self.check until check_id is resolved via
508-
# process(), which only happens after the agent scheduler calls run().
509-
# If cancel() is called before that, the back-reference is never cleared.
510-
self.log.check = None
563+
self.log.debug("Check cleanup complete")
511564

512565
def _clean_state(self):
513566
self.log.debug("Cleaning state")
@@ -1191,14 +1244,15 @@ def check(self, _):
11911244

11921245
if not self._config.only_custom_queries:
11931246
self._collect_stats(tags)
1194-
if self._config.dbm:
1195-
self.statement_metrics.run_job_loop(tags)
1196-
self.statement_samples.run_job_loop(tags)
1197-
self.metadata_samples.run_job_loop(tags)
1198-
elif self._config.data_observability.enabled:
1199-
self.metadata_samples.run_job_loop(tags)
1200-
if self._config.data_observability.enabled:
1201-
self.data_observability.run_job_loop(tags)
1247+
if not self._cancelled:
1248+
if self._config.dbm:
1249+
self.statement_metrics.run_job_loop(tags)
1250+
self.statement_samples.run_job_loop(tags)
1251+
self.metadata_samples.run_job_loop(tags)
1252+
elif self._config.data_observability.enabled:
1253+
self.metadata_samples.run_job_loop(tags)
1254+
if self._config.data_observability.enabled:
1255+
self.data_observability.run_job_loop(tags)
12021256
if self._config.collect_wal_metrics is True:
12031257
# collect wal metrics for pg < 10 only when explicitly enabled
12041258
# (requires local filesystem access to the WAL directory)

postgres/tests/test_unit.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,80 @@ def test_check_gc_after_cancel(pg_instance):
443443
gc.enable()
444444

445445

446+
def test_cancel_during_running_check_defers_finalize(pg_instance):
447+
"""Verify that cancel() during an in-flight check() does not close connections.
448+
449+
Destructive cleanup (_finalize) must be deferred until run() completes so
450+
that check() never accesses a closed psycopg connection, which would cause
451+
a SIGSEGV in libpq.
452+
"""
453+
import threading
454+
455+
check = PostgreSql('postgres', {}, [pg_instance])
456+
conn = mock.MagicMock()
457+
check._db = conn
458+
459+
check_started = threading.Event()
460+
cancel_done = threading.Event()
461+
462+
def slow_run(self_arg):
463+
check_started.set()
464+
cancel_done.wait(timeout=5)
465+
return ''
466+
467+
run_result = [None]
468+
469+
def run_check():
470+
with mock.patch.object(type(check).__mro__[1], 'run', slow_run):
471+
run_result[0] = check.run()
472+
473+
run_thread = threading.Thread(target=run_check)
474+
run_thread.start()
475+
476+
check_started.wait(timeout=5)
477+
478+
check.cancel()
479+
# cancel() should have signaled but NOT finalized since run() is in-flight
480+
assert not conn.close.called, "_close_db() ran while check() was still executing"
481+
assert check._cancelled is True
482+
483+
cancel_done.set()
484+
run_thread.join(timeout=5)
485+
486+
# After run() completes, _finalize() should have been called
487+
conn.close.assert_called_once()
488+
assert check._db is None
489+
assert check._query_manager is None
490+
assert check.health is None
491+
492+
493+
def test_cancel_on_idle_check_finalizes_immediately(pg_instance):
494+
"""Verify that cancel() on an idle check runs _finalize() inline."""
495+
check = PostgreSql('postgres', {}, [pg_instance])
496+
conn = mock.MagicMock()
497+
check._db = conn
498+
499+
assert not check._is_running
500+
501+
check.cancel()
502+
503+
conn.close.assert_called_once()
504+
assert check._db is None
505+
assert check._query_manager is None
506+
assert check.health is None
507+
508+
509+
def test_run_after_cancel_returns_immediately(pg_instance):
510+
"""Verify that run() returns '' without executing check() if already cancelled."""
511+
check = PostgreSql('postgres', {}, [pg_instance])
512+
check.cancel()
513+
514+
with mock.patch.object(check, 'check', side_effect=AssertionError("check() should not be called")):
515+
result = check.run()
516+
517+
assert result == ''
518+
519+
446520
def test_collect_column_statistics_updates_timestamp_on_failure(pg_instance):
447521
pg_instance['dbm'] = True
448522
pg_instance['collect_column_statistics'] = {'enabled': True, 'collection_interval': 60}

0 commit comments

Comments
 (0)