Skip to content

Commit 566c265

Browse files
committed
Avoid cleanup when cancel called while check running
1 parent d54c6b8 commit 566c265

2 files changed

Lines changed: 140 additions & 21 deletions

File tree

postgres/datadog_checks/postgres/postgres.py

Lines changed: 66 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import copy
66
import functools
77
import os
8+
import threading
89
from string import Template
910
from time import time
1011

@@ -17,7 +18,6 @@
1718
from datadog_checks.base.utils.db.core import QueryManager
1819
from datadog_checks.base.utils.db.health import HealthEvent, HealthStatus
1920
from datadog_checks.base.utils.db.utils import (
20-
DBMAsyncJob,
2121
default_json_event_encoding,
2222
tracked_query,
2323
)
@@ -194,6 +194,10 @@ def __init__(self, name, init_config, instances):
194194

195195
self.diagnosis.register(functools.partial(run_diagnostics, self))
196196

197+
self._cancel_lock = threading.Lock()
198+
self._is_running = False
199+
self._cancelled = False
200+
197201
def database_monitoring_column_statistics(self, raw_event: str):
198202
self.event_platform_event(raw_event, "dbm-column-statistics")
199203

@@ -476,38 +480,79 @@ def dynamic_queries(self):
476480

477481
return self._dynamic_queries
478482

479-
@staticmethod
480-
def _cancel_async_job(job: DBMAsyncJob):
481-
job.cancel()
482-
if job._job_loop_future:
483-
job._job_loop_future.result()
484-
job._job_loop_future = None
485-
job._shutdown()
483+
def run(self):
484+
# TODO: move this lock into the base class
485+
with self._cancel_lock:
486+
if self._cancelled:
487+
return ''
488+
self._is_running = True
489+
try:
490+
return super().run()
491+
finally:
492+
needs_finalize = False
493+
with self._cancel_lock:
494+
self._is_running = False
495+
if self._cancelled:
496+
needs_finalize = True
497+
if needs_finalize:
498+
self._finalize()
486499

487500
def cancel(self):
501+
"""Signal that the check is being unscheduled.
502+
503+
This method can be called while check() is running on another thread
504+
(the GIL is released during psycopg I/O). It must not perform any
505+
destructive operations — closing connections or nulling attributes that
506+
check() depends on — because that causes a SIGSEGV in libpq when
507+
check() resumes.
508+
509+
Destructive cleanup is deferred to _finalize(), which is called either
510+
here (if the check is idle) or by run()'s finally block (if the check
511+
is in-flight). The Agent guarantees it will not call run() again after
512+
cancel().
488513
"""
489-
Cancels and sends cancel signal to all threads.
490-
"""
514+
self._cancel_async_jobs()
515+
needs_finalize = False
516+
with self._cancel_lock:
517+
self._cancelled = True
518+
if not self._is_running:
519+
needs_finalize = True
520+
if needs_finalize:
521+
self._finalize()
522+
523+
@property
524+
def _async_jobs(self):
525+
"""Return the async jobs active for this check's configuration."""
526+
jobs = []
491527
if self._config.dbm:
492-
self._cancel_async_job(self.statement_metrics)
493-
self._cancel_async_job(self.statement_samples)
494-
self._cancel_async_job(self.metadata_samples)
528+
jobs.extend([self.statement_metrics, self.statement_samples, self.metadata_samples])
495529
elif self._config.data_observability.enabled:
496-
self._cancel_async_job(self.metadata_samples)
530+
jobs.append(self.metadata_samples)
497531
if self._config.data_observability.enabled:
498-
self._cancel_async_job(self.data_observability)
532+
jobs.append(self.data_observability)
533+
return jobs
534+
535+
def _cancel_async_jobs(self):
536+
"""Signal async jobs to stop. Safe to call while check() is running."""
537+
for job in self._async_jobs:
538+
job.cancel()
539+
540+
def _finalize(self):
541+
"""Tear down check state. Must not run while check() is executing."""
542+
for job in self._async_jobs:
543+
if job._job_loop_future:
544+
job._job_loop_future.result()
545+
job._job_loop_future = None
546+
job._shutdown()
499547
self._clean_state()
500-
self._query_manager = None
501-
self.health = None
502548
self.check_initializations.clear()
503549
# TODO: move diagnosis cleanup into AgentCheck.cancel() in the base class
504550
self._diagnosis = None
551+
self.log.check = None
552+
self._query_manager = None
553+
self.health = None
505554
self._close_db()
506555
self._close_db_pool()
507-
# CheckLoggingAdapter holds self.check until check_id is resolved via
508-
# process(), which only happens after the agent scheduler calls run().
509-
# If cancel() is called before that, the back-reference is never cleared.
510-
self.log.check = None
511556

512557
def _clean_state(self):
513558
self.log.debug("Cleaning state")

postgres/tests/test_unit.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,80 @@ def test_check_gc_after_cancel(pg_instance):
443443
gc.enable()
444444

445445

446+
def test_cancel_during_running_check_defers_finalize(pg_instance):
447+
"""Verify that cancel() during an in-flight check() does not close connections.
448+
449+
Destructive cleanup (_finalize) must be deferred until run() completes so
450+
that check() never accesses a closed psycopg connection, which would cause
451+
a SIGSEGV in libpq.
452+
"""
453+
import threading
454+
455+
check = PostgreSql('postgres', {}, [pg_instance])
456+
conn = mock.MagicMock()
457+
check._db = conn
458+
459+
check_started = threading.Event()
460+
cancel_done = threading.Event()
461+
462+
def slow_run(self_arg):
463+
check_started.set()
464+
cancel_done.wait(timeout=5)
465+
return ''
466+
467+
run_result = [None]
468+
469+
def run_check():
470+
with mock.patch.object(type(check).__mro__[1], 'run', slow_run):
471+
run_result[0] = check.run()
472+
473+
run_thread = threading.Thread(target=run_check)
474+
run_thread.start()
475+
476+
check_started.wait(timeout=5)
477+
478+
check.cancel()
479+
# cancel() should have signaled but NOT finalized since run() is in-flight
480+
assert not conn.close.called, "_close_db() ran while check() was still executing"
481+
assert check._cancelled is True
482+
483+
cancel_done.set()
484+
run_thread.join(timeout=5)
485+
486+
# After run() completes, _finalize() should have been called
487+
conn.close.assert_called_once()
488+
assert check._db is None
489+
assert check._query_manager is None
490+
assert check.health is None
491+
492+
493+
def test_cancel_on_idle_check_finalizes_immediately(pg_instance):
494+
"""Verify that cancel() on an idle check runs _finalize() inline."""
495+
check = PostgreSql('postgres', {}, [pg_instance])
496+
conn = mock.MagicMock()
497+
check._db = conn
498+
499+
assert not check._is_running
500+
501+
check.cancel()
502+
503+
conn.close.assert_called_once()
504+
assert check._db is None
505+
assert check._query_manager is None
506+
assert check.health is None
507+
508+
509+
def test_run_after_cancel_returns_immediately(pg_instance):
510+
"""Verify that run() returns '' without executing check() if already cancelled."""
511+
check = PostgreSql('postgres', {}, [pg_instance])
512+
check.cancel()
513+
514+
with mock.patch.object(check, 'check', side_effect=AssertionError("check() should not be called")):
515+
result = check.run()
516+
517+
assert result == ''
518+
519+
446520
def test_collect_column_statistics_updates_timestamp_on_failure(pg_instance):
447521
pg_instance['dbm'] = True
448522
pg_instance['collect_column_statistics'] = {'enabled': True, 'collection_interval': 60}

0 commit comments

Comments
 (0)