Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions dcgm/README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Agent Check: DCGM Exporter
{{< callout url="https://docs.datadoghq.com/gpu_monitoring/setup/?tab=datadogoperator" >}}
{{< callout >}}

This is a legacy integration that is no longer being developed or fully supported. For advanced insights into your GPU fleet, use Datadog's GPU Monitoring instead of installing this integration.
This is a legacy integration that is no longer being developed or fully supported. For advanced insights into your GPU fleet, use Datadog's [GPU Monitoring](https://docs.datadoghq.com/gpu_monitoring/) instead of installing this integration.
{{< /callout >}}

## Overview
We recommend using [GPU Monitoring][18], which not only provides metric parity with this integration, but even more metrics and guided remediation actions across your AI stack. [See the Datadog GPU Monitoring documentation][19] for setup instructions.
We recommend using [GPU Monitoring][18], which not only provides metric parity with this integration, but even more metrics and guided remediation actions across your AI stack. [See the Datadog GPU Monitoring documentation][19] for setup instructions.

This legacy check submits metrics exposed by the [NVIDIA DCGM Exporter][16] in Datadog Agent format. For more information on NVIDIA Data Center GPU Manager (DCGM), see [NVIDIA DCGM][15].

Expand Down
1 change: 1 addition & 0 deletions postgres/changelog.d/24102.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Cache parameterized-query explain failures caused by unresolvable parameter types so they aren't re-attempted on every collection.
112 changes: 90 additions & 22 deletions postgres/datadog_checks/postgres/explain_parameterized_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import logging
import re
from typing import Dict, List, Optional, Tuple

import psycopg

Expand All @@ -26,6 +27,16 @@

EXPLAIN_QUERY = 'SELECT {explain_function}(%s)'

# Errors raised when PostgreSQL can't resolve a parameter's type while preparing or explaining a parameterized
# query (e.g. untyped NULL parameters from ORMs using the extended query protocol: "operator does not exist:
# bigint = text"). The parameter types that made the original query valid aren't available in pg_stat_activity,
# so these queries can't be explained and the failure is deterministic for a given query signature.
EXPECTED_PARAMETER_TYPE_ERRORS = (
psycopg.errors.IndeterminateDatatype,
psycopg.errors.UndefinedFunction,
psycopg.errors.DatatypeMismatch,
)


def agent_check_getter(self):
return self._check
Expand Down Expand Up @@ -73,7 +84,9 @@ def __init__(self, check, config, explain_function):
self._explain_function = explain_function

@tracked_method(agent_check_getter=agent_check_getter)
def explain_statement(self, dbname, statement, obfuscated_statement, query_signature):
def explain_statement(
self, dbname: str, statement: str, obfuscated_statement: str, query_signature: str
) -> Tuple[Optional[Dict], Optional[DBExplainError], Optional[str]]:
if self._check.version < V12:
# if pg version < 12, skip explaining parameterized queries because
# plan_cache_mode is not supported
Expand All @@ -86,18 +99,27 @@ def explain_statement(self, dbname, statement, obfuscated_statement, query_signa
with self._check.db_pool.get_connection(dbname) as conn:
try:
self._set_plan_cache_mode(conn)
self._create_prepared_statement(conn, statement, obfuscated_statement, query_signature)
except psycopg.errors.IndeterminateDatatype as e:
return None, DBExplainError.indeterminate_datatype, '{}'.format(type(e))
except psycopg.errors.UndefinedFunction as e:
return None, DBExplainError.undefined_function, '{}'.format(type(e))
prepared_statement_error = self._create_prepared_statement(
conn, statement, obfuscated_statement, query_signature
)
except Exception as e:
# if we fail to create a prepared statement, we cannot explain the query
# an unexpected failure creating the prepared statement means we cannot explain the query
return None, DBExplainError.failed_to_explain_with_prepared_statement, '{}'.format(type(e))

if prepared_statement_error is not None:
# an expected, deterministic parameter type-resolution failure (e.g. untyped NULL parameters).
error_code, err_msg = prepared_statement_error
return None, error_code, err_msg

try:
result = self._explain_prepared_statement(conn, statement, obfuscated_statement, query_signature)
if result:
result, explain_error = self._explain_prepared_statement(
conn, statement, obfuscated_statement, query_signature
)
if explain_error is not None:
# an expected, deterministic parameter type-resolution failure surfaced during EXPLAIN EXECUTE
error_code, err_msg = explain_error
return None, error_code, err_msg
elif result:
plan = result[0][0][0]
return plan, DBExplainError.explained_with_prepared_statement, None
else:
Expand All @@ -117,24 +139,57 @@ def _set_plan_cache_mode(self, conn):
self._execute_query(conn, "SET plan_cache_mode = force_generic_plan")

@tracked_method(agent_check_getter=agent_check_getter)
def _create_prepared_statement(self, conn, statement, obfuscated_statement, query_signature):
def _create_prepared_statement(
self, conn, statement: str, obfuscated_statement: str, query_signature: str
) -> Optional[Tuple[DBExplainError, str]]:
# Returns None on success, or a (DBExplainError, err_msg) tuple when the query can't be prepared because
# a parameter's type can't be resolved. Other unexpected errors are re-raised.
try:
self._execute_query(
conn,
PREPARE_STATEMENT_QUERY.format(query_signature=query_signature, statement=statement),
)
return None
except EXPECTED_PARAMETER_TYPE_ERRORS as e:
# The parameter types can't be resolved, so this query can't be prepared (and therefore can't be
# explained). Map the failure to the corresponding explain error code.
self._log_failed_statement(
'Failed to create prepared statement when explaining statement(%s)=[%s] | err=[%s]',
statement,
obfuscated_statement,
query_signature,
e,
)
return self._map_parameter_type_error(e)
except Exception as e:
logged_statement = obfuscated_statement
if self._config.log_unobfuscated_plans:
logged_statement = statement
logger.debug(
self._log_failed_statement(
'Failed to create prepared statement when explaining statement(%s)=[%s] | err=[%s]',
statement,
obfuscated_statement,
query_signature,
logged_statement,
e,
)
raise

def _map_parameter_type_error(self, e: Exception) -> Tuple[DBExplainError, str]:
# Map an unresolved parameter-type error to its specific explain error code so cached responses
# and emitted error tags reflect the actual failure rather than a generic one.
if isinstance(e, psycopg.errors.IndeterminateDatatype):
return DBExplainError.indeterminate_datatype, '{}'.format(type(e))
if isinstance(e, psycopg.errors.DatatypeMismatch):
return DBExplainError.datatype_mismatch, '{}'.format(type(e))
return DBExplainError.undefined_function, '{}'.format(type(e))

def _log_failed_statement(
self, message: str, statement: str, obfuscated_statement: str, query_signature: str, e: Exception
) -> None:
# Logs the obfuscated statement by default, falling back to the raw statement only when explicitly
# configured. The message is expected to interpolate (query_signature, statement, error) in that order.
logged_statement = obfuscated_statement
if self._config.log_unobfuscated_plans:
logged_statement = statement
logger.debug(message, query_signature, logged_statement, e)

@tracked_method(agent_check_getter=agent_check_getter)
def _get_number_of_parameters_for_prepared_statement(self, conn, query_signature):
rows = self._execute_query_and_fetch_rows(conn, PARAM_TYPES_COUNT_QUERY.format(query_signature=query_signature))
Expand All @@ -152,22 +207,35 @@ def _generate_prepared_statement_query(self, conn, query_signature: str) -> str:
return EXECUTE_PREPARED_STATEMENT_QUERY.format(prepared_statement=query_signature, parameters=parameters)

@tracked_method(agent_check_getter=agent_check_getter)
def _explain_prepared_statement(self, conn, statement, obfuscated_statement, query_signature):
def _explain_prepared_statement(
self, conn, statement: str, obfuscated_statement: str, query_signature: str
) -> Tuple[Optional[List], Optional[Tuple[DBExplainError, str]]]:
# Returns (rows, None) on success, or (None, (DBExplainError, err_msg)) when a parameter's type can't be
# resolved during EXPLAIN EXECUTE. Other unexpected errors are re-raised.
try:
prepared_statement_query = self._generate_prepared_statement_query(conn, query_signature)
return self._execute_query_and_fetch_rows(
rows = self._execute_query_and_fetch_rows(
conn,
EXPLAIN_QUERY.format(explain_function=self._explain_function),
(prepared_statement_query,),
)
return rows, None
except EXPECTED_PARAMETER_TYPE_ERRORS as e:
# The parameter types couldn't be resolved during EXPLAIN EXECUTE, so the query can't be explained.
self._log_failed_statement(
'Failed to explain parameterized statement(%s)=[%s] | err=[%s]',
statement,
obfuscated_statement,
query_signature,
e,
)
return None, self._map_parameter_type_error(e)
except Exception as e:
logged_statement = obfuscated_statement
if self._config.log_unobfuscated_plans:
logged_statement = statement
logger.debug(
self._log_failed_statement(
'Failed to explain parameterized statement(%s)=[%s] | err=[%s]',
statement,
obfuscated_statement,
query_signature,
logged_statement,
e,
)
raise
Expand Down
16 changes: 15 additions & 1 deletion postgres/datadog_checks/postgres/statement_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@

SUPPORTED_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update', 'with'})

# Deterministic parameterized-query explain failures that will not succeed on retry for a given query signature
# (e.g. a type mismatch caused by untyped parameters).
PARAMETERIZED_EXPLAIN_ERRORS_TO_CACHE = frozenset(
{
DBExplainError.undefined_function,
DBExplainError.indeterminate_datatype,
DBExplainError.datatype_mismatch,
}
)

# columns from pg_stat_activity which correspond to attributes common to all databases and are therefore stored in
# under other standard keys
pg_stat_activity_sample_exclude_keys = {
Expand Down Expand Up @@ -806,9 +816,13 @@ def _run_explain_safe(self, dbname, statement, obfuscated_statement, query_signa
# instead of trying to explain it then failing
if self._explain_parameterized_queries._is_parameterized_query(statement):
if is_affirmative(self._config.query_samples.explain_parameterized_queries):
return self._explain_parameterized_queries.explain_statement(
plan, explain_err_code, err_msg = self._explain_parameterized_queries.explain_statement(
dbname, statement, obfuscated_statement, query_signature
)
# Cache deterministic failures (e.g. type mismatches from untyped parameters)
if explain_err_code in PARAMETERIZED_EXPLAIN_ERRORS_TO_CACHE:
self._explain_errors_cache[query_signature] = (plan, explain_err_code, err_msg)
return plan, explain_err_code, err_msg
e = psycopg.errors.UndefinedParameter("Unable to explain parameterized query")
self._log.debug(
"Unable to collect execution plan, clients using the extended query protocol or prepared statements"
Expand Down
117 changes: 113 additions & 4 deletions postgres/tests/test_explain_parameterized_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def test_explain_parameterized_queries_explain_prepared_statement_no_plan_return

with mock.patch(
'datadog_checks.postgres.explain_parameterized_queries.ExplainParameterizedQueries._execute_query_and_fetch_rows',
return_value=None,
return_value=[],
):
plan_dict, explain_err_code, err = check.statement_samples._run_and_track_explain(
DB_NAME,
Expand All @@ -197,6 +197,52 @@ def test_explain_parameterized_queries_explain_prepared_statement_no_plan_return
assert err is None


@pytest.mark.integration
@pytest.mark.usefixtures("dd_environment")
@requires_over_12
@pytest.mark.parametrize(
"query,expected_error_code",
[
# the parameter appears only in `$1 IS NULL`, so its type can't be resolved while preparing
("SELECT * FROM pg_settings WHERE $1 IS NULL", DBExplainError.indeterminate_datatype),
# `setting` is text but `$1::int` pins $1 to int, leaving no `text = integer` operator
(
"SELECT * FROM pg_settings WHERE setting = $1 AND name = $1::int",
DBExplainError.undefined_function,
),
],
)
def test_parameterized_explain_type_resolution_failure_is_handled(
integration_check, dbm_instance, query, expected_error_code
):
"""A parameterized query whose parameter types can't be resolved can't be prepared or explained.

Real Postgres raises the type-resolution error while preparing the statement. The failure should
surface as an explain error code (not raise) and be cached per query signature so we don't keep
attempting to prepare a statement we already know we can't prepare.
"""
check = integration_check(dbm_instance)
check.check(dbm_instance)
query_signature = compute_sql_signature(query)

plan, explain_err_code, _ = check.statement_samples._run_and_track_explain(DB_NAME, query, query, query_signature)

assert plan is None
assert explain_err_code == expected_error_code
# the deterministic failure is cached so we don't re-attempt it every collection
assert query_signature in check.statement_samples._explain_errors_cache

# the cached failure short-circuits subsequent attempts without re-querying Postgres
with mock.patch.object(
check.statement_samples._explain_parameterized_queries,
'explain_statement',
return_value=(None, expected_error_code, None),
) as mock_explain:
_, cached_err_code, _ = check.statement_samples._run_explain_safe(DB_NAME, query, query, query_signature)
assert cached_err_code == expected_error_code
mock_explain.assert_not_called()


@requires_over_12
def test_generate_prepared_statement_query_no_parameters(integration_check, dbm_instance):
check = integration_check(dbm_instance)
Expand Down Expand Up @@ -232,21 +278,84 @@ def test_generate_prepared_statement_query_three_parameters(integration_check, d

@pytest.mark.unit
@requires_over_12
def test_create_prepared_statement_exception(integration_check, dbm_instance):
# psycopg.errors.DatabaseError is the parent of the expected type-resolution errors, so it also guards that
# the narrow except in _create_prepared_statement does not accidentally swallow unexpected database errors.
@pytest.mark.parametrize("exception_class", [Exception, psycopg.errors.DatabaseError])
def test_create_prepared_statement_exception(integration_check, dbm_instance, exception_class):
check = integration_check(dbm_instance)

query = "SELECT * FROM pg_settings WHERE name = $1"
query_signature = compute_sql_signature(query)
with mock.patch(
'datadog_checks.postgres.explain_parameterized_queries.ExplainParameterizedQueries._execute_query',
side_effect=Exception,
side_effect=exception_class,
):
with pytest.raises(Exception):
with pytest.raises(exception_class):
check.statement_samples._explain_parameterized_queries._create_prepared_statement(
DB_NAME, query, query, query_signature
)


@pytest.mark.unit
@requires_over_12
def test_create_prepared_statement_datatype_mismatch_maps_to_code(integration_check, dbm_instance):
"""A DatatypeMismatch during PREPARE means the statement can't be prepared, so it maps to the
datatype_mismatch error code instead of raising.

IndeterminateDatatype and UndefinedFunction are covered end-to-end against real Postgres in
test_parameterized_explain_type_resolution_failure_is_handled. DatatypeMismatch is exercised here
because it isn't reliably triggered by a portable query.
"""
check = integration_check(dbm_instance)
epq = check.statement_samples._explain_parameterized_queries

with mock.patch.object(epq, '_execute_query', side_effect=psycopg.errors.DatatypeMismatch("type mismatch")):
result = epq._create_prepared_statement(
None,
"SELECT id FROM t WHERE id = $1",
"SELECT id FROM t WHERE id = $1",
"test_sig",
)

assert result is not None
assert result[0] == DBExplainError.datatype_mismatch


@pytest.mark.unit
@pytest.mark.parametrize(
"exception_class,expected_error_code",
[
(psycopg.errors.IndeterminateDatatype, DBExplainError.indeterminate_datatype),
(psycopg.errors.DatatypeMismatch, DBExplainError.datatype_mismatch),
(psycopg.errors.UndefinedFunction, DBExplainError.undefined_function),
],
)
def test_explain_prepared_statement_type_mismatch_maps_to_code(
integration_check, dbm_instance, exception_class, expected_error_code
):
"""When EXPLAIN EXECUTE hits a parameter type-resolution error the statement can't be explained,
so _explain_prepared_statement returns the specific mapped error code rather than raising."""
check = integration_check(dbm_instance)
epq = check.statement_samples._explain_parameterized_queries

with mock.patch.object(epq, '_generate_prepared_statement_query', return_value="EXECUTE dd_test(null)"):
with mock.patch.object(
epq,
'_execute_query_and_fetch_rows',
side_effect=exception_class("operator does not exist: bigint = text"),
):
rows, explain_error = epq._explain_prepared_statement(
None,
"SELECT id FROM t WHERE id = $1",
"SELECT id FROM t WHERE id = $1",
"test_sig",
)

assert rows is None
assert explain_error is not None
assert explain_error[0] == expected_error_code


@pytest.mark.unit
@pytest.mark.parametrize(
"query,statement_is_parameterized_query",
Expand Down
Loading