diff --git a/dcgm/README.md b/dcgm/README.md index 70e21eb362d97..f6d3ce82cad81 100644 --- a/dcgm/README.md +++ b/dcgm/README.md @@ -1,11 +1,11 @@ # Agent Check: DCGM Exporter -{{< callout url="https://docs.datadoghq.com/gpu_monitoring/setup/?tab=datadogoperator" >}} +{{< callout >}} -This is a legacy integration that is no longer being developed or fully supported. For advanced insights into your GPU fleet, use Datadog's GPU Monitoring instead of installing this integration. +This is a legacy integration that is no longer being developed or fully supported. For advanced insights into your GPU fleet, use Datadog's [GPU Monitoring](https://docs.datadoghq.com/gpu_monitoring/) instead of installing this integration. {{< /callout >}} ## Overview -We recommend using [GPU Monitoring][18], which not only provides metric parity with this integration, but even more metrics and guided remediation actions across your AI stack. [See the Datadog GPU Monitoring documentation][19] for setup instructions. +We recommend using [GPU Monitoring][18], which not only provides metric parity with this integration, but even more metrics and guided remediation actions across your AI stack. [See the Datadog GPU Monitoring documentation][19] for setup instructions. This legacy check submits metrics exposed by the [NVIDIA DCGM Exporter][16] in Datadog Agent format. For more information on NVIDIA Data Center GPU Manager (DCGM), see [NVIDIA DCGM][15]. diff --git a/postgres/changelog.d/24102.fixed b/postgres/changelog.d/24102.fixed new file mode 100644 index 0000000000000..7ec40e166d230 --- /dev/null +++ b/postgres/changelog.d/24102.fixed @@ -0,0 +1 @@ +Cache parameterized-query explain failures caused by unresolvable parameter types so they aren't re-attempted on every collection. diff --git a/postgres/datadog_checks/postgres/explain_parameterized_queries.py b/postgres/datadog_checks/postgres/explain_parameterized_queries.py index 74cea579fe8b9..aacd3b0a6d273 100644 --- a/postgres/datadog_checks/postgres/explain_parameterized_queries.py +++ b/postgres/datadog_checks/postgres/explain_parameterized_queries.py @@ -4,6 +4,7 @@ import logging import re +from typing import Dict, List, Optional, Tuple import psycopg @@ -26,6 +27,16 @@ EXPLAIN_QUERY = 'SELECT {explain_function}(%s)' +# Errors raised when PostgreSQL can't resolve a parameter's type while preparing or explaining a parameterized +# query (e.g. untyped NULL parameters from ORMs using the extended query protocol: "operator does not exist: +# bigint = text"). The parameter types that made the original query valid aren't available in pg_stat_activity, +# so these queries can't be explained and the failure is deterministic for a given query signature. +EXPECTED_PARAMETER_TYPE_ERRORS = ( + psycopg.errors.IndeterminateDatatype, + psycopg.errors.UndefinedFunction, + psycopg.errors.DatatypeMismatch, +) + def agent_check_getter(self): return self._check @@ -73,7 +84,9 @@ def __init__(self, check, config, explain_function): self._explain_function = explain_function @tracked_method(agent_check_getter=agent_check_getter) - def explain_statement(self, dbname, statement, obfuscated_statement, query_signature): + def explain_statement( + self, dbname: str, statement: str, obfuscated_statement: str, query_signature: str + ) -> Tuple[Optional[Dict], Optional[DBExplainError], Optional[str]]: if self._check.version < V12: # if pg version < 12, skip explaining parameterized queries because # plan_cache_mode is not supported @@ -86,18 +99,27 @@ def explain_statement(self, dbname, statement, obfuscated_statement, query_signa with self._check.db_pool.get_connection(dbname) as conn: try: self._set_plan_cache_mode(conn) - self._create_prepared_statement(conn, statement, obfuscated_statement, query_signature) - except psycopg.errors.IndeterminateDatatype as e: - return None, DBExplainError.indeterminate_datatype, '{}'.format(type(e)) - except psycopg.errors.UndefinedFunction as e: - return None, DBExplainError.undefined_function, '{}'.format(type(e)) + prepared_statement_error = self._create_prepared_statement( + conn, statement, obfuscated_statement, query_signature + ) except Exception as e: - # if we fail to create a prepared statement, we cannot explain the query + # an unexpected failure creating the prepared statement means we cannot explain the query return None, DBExplainError.failed_to_explain_with_prepared_statement, '{}'.format(type(e)) + if prepared_statement_error is not None: + # an expected, deterministic parameter type-resolution failure (e.g. untyped NULL parameters). + error_code, err_msg = prepared_statement_error + return None, error_code, err_msg + try: - result = self._explain_prepared_statement(conn, statement, obfuscated_statement, query_signature) - if result: + result, explain_error = self._explain_prepared_statement( + conn, statement, obfuscated_statement, query_signature + ) + if explain_error is not None: + # an expected, deterministic parameter type-resolution failure surfaced during EXPLAIN EXECUTE + error_code, err_msg = explain_error + return None, error_code, err_msg + elif result: plan = result[0][0][0] return plan, DBExplainError.explained_with_prepared_statement, None else: @@ -117,24 +139,57 @@ def _set_plan_cache_mode(self, conn): self._execute_query(conn, "SET plan_cache_mode = force_generic_plan") @tracked_method(agent_check_getter=agent_check_getter) - def _create_prepared_statement(self, conn, statement, obfuscated_statement, query_signature): + def _create_prepared_statement( + self, conn, statement: str, obfuscated_statement: str, query_signature: str + ) -> Optional[Tuple[DBExplainError, str]]: + # Returns None on success, or a (DBExplainError, err_msg) tuple when the query can't be prepared because + # a parameter's type can't be resolved. Other unexpected errors are re-raised. try: self._execute_query( conn, PREPARE_STATEMENT_QUERY.format(query_signature=query_signature, statement=statement), ) + return None + except EXPECTED_PARAMETER_TYPE_ERRORS as e: + # The parameter types can't be resolved, so this query can't be prepared (and therefore can't be + # explained). Map the failure to the corresponding explain error code. + self._log_failed_statement( + 'Failed to create prepared statement when explaining statement(%s)=[%s] | err=[%s]', + statement, + obfuscated_statement, + query_signature, + e, + ) + return self._map_parameter_type_error(e) except Exception as e: - logged_statement = obfuscated_statement - if self._config.log_unobfuscated_plans: - logged_statement = statement - logger.debug( + self._log_failed_statement( 'Failed to create prepared statement when explaining statement(%s)=[%s] | err=[%s]', + statement, + obfuscated_statement, query_signature, - logged_statement, e, ) raise + def _map_parameter_type_error(self, e: Exception) -> Tuple[DBExplainError, str]: + # Map an unresolved parameter-type error to its specific explain error code so cached responses + # and emitted error tags reflect the actual failure rather than a generic one. + if isinstance(e, psycopg.errors.IndeterminateDatatype): + return DBExplainError.indeterminate_datatype, '{}'.format(type(e)) + if isinstance(e, psycopg.errors.DatatypeMismatch): + return DBExplainError.datatype_mismatch, '{}'.format(type(e)) + return DBExplainError.undefined_function, '{}'.format(type(e)) + + def _log_failed_statement( + self, message: str, statement: str, obfuscated_statement: str, query_signature: str, e: Exception + ) -> None: + # Logs the obfuscated statement by default, falling back to the raw statement only when explicitly + # configured. The message is expected to interpolate (query_signature, statement, error) in that order. + logged_statement = obfuscated_statement + if self._config.log_unobfuscated_plans: + logged_statement = statement + logger.debug(message, query_signature, logged_statement, e) + @tracked_method(agent_check_getter=agent_check_getter) def _get_number_of_parameters_for_prepared_statement(self, conn, query_signature): rows = self._execute_query_and_fetch_rows(conn, PARAM_TYPES_COUNT_QUERY.format(query_signature=query_signature)) @@ -152,22 +207,35 @@ def _generate_prepared_statement_query(self, conn, query_signature: str) -> str: return EXECUTE_PREPARED_STATEMENT_QUERY.format(prepared_statement=query_signature, parameters=parameters) @tracked_method(agent_check_getter=agent_check_getter) - def _explain_prepared_statement(self, conn, statement, obfuscated_statement, query_signature): + def _explain_prepared_statement( + self, conn, statement: str, obfuscated_statement: str, query_signature: str + ) -> Tuple[Optional[List], Optional[Tuple[DBExplainError, str]]]: + # Returns (rows, None) on success, or (None, (DBExplainError, err_msg)) when a parameter's type can't be + # resolved during EXPLAIN EXECUTE. Other unexpected errors are re-raised. try: prepared_statement_query = self._generate_prepared_statement_query(conn, query_signature) - return self._execute_query_and_fetch_rows( + rows = self._execute_query_and_fetch_rows( conn, EXPLAIN_QUERY.format(explain_function=self._explain_function), (prepared_statement_query,), ) + return rows, None + except EXPECTED_PARAMETER_TYPE_ERRORS as e: + # The parameter types couldn't be resolved during EXPLAIN EXECUTE, so the query can't be explained. + self._log_failed_statement( + 'Failed to explain parameterized statement(%s)=[%s] | err=[%s]', + statement, + obfuscated_statement, + query_signature, + e, + ) + return None, self._map_parameter_type_error(e) except Exception as e: - logged_statement = obfuscated_statement - if self._config.log_unobfuscated_plans: - logged_statement = statement - logger.debug( + self._log_failed_statement( 'Failed to explain parameterized statement(%s)=[%s] | err=[%s]', + statement, + obfuscated_statement, query_signature, - logged_statement, e, ) raise diff --git a/postgres/datadog_checks/postgres/statement_samples.py b/postgres/datadog_checks/postgres/statement_samples.py index a1ce074b98c7d..1134417ed033e 100644 --- a/postgres/datadog_checks/postgres/statement_samples.py +++ b/postgres/datadog_checks/postgres/statement_samples.py @@ -53,6 +53,16 @@ SUPPORTED_EXPLAIN_STATEMENTS = frozenset({'select', 'table', 'delete', 'insert', 'replace', 'update', 'with'}) +# Deterministic parameterized-query explain failures that will not succeed on retry for a given query signature +# (e.g. a type mismatch caused by untyped parameters). +PARAMETERIZED_EXPLAIN_ERRORS_TO_CACHE = frozenset( + { + DBExplainError.undefined_function, + DBExplainError.indeterminate_datatype, + DBExplainError.datatype_mismatch, + } +) + # columns from pg_stat_activity which correspond to attributes common to all databases and are therefore stored in # under other standard keys pg_stat_activity_sample_exclude_keys = { @@ -806,9 +816,13 @@ def _run_explain_safe(self, dbname, statement, obfuscated_statement, query_signa # instead of trying to explain it then failing if self._explain_parameterized_queries._is_parameterized_query(statement): if is_affirmative(self._config.query_samples.explain_parameterized_queries): - return self._explain_parameterized_queries.explain_statement( + plan, explain_err_code, err_msg = self._explain_parameterized_queries.explain_statement( dbname, statement, obfuscated_statement, query_signature ) + # Cache deterministic failures (e.g. type mismatches from untyped parameters) + if explain_err_code in PARAMETERIZED_EXPLAIN_ERRORS_TO_CACHE: + self._explain_errors_cache[query_signature] = (plan, explain_err_code, err_msg) + return plan, explain_err_code, err_msg e = psycopg.errors.UndefinedParameter("Unable to explain parameterized query") self._log.debug( "Unable to collect execution plan, clients using the extended query protocol or prepared statements" diff --git a/postgres/tests/test_explain_parameterized_queries.py b/postgres/tests/test_explain_parameterized_queries.py index 9f08beb116eba..7201c78df4b67 100644 --- a/postgres/tests/test_explain_parameterized_queries.py +++ b/postgres/tests/test_explain_parameterized_queries.py @@ -184,7 +184,7 @@ def test_explain_parameterized_queries_explain_prepared_statement_no_plan_return with mock.patch( 'datadog_checks.postgres.explain_parameterized_queries.ExplainParameterizedQueries._execute_query_and_fetch_rows', - return_value=None, + return_value=[], ): plan_dict, explain_err_code, err = check.statement_samples._run_and_track_explain( DB_NAME, @@ -197,6 +197,52 @@ def test_explain_parameterized_queries_explain_prepared_statement_no_plan_return assert err is None +@pytest.mark.integration +@pytest.mark.usefixtures("dd_environment") +@requires_over_12 +@pytest.mark.parametrize( + "query,expected_error_code", + [ + # the parameter appears only in `$1 IS NULL`, so its type can't be resolved while preparing + ("SELECT * FROM pg_settings WHERE $1 IS NULL", DBExplainError.indeterminate_datatype), + # `setting` is text but `$1::int` pins $1 to int, leaving no `text = integer` operator + ( + "SELECT * FROM pg_settings WHERE setting = $1 AND name = $1::int", + DBExplainError.undefined_function, + ), + ], +) +def test_parameterized_explain_type_resolution_failure_is_handled( + integration_check, dbm_instance, query, expected_error_code +): + """A parameterized query whose parameter types can't be resolved can't be prepared or explained. + + Real Postgres raises the type-resolution error while preparing the statement. The failure should + surface as an explain error code (not raise) and be cached per query signature so we don't keep + attempting to prepare a statement we already know we can't prepare. + """ + check = integration_check(dbm_instance) + check.check(dbm_instance) + query_signature = compute_sql_signature(query) + + plan, explain_err_code, _ = check.statement_samples._run_and_track_explain(DB_NAME, query, query, query_signature) + + assert plan is None + assert explain_err_code == expected_error_code + # the deterministic failure is cached so we don't re-attempt it every collection + assert query_signature in check.statement_samples._explain_errors_cache + + # the cached failure short-circuits subsequent attempts without re-querying Postgres + with mock.patch.object( + check.statement_samples._explain_parameterized_queries, + 'explain_statement', + return_value=(None, expected_error_code, None), + ) as mock_explain: + _, cached_err_code, _ = check.statement_samples._run_explain_safe(DB_NAME, query, query, query_signature) + assert cached_err_code == expected_error_code + mock_explain.assert_not_called() + + @requires_over_12 def test_generate_prepared_statement_query_no_parameters(integration_check, dbm_instance): check = integration_check(dbm_instance) @@ -232,21 +278,84 @@ def test_generate_prepared_statement_query_three_parameters(integration_check, d @pytest.mark.unit @requires_over_12 -def test_create_prepared_statement_exception(integration_check, dbm_instance): +# psycopg.errors.DatabaseError is the parent of the expected type-resolution errors, so it also guards that +# the narrow except in _create_prepared_statement does not accidentally swallow unexpected database errors. +@pytest.mark.parametrize("exception_class", [Exception, psycopg.errors.DatabaseError]) +def test_create_prepared_statement_exception(integration_check, dbm_instance, exception_class): check = integration_check(dbm_instance) query = "SELECT * FROM pg_settings WHERE name = $1" query_signature = compute_sql_signature(query) with mock.patch( 'datadog_checks.postgres.explain_parameterized_queries.ExplainParameterizedQueries._execute_query', - side_effect=Exception, + side_effect=exception_class, ): - with pytest.raises(Exception): + with pytest.raises(exception_class): check.statement_samples._explain_parameterized_queries._create_prepared_statement( DB_NAME, query, query, query_signature ) +@pytest.mark.unit +@requires_over_12 +def test_create_prepared_statement_datatype_mismatch_maps_to_code(integration_check, dbm_instance): + """A DatatypeMismatch during PREPARE means the statement can't be prepared, so it maps to the + datatype_mismatch error code instead of raising. + + IndeterminateDatatype and UndefinedFunction are covered end-to-end against real Postgres in + test_parameterized_explain_type_resolution_failure_is_handled. DatatypeMismatch is exercised here + because it isn't reliably triggered by a portable query. + """ + check = integration_check(dbm_instance) + epq = check.statement_samples._explain_parameterized_queries + + with mock.patch.object(epq, '_execute_query', side_effect=psycopg.errors.DatatypeMismatch("type mismatch")): + result = epq._create_prepared_statement( + None, + "SELECT id FROM t WHERE id = $1", + "SELECT id FROM t WHERE id = $1", + "test_sig", + ) + + assert result is not None + assert result[0] == DBExplainError.datatype_mismatch + + +@pytest.mark.unit +@pytest.mark.parametrize( + "exception_class,expected_error_code", + [ + (psycopg.errors.IndeterminateDatatype, DBExplainError.indeterminate_datatype), + (psycopg.errors.DatatypeMismatch, DBExplainError.datatype_mismatch), + (psycopg.errors.UndefinedFunction, DBExplainError.undefined_function), + ], +) +def test_explain_prepared_statement_type_mismatch_maps_to_code( + integration_check, dbm_instance, exception_class, expected_error_code +): + """When EXPLAIN EXECUTE hits a parameter type-resolution error the statement can't be explained, + so _explain_prepared_statement returns the specific mapped error code rather than raising.""" + check = integration_check(dbm_instance) + epq = check.statement_samples._explain_parameterized_queries + + with mock.patch.object(epq, '_generate_prepared_statement_query', return_value="EXECUTE dd_test(null)"): + with mock.patch.object( + epq, + '_execute_query_and_fetch_rows', + side_effect=exception_class("operator does not exist: bigint = text"), + ): + rows, explain_error = epq._explain_prepared_statement( + None, + "SELECT id FROM t WHERE id = $1", + "SELECT id FROM t WHERE id = $1", + "test_sig", + ) + + assert rows is None + assert explain_error is not None + assert explain_error[0] == expected_error_code + + @pytest.mark.unit @pytest.mark.parametrize( "query,statement_is_parameterized_query",