Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ibm_spectrum_lsf/assets/dashboards/overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@
"display_format": "countsAndList",
"hide_zero_counts": true,
"last_triggered_format": "relative",
"query": "tag:(source:ibm_spectrum_lsf)",
"query": "tag:(integration:ibm-spectrum-lsf)",
"show_last_triggered": false,
"show_priority": false,
"show_status": true,
Expand Down
2 changes: 1 addition & 1 deletion ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"type": "query alert",
"query": "avg(last_1d):anomalies(sum:ibm_spectrum_lsf.queue.pending{*} by {queue_name,lsf_cluster_name}, 'basic', 2, direction='above', interval=300, alert_window='last_1h', count_default_zero='true') >= 1",
"message": "{{#is_alert}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is higher than normal. \n{{/is_alert}}\n\n{{#is_warning}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is higher than normal.\n{{/is_warning}}\n\n{{#is_recovery}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is back to normal.\n{{/is_recovery}}",
"tags": [],
"tags": ["integration:ibm-spectrum-lsf"],
"options": {
"thresholds": {
"critical": 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"type": "query alert",
"query": "avg(last_5m):sum:ibm_spectrum_lsf.server.gpu.num_gpus_shared_available{*} by {lsf_cluster_name} <= 0",
"message": "{{#is_alert}}There are no GPU Slots available to be shared with other jobs on cluster {{lsf_cluster_name.name}}. Jobs that require GPU resources cannot run until the current GPU-using jobs complete. {{/is_alert}}\n\n{{#is_recovery}}\nGPU slots are now available on cluster {{lsf_cluster_name.name}}. \n{{/is_recovery}}",
"tags": [],
"tags": ["integration:ibm-spectrum-lsf"],
"options": {
"thresholds": {
"critical": 0
Expand Down
2 changes: 2 additions & 0 deletions mongo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,8 @@ Then, [instrument your application container][8] and set `DD_AGENT_HOST` to the

## Data Collected

Some of the metrics listed below require additional configuration; see the [sample mongo.d/conf.yaml][5] for all configurable options. Query metrics for MongoDB require Datadog Agent v7.78 or later.

### Metrics

See [metadata.csv][22] for a list of metrics provided by this check.
Expand Down
1 change: 1 addition & 0 deletions mysql/changelog.d/23593.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix schema collection query failure when the MySQL `ANSI_QUOTES` sql_mode is enabled.
2 changes: 1 addition & 1 deletion mysql/datadog_checks/mysql/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@
row_format as `row_format`,
create_time as `create_time`
FROM information_schema.TABLES
WHERE TABLE_SCHEMA = %s AND TABLE_TYPE="BASE TABLE"
WHERE TABLE_SCHEMA = %s AND TABLE_TYPE='BASE TABLE'
"""

SQL_COLUMNS = """\
Expand Down
1 change: 1 addition & 0 deletions postgres/changelog.d/23433.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Surface common Postgres and DBM setup issues through `datadog-agent diagnose`: connection/auth, Postgres version, server GUCs (`shared_preload_libraries`, `track_activity_query_size`, `track_io_timing`, `pg_stat_statements.max`), `pg_monitor` role, `pg_stat_activity` visibility, and per-database `datadog` schema, `pg_stat_statements`, and `datadog.explain_statement` checks. Probes are gated on the subfeature that consumes each dependency and cascade-suppressed so a single root cause yields one actionable row. The statement collector also now distinguishes `pg_stat_statements_not_loaded` from `pg_stat_statements_not_created` so `agent status` points at the right fix.
891 changes: 891 additions & 0 deletions postgres/datadog_checks/postgres/diagnose.py

Large diffs are not rendered by default.

13 changes: 11 additions & 2 deletions postgres/datadog_checks/postgres/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@

from .__about__ import __version__
from .config import build_config, sanitize
from .diagnose import PostgresDiagnose
from .util import (
ANALYZE_PROGRESS_METRICS,
AWS_RDS_HOSTNAME_SUFFIX,
Expand Down Expand Up @@ -99,7 +100,7 @@

MAX_CUSTOM_RESULTS = 100

PG_SETTINGS_QUERY = "SELECT name, setting FROM pg_settings WHERE name IN (%s, %s, %s)"
PG_SETTINGS_QUERY = "SELECT name, setting FROM pg_settings WHERE name IN (%s, %s, %s, %s)"


class PostgreSql(DatabaseCheck):
Expand Down Expand Up @@ -190,6 +191,9 @@ def __init__(self, name, init_config, instances):
ttl=self._config.database_instance_collection_interval,
) # type: TTLCache

# Register explicit pre-flight diagnostics for `datadog-agent diagnose`.
PostgresDiagnose(self).register()

def _submit_initialization_health_event(self):
try:
# Handle the config validation result after we've set tags so those tags are included in the health event
Expand Down Expand Up @@ -1033,7 +1037,12 @@ def _load_pg_settings(self, db):
self.log.debug("Running query [%s]", PG_SETTINGS_QUERY)
cursor.execute(
PG_SETTINGS_QUERY,
("pg_stat_statements.max", "track_activity_query_size", "track_io_timing"),
(
"pg_stat_statements.max",
"track_activity_query_size",
"track_io_timing",
"shared_preload_libraries",
),
)
rows = cursor.fetchall()
self.pg_settings.clear()
Expand Down
78 changes: 49 additions & 29 deletions postgres/datadog_checks/postgres/statements.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
from datadog_checks.postgres.config_models import InstanceConfig

from .query_calls_cache import QueryCallsCache
from .util import DatabaseConfigurationError, payload_pg_version, warning_with_tags
from .util import (
DatabaseConfigurationError,
parse_shared_preload_libraries,
payload_pg_version,
warning_with_tags,
)
from .version_utils import V9_4, V10, V14

try:
Expand Down Expand Up @@ -424,35 +429,33 @@ def _load_pg_stat_statements(self):

if (isinstance(e, psycopg.errors.ObjectNotInPrerequisiteState)) and 'pg_stat_statements' in str(e):
error_tag = "error:database-{}-pg_stat_statements_not_loaded".format(type(e).__name__)
self._check.record_warning(
DatabaseConfigurationError.pg_stat_statements_not_loaded,
warning_with_tags(
"Unable to collect statement metrics because pg_stat_statements "
"extension is not loaded in database '%s'. "
"See https://docs.datadoghq.com/database_monitoring/setup_postgres/"
"troubleshooting#%s for more details",
self._config.dbname,
DatabaseConfigurationError.pg_stat_statements_not_loaded.value,
host=self._check.reported_hostname,
dbname=self._config.dbname,
code=DatabaseConfigurationError.pg_stat_statements_not_loaded.value,
),
)
self._record_pg_stat_statements_not_loaded()
elif isinstance(e, psycopg.errors.UndefinedTable) and 'pg_stat_statements' in str(e):
error_tag = "error:database-{}-pg_stat_statements_not_created".format(type(e).__name__)
self._check.record_warning(
DatabaseConfigurationError.pg_stat_statements_not_created,
warning_with_tags(
"Unable to collect statement metrics because pg_stat_statements is not created "
"in database '%s'. See https://docs.datadoghq.com/database_monitoring/setup_postgres/"
"troubleshooting#%s for more details",
self._config.dbname,
DatabaseConfigurationError.pg_stat_statements_not_created.value,
host=self._check.reported_hostname,
dbname=self._config.dbname,
code=DatabaseConfigurationError.pg_stat_statements_not_created.value,
),
)
# UndefinedTable fires whether the extension was never CREATEd OR the library was
# never loaded via shared_preload_libraries (you can't CREATE EXTENSION without SPL).
# Inspect pg_settings (populated at connect time) to attribute correctly -- otherwise
# we'd tell the user to `CREATE EXTENSION` when that command will fail until SPL is
# fixed and the server restarted. pg_settings returns an empty string when the
# datadog user lacks pg_monitor and can't read SPL; fall back to `not_created` then.
spl = self._check.pg_settings.get("shared_preload_libraries", "") or ""
if spl and "pg_stat_statements" not in parse_shared_preload_libraries(spl):
error_tag = "error:database-{}-pg_stat_statements_not_loaded".format(type(e).__name__)
self._record_pg_stat_statements_not_loaded()
else:
error_tag = "error:database-{}-pg_stat_statements_not_created".format(type(e).__name__)
self._check.record_warning(
DatabaseConfigurationError.pg_stat_statements_not_created,
warning_with_tags(
"Unable to collect statement metrics because pg_stat_statements is not created "
"in database '%s'. See https://docs.datadoghq.com/database_monitoring/setup_postgres/"
"troubleshooting#%s for more details",
self._config.dbname,
DatabaseConfigurationError.pg_stat_statements_not_created.value,
host=self._check.reported_hostname,
dbname=self._config.dbname,
code=DatabaseConfigurationError.pg_stat_statements_not_created.value,
),
)
else:
self._check.warning(
warning_with_tags(
Expand All @@ -476,6 +479,23 @@ def _load_pg_stat_statements(self):

return []

def _record_pg_stat_statements_not_loaded(self):
    """Record the ``pg_stat_statements_not_loaded`` configuration warning on the check.

    Builds the tagged warning text (database name, troubleshooting anchor,
    reported hostname) and hands it to the check's ``record_warning``
    together with the matching ``DatabaseConfigurationError`` member.
    """
    not_loaded = DatabaseConfigurationError.pg_stat_statements_not_loaded
    check = self._check
    config = self._config
    # The error code's value doubles as the anchor in the troubleshooting URL
    # and as the `code` tag attached to the warning.
    tagged_message = warning_with_tags(
        "Unable to collect statement metrics because pg_stat_statements "
        "extension is not loaded in database '%s'. "
        "See https://docs.datadoghq.com/database_monitoring/setup_postgres/"
        "troubleshooting#%s for more details",
        config.dbname,
        not_loaded.value,
        host=check.reported_hostname,
        dbname=config.dbname,
        code=not_loaded.value,
    )
    check.record_warning(not_loaded, tagged_message)

def _emit_pg_stat_statements_dealloc(self):
if self._check.version < V14:
return
Expand Down
Loading
Loading