From 43bf5ca68c280bd593b46f303ca0291b5640526d Mon Sep 17 00:00:00 2001 From: NouemanKHAL Date: Wed, 3 Jun 2026 16:27:07 +0200 Subject: [PATCH 1/4] glusterfs: require trusted provider for gstatus_path (#23881) * feat(glusterfs): require trusted provider for gstatus_path Co-Authored-By: Claude Sonnet 4.6 * ddev validate all --fix * fix 3rd party licenses * chore(glusterfs): add changelog entry for #23881 Co-Authored-By: Claude Sonnet 4.6 * chore(glusterfs): add changelog for security validation onboarding Co-Authored-By: Claude Sonnet 4.6 * ddev validate licenses --------- Co-authored-by: Claude Sonnet 4.6 --- .github/workflows/config/labeler.yml | 16 ++++++++-------- glusterfs/assets/configuration/spec.yaml | 1 + glusterfs/changelog.d/23881.added | 1 + .../glusterfs/config_models/shared.py | 8 ++++++++ 4 files changed, 18 insertions(+), 8 deletions(-) create mode 100644 glusterfs/changelog.d/23881.added diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml index 0363017ec6cf1..de7ece88baa76 100644 --- a/.github/workflows/config/labeler.yml +++ b/.github/workflows/config/labeler.yml @@ -897,10 +897,6 @@ integration/langchain: - changed-files: - any-glob-to-any-file: - langchain/**/* -integration/lparstats: -- changed-files: - - any-glob-to-any-file: - - lparstats/**/* integration/lastpass: - changed-files: - any-glob-to-any-file: @@ -925,6 +921,10 @@ integration/litellm: - changed-files: - any-glob-to-any-file: - litellm/**/* +integration/lparstats: +- changed-files: + - any-glob-to-any-file: + - lparstats/**/* integration/lustre: - changed-files: - any-glob-to-any-file: @@ -1009,10 +1009,6 @@ integration/nagios: - changed-files: - any-glob-to-any-file: - nagios/**/* -integration/nifi: -- changed-files: - - any-glob-to-any-file: - - nifi/**/* integration/network: - changed-files: - any-glob-to-any-file: @@ -1033,6 +1029,10 @@ integration/nginx_ingress_controller: - changed-files: - any-glob-to-any-file: - nginx_ingress_controller/**/* +integration/nifi: +- changed-files: + - any-glob-to-any-file: + - nifi/**/* integration/ntp: - changed-files: - any-glob-to-any-file: diff --git a/glusterfs/assets/configuration/spec.yaml b/glusterfs/assets/configuration/spec.yaml index 6855e26b76116..9c2da7e1a592d 100644 --- a/glusterfs/assets/configuration/spec.yaml +++ b/glusterfs/assets/configuration/spec.yaml @@ -15,6 +15,7 @@ files: value: type: string example: /embedded/sbin/gstatus + require_trusted_provider: true - template: init_config/default - template: instances options: diff --git a/glusterfs/changelog.d/23881.added b/glusterfs/changelog.d/23881.added new file mode 100644 index 0000000000000..691a43d8733d5 --- /dev/null +++ b/glusterfs/changelog.d/23881.added @@ -0,0 +1 @@ +Add support for security validation in models for the `gstatus_path` configuration option. diff --git a/glusterfs/datadog_checks/glusterfs/config_models/shared.py b/glusterfs/datadog_checks/glusterfs/config_models/shared.py index 73eab6388015c..cd84acb1a3ee7 100644 --- a/glusterfs/datadog_checks/glusterfs/config_models/shared.py +++ b/glusterfs/datadog_checks/glusterfs/config_models/shared.py @@ -19,6 +19,9 @@ from . import defaults, validators +SECURE_FIELD_NAMES = frozenset(['gstatus_path']) + + class SharedConfig(BaseModel): model_config = ConfigDict( validate_default=True, @@ -38,6 +41,11 @@ def _validate(cls, value, info): field_name = field.alias or info.field_name if field_name in info.context['configured_fields']: value = getattr(validators, f'shared_{info.field_name}', identity)(value, field=field) + + if info.field_name in SECURE_FIELD_NAMES: + validation.security.check_field_trusted_provider( + info.field_name, value, info.context.get('security_config') + ) else: value = getattr(defaults, f'shared_{info.field_name}', lambda: value)() From 17f0700ec8f08178656b1d604279055913a83db9 Mon Sep 17 00:00:00 2001 From: Sangeeta Shivaji Rao Date: Wed, 3 Jun 2026 13:03:04 -0400 Subject: [PATCH 2/4] clickhouse: add schema collection (collect_schemas) (#23899) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add ClickHouse schema collection (collect_schemas) Adds catalog metadata collection for DBM Schema Explorer: databases, tables, views, and columns via new ClickhouseMetadata job class. Introduces collect_schemas config block with include/exclude regex filters for databases and tables. Co-Authored-By: Claude Sonnet 4.6 * test(clickhouse): remove per-table size gauge test from schema-collection PR That test checks clickhouse.table.rows/bytes which are emitted by ClickhouseTableMetrics — a feature in the schema-metrics PR, not here. Also remove the schema_metrics fixture config that has no effect in this branch. Co-Authored-By: Claude Sonnet 4.6 * fix(clickhouse): rename changelog entry to match PR number 23899 Co-Authored-By: Claude Sonnet 4.6 * test(clickhouse): remove view refresh metric assertions from schema-collection test clickhouse.view.refresh.status and next_time are emitted by _collect_view_refresh_metrics(), which lives in the schema-metrics PR. Keep only the is_refreshable catalog payload assertion here. Co-Authored-By: Claude Sonnet 4.6 * fix(clickhouse): sync CollectSchemas model formatting with spec.yaml Co-Authored-By: Claude Sonnet 4.6 * fix(clickhouse): validate collect_schemas interval and correct default-off docs - Normalize collect_schemas.collection_interval in _apply_validated_defaults so a 0/negative value is downgraded to the default with a warning instead of raising ZeroDivisionError at rate_limit = 1 / collection_interval. - Fix spec.yaml enabled example (true -> false) and regenerate conf.yaml.example so docs match the actual default-off behavior. - Add regression test for zero/negative collection_interval. Co-Authored-By: Claude Opus 4.8 (1M context) * fix(clickhouse): emit tables and views in a single tables list Collapse the per-database schema payload to one `tables` list instead of separate `tables`/`views` arrays, matching the canonical DBM schema shape used by postgres/mysql/sqlserver (whose backend structs carry only Tables). Views remain identifiable via the `engine` field (View, MaterializedView, LiveView, WindowView) and `is_refreshable`. Co-Authored-By: Claude Opus 4.8 (1M context) * fix(clickhouse): bind schema-filter regex patterns as query parameters User-supplied include/exclude regex patterns were interpolated into the schema-collection SQL with manual quote-escaping. Bind them as clickhouse-connect query parameters ({name:String}) instead, eliminating the SQL-injection surface. Only trusted structural pieces (system-table identifiers, validated integer limits, the constant system-database list) remain interpolated. Co-Authored-By: Claude Opus 4.8 (1M context) * Remove redundant int() casts — typed config already guarantees int Co-Authored-By: Claude Sonnet 4.6 * Remove redundant tuple() and or () guards — defaults applied in config.py Co-Authored-By: Claude Sonnet 4.6 * Remove redundant collect_schemas null check — always initialized with defaults Co-Authored-By: Claude Sonnet 4.6 --------- Co-authored-by: Claude Sonnet 4.6 --- clickhouse/assets/configuration/spec.yaml | 93 +++ clickhouse/changelog.d/23899.added | 1 + .../datadog_checks/clickhouse/clickhouse.py | 19 + .../datadog_checks/clickhouse/config.py | 11 + .../clickhouse/config_models/dict_defaults.py | 15 + .../clickhouse/config_models/instance.py | 18 + .../clickhouse/data/conf.yaml.example | 69 ++ .../datadog_checks/clickhouse/features.py | 2 + .../datadog_checks/clickhouse/metadata.py | 52 ++ .../datadog_checks/clickhouse/schemas.py | 240 +++++++ clickhouse/tests/test_config_defaults.py | 13 + clickhouse/tests/test_metadata.py | 621 ++++++++++++++++++ clickhouse/tests/test_metadata_integration.py | 317 +++++++++ clickhouse/tests/test_unit.py | 15 + 14 files changed, 1486 insertions(+) create mode 100644 clickhouse/changelog.d/23899.added create mode 100644 clickhouse/datadog_checks/clickhouse/metadata.py create mode 100644 clickhouse/datadog_checks/clickhouse/schemas.py create mode 100644 clickhouse/tests/test_metadata.py create mode 100644 clickhouse/tests/test_metadata_integration.py diff --git a/clickhouse/assets/configuration/spec.yaml b/clickhouse/assets/configuration/spec.yaml index 4ee3b90e08490..24dff8cb6f90b 100644 --- a/clickhouse/assets/configuration/spec.yaml +++ b/clickhouse/assets/configuration/spec.yaml @@ -345,6 +345,99 @@ files: type: boolean example: false fleet_configurable: false + - name: collect_schemas + description: | + Configure collection of ClickHouse catalog metadata (databases, + tables, views, columns) for Database Monitoring's Schema Explorer. + Requires `dbm: true`. + options: + - name: enabled + description: | + Enable collection of catalog metadata. When enabled, the agent + polls `system.tables`, `system.columns`, and + `system.view_refreshes` (ClickHouse 24.3+) and emits one + `kind=clickhouse_databases` payload per cycle. + value: + type: boolean + example: false + - name: collection_interval + description: | + Set the schema collection interval (in seconds). Catalog data + changes slowly; 600s (10 min) is a reasonable default. + value: + type: number + example: 600 + - name: max_tables + description: | + Maximum number of tables plus views to collect per cycle + across all databases. + value: + type: integer + example: 300 + - name: max_columns + description: | + Maximum number of columns to collect per table/view. + value: + type: integer + example: 1000 + - name: max_query_duration + description: | + Maximum duration of the schema collection queries in seconds. + Applied to each query via the `max_execution_time` ClickHouse setting. + value: + type: integer + example: 60 + - name: include_databases + description: | + A list of regex patterns to include databases. Any database whose + name matches any one of these patterns will be included. If empty, + all databases (other than those excluded) are included. + value: + type: array + items: + type: string + example: + - "mydb" + - name: exclude_databases + description: | + A list of regex patterns to exclude databases. Any database whose + name matches any one of these patterns will be excluded. The + ClickHouse system databases (`system`, `INFORMATION_SCHEMA`, + `information_schema`) are always excluded regardless of this setting. + value: + type: array + items: + type: string + example: + - "tmp_.*" + - name: include_tables + description: | + A list of regex patterns to include tables. Any table whose name + matches any one of these patterns will be included. If empty, all + tables (other than those excluded) are included. + value: + type: array + items: + type: string + example: + - "events.*" + - name: exclude_tables + description: | + A list of regex patterns to exclude tables. Any table whose name + matches any one of these patterns will be excluded. + value: + type: array + items: + type: string + example: + - ".*_tmp" + - name: run_sync + hidden: true + description: | + Run the metadata collection synchronously. For testing only. + value: + type: boolean + example: false - name: parts_and_merges description: Configure parts and merges monitoring options: diff --git a/clickhouse/changelog.d/23899.added b/clickhouse/changelog.d/23899.added new file mode 100644 index 0000000000000..dc8712e95841d --- /dev/null +++ b/clickhouse/changelog.d/23899.added @@ -0,0 +1 @@ +Add ClickHouse schema collection: catalog payload (databases, tables, views, columns) under collect_schemas with include/exclude regex filters for databases and tables. diff --git a/clickhouse/datadog_checks/clickhouse/clickhouse.py b/clickhouse/datadog_checks/clickhouse/clickhouse.py index 3f5e66416995d..9f022a5887f7e 100644 --- a/clickhouse/datadog_checks/clickhouse/clickhouse.py +++ b/clickhouse/datadog_checks/clickhouse/clickhouse.py @@ -17,6 +17,7 @@ from .__about__ import __version__ from .config import build_config, sanitize from .health import ClickhouseHealth, HealthEvent, HealthStatus +from .metadata import ClickhouseMetadata from .parts_and_merges import ClickhousePartsAndMerges from .query_completions import ClickhouseQueryCompletions from .query_errors import ClickhouseQueryErrors @@ -122,6 +123,12 @@ def _init_dbm_components(self): else: self.query_errors = None + # Initialize schema collection (catalog metadata for Schema Explorer) + if self._config.dbm and self._config.collect_schemas.enabled: + self.metadata = ClickhouseMetadata(self) + else: + self.metadata = None + # Initialize parts and merges monitoring (from system.parts, merges, mutations, replication_queue) if self._config.dbm and self._config.parts_and_merges.enabled: self.parts_and_merges = ClickhousePartsAndMerges(self, self._config.parts_and_merges) @@ -260,6 +267,10 @@ def check(self, _): if self.query_errors: self.query_errors.run_job_loop(self.tags) + # Run schema collection if enabled + if self.metadata: + self.metadata.run_job_loop(self.tags) + # Run parts and merges monitoring if enabled if self.parts_and_merges: self.parts_and_merges.run_job_loop(self.tags) @@ -364,6 +375,10 @@ def database_identifier(self) -> str: self._database_identifier = template.safe_substitute(**tag_dict) return self._database_identifier + @property + def dbms(self) -> str: + return "clickhouse" + @property def dbms_version(self) -> str: """Get the ClickHouse server version.""" @@ -525,6 +540,8 @@ def cancel(self): self.query_completions.cancel() if self.query_errors: self.query_errors.cancel() + if self.metadata: + self.metadata.cancel() if self.parts_and_merges: self.parts_and_merges.cancel() @@ -537,6 +554,8 @@ def cancel(self): self.query_completions._job_loop_future.result() if self.query_errors and self.query_errors._job_loop_future: self.query_errors._job_loop_future.result() + if self.metadata and self.metadata._job_loop_future: + self.metadata._job_loop_future.result() if self.parts_and_merges and self.parts_and_merges._job_loop_future: self.parts_and_merges._job_loop_future.result() diff --git a/clickhouse/datadog_checks/clickhouse/config.py b/clickhouse/datadog_checks/clickhouse/config.py index e20b63832c8ed..b425b2936e2a5 100644 --- a/clickhouse/datadog_checks/clickhouse/config.py +++ b/clickhouse/datadog_checks/clickhouse/config.py @@ -128,6 +128,10 @@ def build_config(check: ClickhouseCheck) -> Tuple[InstanceConfig, ValidationResu **dict_defaults.instance_parts_and_merges().model_dump(), **(instance.get('parts_and_merges', {})), }, + "collect_schemas": { + **dict_defaults.instance_collect_schemas().model_dump(), + **(instance.get('collect_schemas', {})), + }, # Tags - ensure we have a list, not None "tags": list(instance.get('tags', [])), # Other settings @@ -224,6 +228,13 @@ def _apply_validated_defaults(args: dict, instance: dict, validation_result: Val f"parts_and_merges.collection_interval must be greater than 0, defaulting to {default_value} seconds." ) + if _safefloat(args.get('collect_schemas', {}).get('collection_interval')) <= 0: + default_value = dict_defaults.instance_collect_schemas().collection_interval + args['collect_schemas']['collection_interval'] = default_value + validation_result.add_warning( + f"collect_schemas.collection_interval must be greater than 0, defaulting to {default_value} seconds." + ) + _pm_defaults = dict_defaults.instance_parts_and_merges() for _field in ( 'max_parts_rows', diff --git a/clickhouse/datadog_checks/clickhouse/config_models/dict_defaults.py b/clickhouse/datadog_checks/clickhouse/config_models/dict_defaults.py index 3082d0cc60f2d..e7fe5bbec7b5e 100644 --- a/clickhouse/datadog_checks/clickhouse/config_models/dict_defaults.py +++ b/clickhouse/datadog_checks/clickhouse/config_models/dict_defaults.py @@ -57,6 +57,21 @@ def instance_query_errors(): ) +def instance_collect_schemas(): + return instance.CollectSchemas( + enabled=False, + collection_interval=600, + max_tables=300, + max_columns=1000, + max_query_duration=60, + include_databases=(), + exclude_databases=(), + include_tables=(), + exclude_tables=(), + run_sync=False, + ) + + def instance_parts_and_merges(): return instance.PartsAndMerges( enabled=True, diff --git a/clickhouse/datadog_checks/clickhouse/config_models/instance.py b/clickhouse/datadog_checks/clickhouse/config_models/instance.py index 4bec26f4cd289..3bc35a160aa02 100644 --- a/clickhouse/datadog_checks/clickhouse/config_models/instance.py +++ b/clickhouse/datadog_checks/clickhouse/config_models/instance.py @@ -23,6 +23,23 @@ SECURE_FIELD_NAMES = frozenset(['tls_ca_cert']) +class CollectSchemas(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + collection_interval: Optional[float] = None + enabled: Optional[bool] = None + exclude_databases: Optional[tuple[str, ...]] = None + exclude_tables: Optional[tuple[str, ...]] = None + include_databases: Optional[tuple[str, ...]] = None + include_tables: Optional[tuple[str, ...]] = None + max_columns: Optional[int] = None + max_query_duration: Optional[int] = None + max_tables: Optional[int] = None + run_sync: Optional[bool] = None + + class CustomQuery(BaseModel): model_config = ConfigDict( arbitrary_types_allowed=True, @@ -127,6 +144,7 @@ class InstanceConfig(BaseModel): arbitrary_types_allowed=True, frozen=True, ) + collect_schemas: Optional[CollectSchemas] = None compression: Optional[str] = None connect_timeout: Optional[int] = None custom_queries: Optional[tuple[CustomQuery, ...]] = None diff --git a/clickhouse/datadog_checks/clickhouse/data/conf.yaml.example b/clickhouse/datadog_checks/clickhouse/data/conf.yaml.example index 279e5d96ec433..6d189b4615304 100644 --- a/clickhouse/datadog_checks/clickhouse/data/conf.yaml.example +++ b/clickhouse/datadog_checks/clickhouse/data/conf.yaml.example @@ -214,6 +214,75 @@ instances: # # samples_per_hour_per_query: 60 + ## Configure collection of ClickHouse catalog metadata (databases, + ## tables, views, columns) for Database Monitoring's Schema Explorer. + ## Requires `dbm: true`. + # + # collect_schemas: + + ## @param enabled - boolean - optional - default: false + ## Enable collection of catalog metadata. When enabled, the agent + ## polls `system.tables`, `system.columns`, and + ## `system.view_refreshes` (ClickHouse 24.3+) and emits one + ## `kind=clickhouse_databases` payload per cycle. + # + # enabled: false + + ## @param collection_interval - number - optional - default: 600 + ## Set the schema collection interval (in seconds). Catalog data + ## changes slowly; 600s (10 min) is a reasonable default. + # + # collection_interval: 600 + + ## @param max_tables - integer - optional - default: 300 + ## Maximum number of tables plus views to collect per cycle + ## across all databases. + # + # max_tables: 300 + + ## @param max_columns - integer - optional - default: 1000 + ## Maximum number of columns to collect per table/view. + # + # max_columns: 1000 + + ## @param max_query_duration - integer - optional - default: 60 + ## Maximum duration of the schema collection queries in seconds. + ## Applied to each query via the `max_execution_time` ClickHouse setting. + # + # max_query_duration: 60 + + ## @param include_databases - list of strings - optional + ## A list of regex patterns to include databases. Any database whose + ## name matches any one of these patterns will be included. If empty, + ## all databases (other than those excluded) are included. + # + # include_databases: + # - mydb + + ## @param exclude_databases - list of strings - optional + ## A list of regex patterns to exclude databases. Any database whose + ## name matches any one of these patterns will be excluded. The + ## ClickHouse system databases (`system`, `INFORMATION_SCHEMA`, + ## `information_schema`) are always excluded regardless of this setting. + # + # exclude_databases: + # - tmp_.* + + ## @param include_tables - list of strings - optional + ## A list of regex patterns to include tables. Any table whose name + ## matches any one of these patterns will be included. If empty, all + ## tables (other than those excluded) are included. + # + # include_tables: + # - events.* + + ## @param exclude_tables - list of strings - optional + ## A list of regex patterns to exclude tables. Any table whose name + ## matches any one of these patterns will be excluded. + # + # exclude_tables: + # - .*_tmp + ## Configure parts and merges monitoring # # parts_and_merges: diff --git a/clickhouse/datadog_checks/clickhouse/features.py b/clickhouse/datadog_checks/clickhouse/features.py index 5fd61bdae93cf..6eb705a785d0c 100644 --- a/clickhouse/datadog_checks/clickhouse/features.py +++ b/clickhouse/datadog_checks/clickhouse/features.py @@ -23,6 +23,7 @@ class FeatureKey(Enum): QUERY_COMPLETIONS = "query_completions" EXPLAIN_PLANS = "explain_plans" QUERY_ERRORS = "query_errors" + COLLECT_SCHEMAS = "collect_schemas" PARTS_AND_MERGES = "parts_and_merges" SINGLE_ENDPOINT_MODE = "single_endpoint_mode" @@ -34,6 +35,7 @@ class FeatureKey(Enum): FeatureKey.QUERY_COMPLETIONS: 'Query Completions', FeatureKey.QUERY_ERRORS: 'Query Errors', FeatureKey.EXPLAIN_PLANS: 'Explain Plans', + FeatureKey.COLLECT_SCHEMAS: 'Collect Schemas', FeatureKey.PARTS_AND_MERGES: 'Parts and Merges', FeatureKey.SINGLE_ENDPOINT_MODE: 'Single Endpoint Mode', } diff --git a/clickhouse/datadog_checks/clickhouse/metadata.py b/clickhouse/datadog_checks/clickhouse/metadata.py new file mode 100644 index 0000000000000..6b09a4f0072f6 --- /dev/null +++ b/clickhouse/datadog_checks/clickhouse/metadata.py @@ -0,0 +1,52 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from __future__ import annotations + +from typing import TYPE_CHECKING + +from clickhouse_connect.driver.exceptions import DatabaseError + +if TYPE_CHECKING: + from datadog_checks.clickhouse import ClickhouseCheck + +from datadog_checks.base.utils.db.utils import DBMAsyncJob +from datadog_checks.base.utils.tracking import tracked_method + +from .schemas import ClickhouseSchemaCollector + + +def agent_check_getter(self): + return self._check + + +class ClickhouseMetadata(DBMAsyncJob): + """Top-level DBM job that drives the schema collector on its configured cadence.""" + + def __init__(self, check: ClickhouseCheck): + collection_interval = check._config.collect_schemas.collection_interval + super(ClickhouseMetadata, self).__init__( + check, + rate_limit=1 / collection_interval, + run_sync=check._config.collect_schemas.run_sync, + enabled=check._config.collect_schemas.enabled, + dbms='clickhouse', + min_collection_interval=check._config.min_collection_interval, + expected_db_exceptions=(DatabaseError,), + job_name='clickhouse-metadata', + ) + self._check = check + self._collection_interval = collection_interval + self._schema_collector = ClickhouseSchemaCollector(check) + self._schema_collector._cancel_event = self._cancel_event + + def cancel(self): + super(ClickhouseMetadata, self).cancel() + self._schema_collector.close() + + @tracked_method(agent_check_getter=agent_check_getter) + def run_job(self): + try: + self._schema_collector.collect_schemas() + except Exception: + self._log.exception("Schema collection failed") diff --git a/clickhouse/datadog_checks/clickhouse/schemas.py b/clickhouse/datadog_checks/clickhouse/schemas.py new file mode 100644 index 0000000000000..0512fa8217cb0 --- /dev/null +++ b/clickhouse/datadog_checks/clickhouse/schemas.py @@ -0,0 +1,240 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from __future__ import annotations + +import contextlib +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from datadog_checks.clickhouse import ClickhouseCheck + +from datadog_checks.base.utils.db.schemas import SchemaCollector, SchemaCollectorConfig + +_SYSTEM_DATABASE_NAMES = ("'system'", "'INFORMATION_SCHEMA'", "'information_schema'") + +# Single stub so the base-class loop runs exactly once; actual database names +# come from the `database` column of each cursor row. +_CLUSTER_STUB = {'name': '_cluster_'} + +_TABLES_COLUMNS_QUERY = """\ +WITH +tables AS ( + SELECT + database, + name, + engine, + toString(uuid) AS uuid, + create_table_query, + sorting_key, + partition_key, + primary_key, + sampling_key, + toInt64(toUnixTimestamp(metadata_modification_time)) AS metadata_modified_at + FROM {tables_table} + WHERE database NOT IN ({system_dbs}) + {db_filters} + {table_filters} + ORDER BY database, name + LIMIT 1 BY (database, name) + LIMIT {max_tables} +), +columns AS ( + SELECT database, table, name, type, default_expression, comment, toInt32(position) AS position + FROM {columns_table} + WHERE (database, table) IN (SELECT database, name FROM tables) + ORDER BY database, table, position + LIMIT 1 BY (database, table, name) + LIMIT {limit_columns} +) +SELECT + t.database, + t.name, + t.engine, + t.uuid, + t.create_table_query, + t.sorting_key, + t.partition_key, + t.primary_key, + t.sampling_key, + t.metadata_modified_at, + groupArrayIf({max_columns})( + tuple(c.name, c.type, c.default_expression, c.comment, c.position), + c.name IS NOT NULL + ) AS columns +FROM tables t +LEFT JOIN columns c ON t.database = c.database AND t.name = c.table +GROUP BY + t.database, t.name, t.engine, t.uuid, t.create_table_query, + t.sorting_key, t.partition_key, t.primary_key, t.sampling_key, + t.metadata_modified_at +ORDER BY t.database, t.name +""" + + +class ClickhouseSchemaCollectorConfig(SchemaCollectorConfig): + max_tables: int + max_columns: int + max_query_duration: int + include_databases: tuple[str, ...] + exclude_databases: tuple[str, ...] + include_tables: tuple[str, ...] + exclude_tables: tuple[str, ...] + + +class ClickhouseSchemaCollector(SchemaCollector): + """Collects ClickHouse schema metadata via a single CTE query per cycle.""" + + _check: ClickhouseCheck + _config: ClickhouseSchemaCollectorConfig + + def __init__(self, check: ClickhouseCheck): + config = ClickhouseSchemaCollectorConfig() + config.collection_interval = check._config.collect_schemas.collection_interval + config.max_tables = check._config.collect_schemas.max_tables + config.max_columns = check._config.collect_schemas.max_columns + config.max_query_duration = check._config.collect_schemas.max_query_duration + config.include_databases = check._config.collect_schemas.include_databases + config.exclude_databases = check._config.collect_schemas.exclude_databases + config.include_tables = check._config.collect_schemas.include_tables + config.exclude_tables = check._config.collect_schemas.exclude_tables + + super().__init__(check, config) + self._db_client = None + self._cancel_event = None + + @property + def kind(self) -> str: + return 'clickhouse_databases' + + @property + def base_event(self) -> dict[str, Any]: + event = super().base_event + event['collector_id'] = self._check.check_id + return event + + def close(self) -> None: + if self._db_client: + try: + self._db_client.close() + except Exception as e: + self._log.debug("Error closing schema collector client: %s", e) + self._db_client = None + + def _check_cancelled(self) -> None: + if self._cancel_event is not None and self._cancel_event.is_set(): + raise Exception("Job loop cancelled. Aborting query.") + + def _get_databases(self) -> list[dict[str, str]]: + return [_CLUSTER_STUB] + + @contextlib.contextmanager + def _get_cursor(self, _database_name: str): + self._db_client = self._check.create_dbm_client() + self._db_client.set_client_setting('max_execution_time', self._config.max_query_duration) + try: + db_filters, db_params = _build_match_clauses( + 'database', self._config.include_databases, self._config.exclude_databases, 'db' + ) + table_filters, table_params = _build_match_clauses( + 'name', self._config.include_tables, self._config.exclude_tables, 'table' + ) + + # Only structural pieces (trusted system-table identifiers and + # validated integer limits) are interpolated; user-supplied regex + # patterns are bound as query parameters below. + fmt = { + 'tables_table': self._check.get_system_table('tables'), + 'columns_table': self._check.get_system_table('columns'), + 'system_dbs': ", ".join(_SYSTEM_DATABASE_NAMES), + 'max_tables': self._config.max_tables, + 'max_columns': self._config.max_columns, + 'limit_columns': self._config.max_tables * self._config.max_columns, + 'db_filters': db_filters, + 'table_filters': table_filters, + } + query_parameters = {**db_params, **table_params} + self._check_cancelled() + with self._db_client.query_rows_stream( + _TABLES_COLUMNS_QUERY.format(**fmt), parameters=query_parameters + ) as stream: + yield stream + finally: + self.close() + + def _get_next(self, cursor) -> tuple | None: + return next(cursor, None) + + def _map_row(self, _database: dict[str, str], cursor_row: tuple) -> dict[str, Any]: + # Tables and views are emitted in a single `tables` list, matching the + # canonical DBM schema payload shape (postgres/mysql/sqlserver). The + # `engine` field on each item distinguishes views (View, MaterializedView, + # LiveView, WindowView) from regular tables. + actual_db_name = cursor_row[0] + return {'name': actual_db_name, 'tables': [self._build_item(cursor_row)]} + + def _build_item(self, row: tuple) -> dict[str, Any]: + ( + database, + name, + engine, + uuid_str, + create_query, + sorting_key, + partition_key, + primary_key, + sampling_key, + metadata_modified_at, + raw_columns, + ) = row + cols = [ + { + 'name': col[0], + 'type': col[1], + 'default': col[2] or '', + 'comment': col[3] or '', + 'position': int(col[4] or 0), + } + for col in (raw_columns or []) + ] + return { + 'name': name, + 'engine': engine, + 'uuid': uuid_str, + 'sorting_key': sorting_key or '', + 'partition_key': partition_key or '', + 'primary_key': primary_key or '', + 'sampling_key': sampling_key or '', + 'create_query': create_query, + 'columns': cols, + 'metadata_modified_at': int(metadata_modified_at or 0), + 'is_refreshable': 'REFRESH' in (create_query or '').upper(), + } + + +def _build_match_clauses( + column: str, + include_patterns: tuple[str, ...], + exclude_patterns: tuple[str, ...], + param_prefix: str, +) -> tuple[str, dict[str, str]]: + """Build regex match clauses using bound query parameters rather than string + interpolation, so user-supplied patterns cannot be used for SQL injection. + + Returns the clause text (with ``{name:String}`` placeholders) and the dict of + parameter values to pass to ``query_rows_stream(parameters=...)``. + """ + clauses: list[str] = [] + params: dict[str, str] = {} + for i, pattern in enumerate(exclude_patterns): + key = f'{param_prefix}_exclude_{i}' + clauses.append(f"AND NOT match({column}, {{{key}:String}})") + params[key] = pattern + if include_patterns: + ors = [] + for i, pattern in enumerate(include_patterns): + key = f'{param_prefix}_include_{i}' + ors.append(f"match({column}, {{{key}:String}})") + params[key] = pattern + clauses.append(f"AND ({' OR '.join(ors)})") + return "\n ".join(clauses), params diff --git a/clickhouse/tests/test_config_defaults.py b/clickhouse/tests/test_config_defaults.py index 40929dd30e726..9a600f1d1f39b 100644 --- a/clickhouse/tests/test_config_defaults.py +++ b/clickhouse/tests/test_config_defaults.py @@ -70,6 +70,19 @@ 'max_samples_per_collection': 1000, 'run_sync': False, }, + # === DBM: Schema collector === + 'collect_schemas': { + 'enabled': False, + 'collection_interval': 600, + 'max_tables': 300, + 'max_columns': 1000, + 'max_query_duration': 60, + 'include_databases': (), + 'exclude_databases': (), + 'include_tables': (), + 'exclude_tables': (), + 'run_sync': False, + }, # === DBM: Parts and merges === 'parts_and_merges': { 'enabled': True, diff --git a/clickhouse/tests/test_metadata.py b/clickhouse/tests/test_metadata.py new file mode 100644 index 0000000000000..1e05dd37997be --- /dev/null +++ b/clickhouse/tests/test_metadata.py @@ -0,0 +1,621 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import contextlib +import json +import threading +from unittest import mock + +import pytest + +from datadog_checks.clickhouse import ClickhouseCheck +from datadog_checks.clickhouse.metadata import ClickhouseMetadata +from datadog_checks.clickhouse.schemas import ( + ClickhouseSchemaCollector, + _build_match_clauses, +) + +pytestmark = pytest.mark.unit + + +def _column_row(name='id', type_='UInt64', default='', comment='', position=1): + """Column tuple as returned by groupArrayIf in the combined CTE query.""" + return (name, type_, default, comment, position) + + +def _table_row( + database='default', + name='events', + engine='MergeTree', + uuid_str='uuid-1', + create_query='CREATE TABLE default.events (id UInt64) ENGINE = MergeTree ORDER BY id', + sorting_key='id', + partition_key='', + primary_key='id', + sampling_key='', + metadata_modified_at=1700000000, + columns=None, +): + return ( + database, + name, + engine, + uuid_str, + create_query, + sorting_key, + partition_key, + primary_key, + sampling_key, + metadata_modified_at, + columns or [], + ) + + +def _view_row( + database='default', + name='events_mv', + engine='MaterializedView', + uuid_str='uuid-2', + create_query='CREATE MATERIALIZED VIEW default.events_mv TO default.events_target AS SELECT * FROM default.events', + metadata_modified_at=1700001000, + columns=None, +): + return _table_row( + database=database, + name=name, + engine=engine, + uuid_str=uuid_str, + create_query=create_query, + sorting_key='', + partition_key='', + primary_key='', + sampling_key='', + metadata_modified_at=metadata_modified_at, + columns=columns, + ) + + +@pytest.fixture +def collect_schemas_instance(): + return { + 'server': 'localhost', + 'port': 9000, + 'username': 'default', + 'password': '', + 'db': 'default', + 'dbm': True, + 'collect_schemas': { + 'enabled': True, + 'collection_interval': 600, + 'max_tables': 5000, + 'max_columns': 1000, + 'run_sync': True, + }, + 'tags': ['test:clickhouse'], + } + + +@pytest.fixture +def check(collect_schemas_instance): + return ClickhouseCheck('clickhouse', {}, [collect_schemas_instance]) + + +@pytest.fixture +def collector(check) -> ClickhouseSchemaCollector: + return check.metadata._schema_collector + + +def _make_query_result(rows): + rows = list(rows) + result = mock.MagicMock() + result.result_set = rows + result.result_rows = rows + result.column_names = ('c0',) + result.column_types = () + result.summary = {} + return result + + +@contextlib.contextmanager +def _patch_query(collector, table_rows=None): + """Mocks the DBM client for the combined tables+columns CTE query.""" + table_rows = table_rows or [] + + @contextlib.contextmanager + def fake_stream(query, *args, **kwargs): + yield iter(table_rows) + + mock_client = mock.MagicMock() + mock_client.query_rows_stream.side_effect = fake_stream + + with mock.patch.object(collector._check, 'create_dbm_client', return_value=mock_client): + yield mock_client + + +def _capture_payloads(check): + captured: list[dict] = [] + check.database_monitoring_metadata = lambda raw: captured.append(json.loads(raw)) + check.gauge = lambda *a, **kw: None + return captured + + +def _run_collect(check, table_rows=None): + captured = _capture_payloads(check) + with _patch_query(check.metadata._schema_collector, table_rows): + check.metadata._schema_collector.collect_schemas() + return captured + + +@contextlib.contextmanager +def _capture_all_queries(collector): + """Records every SQL string sent through the DBM client.""" + seen: list[str] = [] + + def fake_client_query(query, *args, **kwargs): + seen.append(query) + return _make_query_result([]) + + @contextlib.contextmanager + def fake_stream(query, *args, **kwargs): + seen.append(query) + yield iter([]) + + mock_client = mock.MagicMock() + mock_client.query.side_effect = fake_client_query + mock_client.query_rows_stream.side_effect = fake_stream + + with mock.patch.object(collector._check, 'create_dbm_client', return_value=mock_client): + yield seen + + +@contextlib.contextmanager +def _capture_query_params(collector): + """Records (query, parameters) for each query_rows_stream call.""" + calls: list[tuple[str, dict]] = [] + + @contextlib.contextmanager + def fake_stream(query, *args, **kwargs): + calls.append((query, kwargs.get('parameters') or {})) + yield iter([]) + + mock_client = mock.MagicMock() + mock_client.query_rows_stream.side_effect = fake_stream + + with mock.patch.object(collector._check, 'create_dbm_client', return_value=mock_client): + yield calls + + +def test_initialization(check): + assert isinstance(check.metadata, ClickhouseMetadata) + assert isinstance(check.metadata._schema_collector, ClickhouseSchemaCollector) + assert check.metadata._collection_interval == 600 + assert check.metadata._schema_collector._config.max_tables == 5000 + assert check.metadata._schema_collector._config.max_columns == 1000 + + +def test_kind(collector): + assert collector.kind == 'clickhouse_databases' + + +def test_init_collection_interval_omitted_uses_default(): + check = ClickhouseCheck( + 'clickhouse', + {}, + [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': True}}], + ) + assert check.metadata is not None + assert check.metadata._collection_interval == 600 + + +def test_init_max_tables_omitted_uses_default(): + check = ClickhouseCheck( + 'clickhouse', + {}, + [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': True}}], + ) + assert check.metadata._schema_collector._config.max_tables == 300 + + +def test_init_max_query_duration_omitted_uses_default(): + check = ClickhouseCheck( + 'clickhouse', + {}, + [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': True}}], + ) + assert check.metadata._schema_collector._config.max_query_duration == 60 + + +def test_init_filters_omitted_default_to_empty_tuples(): + check = ClickhouseCheck( + 'clickhouse', + {}, + [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': True}}], + ) + cfg = check.metadata._schema_collector._config + assert cfg.include_databases == () + assert cfg.exclude_databases == () + assert cfg.include_tables == () + assert cfg.exclude_tables == () + + +def test_disabled_when_dbm_off(): + check = ClickhouseCheck( + 'clickhouse', + {}, + [{'server': 'localhost', 'dbm': False, 'collect_schemas': {'enabled': True, 'collection_interval': 600}}], + ) + assert check.metadata is None + + +def test_disabled_by_default_when_dbm_on(): + check = ClickhouseCheck('clickhouse', {}, [{'server': 'localhost', 'dbm': True}]) + assert check.metadata is None + + +def test_disabled_when_explicitly_opted_out(): + check = ClickhouseCheck( + 'clickhouse', + {}, + [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': False, 'collection_interval': 600}}], + ) + assert check.metadata is None + + +def test_collect_emits_single_payload_when_small(check): + payloads = _run_collect( + check, + table_rows=[ + _table_row(name='events', columns=[_column_row(name='id')]), + _view_row(name='events_mv'), + ], + ) + assert len(payloads) == 1 + p = payloads[0] + assert p['kind'] == 'clickhouse_databases' + assert p['dbms'] == 'clickhouse' + assert p['collection_payloads_count'] == 1 + assert p['collection_started_at'] > 0 + assert 'host' in p + assert 'collector_id' in p + + +def test_collect_payload_tables_list_includes_views(check): + payloads = _run_collect( + check, + table_rows=[ + _table_row(name='events'), + _view_row( + name='events_mv', + create_query=( + 'CREATE MATERIALIZED VIEW default.events_mv' + ' REFRESH EVERY 1 HOUR TO default.events_target' + ' AS SELECT * FROM default.events' + ), + ), + ], + ) + dbs = payloads[0]['metadata'] + # Tables and views share a single `tables` list; views are identified by engine. + items = {t['name']: t for db in dbs for t in db['tables']} + assert 'events' in items + assert 'events_mv' in items + assert items['events_mv']['engine'] == 'MaterializedView' + assert items['events_mv']['is_refreshable'] is True + + +@pytest.mark.parametrize('engine', ['View', 'LiveView', 'WindowView']) +def test_collect_view_engines_appear_in_tables_list(check, engine): + payloads = _run_collect(check, table_rows=[_view_row(name='some_view', engine=engine)]) + dbs = payloads[0]['metadata'] + items = [t for db in dbs for t in db['tables']] + assert [t['name'] for t in items] == ['some_view'] + assert items[0]['engine'] == engine + + +def test_collect_dedupes_replica_rows_via_sql(check): + payloads = _run_collect(check, table_rows=[_table_row(name='events')]) + dbs = payloads[0]['metadata'] + table_names = [t['name'] for db in dbs for t in db['tables']] + assert table_names == ['events'] + + +def test_collect_emits_empty_snapshot_marker_when_no_tables(check): + # An empty run still emits one terminal payload so the backend receives a + # snapshot marker (collection_payloads_count) and can clear stale state. + payloads = _run_collect(check, table_rows=[]) + assert len(payloads) == 1 + assert payloads[0]['metadata'] == [] + assert payloads[0]['collection_payloads_count'] == 1 + + +def test_collect_marks_view_refreshable_based_on_create_query(check): + payloads = _run_collect( + check, + table_rows=[ + _view_row( + name='refreshable_mv', + create_query=( + 'CREATE MATERIALIZED VIEW default.refreshable_mv' + ' REFRESH EVERY 1 HOUR TO default.target' + ' AS SELECT * FROM default.src' + ), + ), + _view_row( + name='vanilla_view', + engine='View', + create_query='CREATE VIEW default.vanilla_view AS SELECT 1', + ), + ], + ) + by_name = {t['name']: t for db in payloads[0]['metadata'] for t in db['tables']} + assert by_name['refreshable_mv']['is_refreshable'] is True + assert by_name['vanilla_view']['is_refreshable'] is False + + +def test_collect_columns_attached_to_correct_parent(check): + payloads = _run_collect( + check, + table_rows=[ + _table_row(name='events', columns=[_column_row(name='id')]), + _view_row(name='events_mv', columns=[_column_row(name='count', type_='UInt64')]), + ], + ) + dbs = payloads[0]['metadata'] + table = next(t for db in dbs for t in db['tables'] if t['name'] == 'events') + view = next(t for db in dbs for t in db['tables'] if t['name'] == 'events_mv') + assert [c['name'] for c in table['columns']] == ['id'] + assert [c['name'] for c in view['columns']] == ['count'] + + +def test_collect_chunks_when_payload_chunk_size_exceeded(check): + check.metadata._schema_collector._config.payload_chunk_size = 5 + rows = [_table_row(name=f'big_{i}') for i in range(12)] + + payloads = _run_collect(check, table_rows=rows) + + assert len(payloads) >= 2 + emitted_total = sum(len(db['tables']) for p in payloads for db in p['metadata']) + assert emitted_total == 12 + + +def test_collect_collection_payloads_count_only_on_last(check): + check.metadata._schema_collector._config.payload_chunk_size = 5 + rows = [_table_row(name=f'big_{i}') for i in range(12)] + + payloads = _run_collect(check, table_rows=rows) + + for intermediate in payloads[:-1]: + assert 'collection_payloads_count' not in intermediate + assert payloads[-1]['collection_payloads_count'] == len(payloads) + + +def test_collect_all_chunks_share_collection_started_at(check): + check.metadata._schema_collector._config.payload_chunk_size = 5 + rows = [_table_row(name=f'big_{i}') for i in range(12)] + + payloads = _run_collect(check, table_rows=rows) + + started_ats = {p['collection_started_at'] for p in payloads} + assert len(started_ats) == 1 + + +def test_cancel_closes_db_client(check): + fake_client = mock.MagicMock() + check.metadata._schema_collector._db_client = fake_client + + check.metadata.cancel() + + assert check.metadata._schema_collector._db_client is None + fake_client.close.assert_called_once() + + +def test_combined_query_dedupes_replicas_before_limit(check): + _capture_payloads(check) + with _capture_all_queries(check.metadata._schema_collector) as seen_queries: + check.metadata._schema_collector.collect_schemas() + + combined_query = next(q for q in seen_queries if 'FROM system.tables' in q) + dedup_idx = combined_query.find('LIMIT 1 BY (database, name)') + outer_limit_idx = combined_query.find('LIMIT 5000') + assert dedup_idx >= 0 + assert outer_limit_idx >= 0 + assert dedup_idx < outer_limit_idx + + +def test_combined_query_joins_columns_and_caps_per_table(check): + _capture_payloads(check) + with _capture_all_queries(check.metadata._schema_collector) as seen_queries: + check.metadata._schema_collector.collect_schemas() + + combined_query = next(q for q in seen_queries if 'FROM system.columns' in q) + assert '(database, table) IN (' in combined_query + assert 'FROM system.tables' in combined_query + assert 'LIMIT 1 BY (database, name)' in combined_query + assert 'LIMIT 1 BY (database, table, name)' in combined_query + # limit_columns = max_tables * max_columns = 5000 * 1000 + assert 'LIMIT 5000000' in combined_query + # per-table cap via groupArrayIf + assert 'groupArrayIf(1000)' in combined_query + + +def test_collect_routes_through_cluster_all_replicas_in_single_endpoint_mode(collect_schemas_instance): + collect_schemas_instance['single_endpoint_mode'] = True + check = ClickhouseCheck('clickhouse', {}, [collect_schemas_instance]) + _capture_payloads(check) + with _capture_all_queries(check.metadata._schema_collector) as seen_queries: + check.metadata._schema_collector.collect_schemas() + + joined = '\n'.join(seen_queries) + assert "clusterAllReplicas('default', system.tables)" in joined + assert "clusterAllReplicas('default', system.columns)" in joined + + +def test_build_match_clauses_empty_returns_empty_string(): + assert _build_match_clauses('database', (), (), 'db') == ('', {}) + + +def test_build_match_clauses_excludes_only(): + out, params = _build_match_clauses('database', (), ('tmp_.*', 'shadow_.*'), 'db') + assert "AND NOT match(database, {db_exclude_0:String})" in out + assert "AND NOT match(database, {db_exclude_1:String})" in out + assert params == {'db_exclude_0': 'tmp_.*', 'db_exclude_1': 'shadow_.*'} + + +def test_build_match_clauses_includes_become_or_disjunction(): + out, params = _build_match_clauses('name', ('events.*', 'orders.*'), (), 'table') + assert "AND (match(name, {table_include_0:String}) OR match(name, {table_include_1:String}))" in out + assert params == {'table_include_0': 'events.*', 'table_include_1': 'orders.*'} + + +def test_build_match_clauses_single_include_pattern(): + out, params = _build_match_clauses('name', ('only_one.*',), (), 'table') + assert "AND (match(name, {table_include_0:String}))" in out + assert " OR " not in out + assert params == {'table_include_0': 'only_one.*'} + + +def test_build_match_clauses_excludes_appear_before_includes(): + out, _ = _build_match_clauses('database', ('keep_.*',), ('drop_.*',), 'db') + exclude_idx = out.find("AND NOT match(database, {db_exclude_0:String})") + include_idx = out.find("AND (match(database, {db_include_0:String}))") + assert exclude_idx >= 0 and include_idx >= 0 + assert exclude_idx < include_idx + + +def test_build_match_clauses_combines_includes_and_excludes(): + out, params = _build_match_clauses('database', ('keep_.*',), ('drop_.*',), 'db') + assert "AND NOT match(database, {db_exclude_0:String})" in out + assert "AND (match(database, {db_include_0:String}))" in out + assert params == {'db_exclude_0': 'drop_.*', 'db_include_0': 'keep_.*'} + + +def test_build_match_clauses_passes_pattern_verbatim_as_parameter(): + # SQL-injection guard: a pattern containing a quote is bound as a parameter + # value, not escaped/interpolated into the SQL text. + out, params = _build_match_clauses('database', (), ("o'reilly_.*",), 'db') + assert out == "AND NOT match(database, {db_exclude_0:String})" + assert params == {'db_exclude_0': "o'reilly_.*"} + + +def test_database_filters_appear_in_combined_query(collect_schemas_instance): + collect_schemas_instance['collect_schemas']['exclude_databases'] = ['tmp_.*'] + collect_schemas_instance['collect_schemas']['include_databases'] = ['keep_.*'] + check = ClickhouseCheck('clickhouse', {}, [collect_schemas_instance]) + _capture_payloads(check) + with _capture_query_params(check.metadata._schema_collector) as calls: + check.metadata._schema_collector.collect_schemas() + + combined_query, params = next((q, p) for q, p in calls if 'FROM system.tables' in q) + assert "AND NOT match(database, {db_exclude_0:String})" in combined_query + assert "AND (match(database, {db_include_0:String}))" in combined_query + assert params['db_exclude_0'] == 'tmp_.*' + assert params['db_include_0'] == 'keep_.*' + + +def test_table_filters_appear_in_combined_query(collect_schemas_instance): + collect_schemas_instance['collect_schemas']['include_tables'] = ['events.*'] + collect_schemas_instance['collect_schemas']['exclude_tables'] = ['tmp_.*'] + check = ClickhouseCheck('clickhouse', {}, [collect_schemas_instance]) + _capture_payloads(check) + with _capture_query_params(check.metadata._schema_collector) as calls: + check.metadata._schema_collector.collect_schemas() + + combined_query, params = next((q, p) for q, p in calls if 'FROM system.tables' in q) + assert "AND NOT match(name, {table_exclude_0:String})" in combined_query + assert "AND (match(name, {table_include_0:String}))" in combined_query + assert params['table_exclude_0'] == 'tmp_.*' + assert params['table_include_0'] == 'events.*' + + +def test_all_cluster_fanout_queries_dedupe_replica_rows(check): + """Every query that hits a system table needs LIMIT 1 BY to prevent replica fan-out duplicates.""" + _capture_payloads(check) + with _capture_all_queries(check.metadata._schema_collector) as seen_queries: + check.metadata._schema_collector.collect_schemas() + + combined_query = next(q for q in seen_queries if 'FROM system.tables' in q) + + assert 'LIMIT 1 BY (database, name)' in combined_query + assert 'LIMIT 1 BY (database, table, name)' in combined_query + + +def test_system_databases_excluded_from_all_queries(collect_schemas_instance): + """All cluster-wide queries hard-exclude ClickHouse's internal databases.""" + check = ClickhouseCheck('clickhouse', {}, [collect_schemas_instance]) + _capture_payloads(check) + with _capture_all_queries(check.metadata._schema_collector) as seen_queries: + check.metadata._schema_collector.collect_schemas() + + for kw in ('system.tables', 'system.columns'): + q = next(q for q in seen_queries if kw in q) + assert "database NOT IN (" in q + + +def test_collect_uses_local_system_tables_in_direct_mode(check): + _capture_payloads(check) + with _capture_all_queries(check.metadata._schema_collector) as seen_queries: + check.metadata._schema_collector.collect_schemas() + + joined = '\n'.join(seen_queries) + assert 'clusterAllReplicas' not in joined + assert 'FROM system.tables' in joined + assert 'FROM system.columns' in joined + + +def test_max_execution_time_set_on_client(collector): + _capture_payloads(collector._check) + with _patch_query(collector) as mock_client: + collector.collect_schemas() + + mock_client.set_client_setting.assert_called_once_with('max_execution_time', collector._config.max_query_duration) + + +def test_main_query_failure_closes_client(collector): + mock_client = mock.MagicMock() + mock_client.query_rows_stream.return_value.__enter__.side_effect = Exception("main query failed") + + _capture_payloads(collector._check) + with mock.patch.object(collector._check, 'create_dbm_client', return_value=mock_client): + with pytest.raises(Exception, match="main query failed"): + collector.collect_schemas() + + mock_client.close.assert_called_once() + assert collector._db_client is None + + +def test_payload_chunking(check, collector): + # Set a small chunk size so 7 tables produce 3 separate payloads. + collector._config.payload_chunk_size = 3 + table_rows = [_table_row(name=f'tbl_{i}') for i in range(7)] + captured = _run_collect(check, table_rows=table_rows) + + # Three payloads: rows 0-2, rows 3-5, row 6 + assert len(captured) == 3 + + # Only the last payload carries collection_payloads_count (snapshot marker) + assert 'collection_payloads_count' not in captured[0] + assert 'collection_payloads_count' not in captured[1] + assert captured[2]['collection_payloads_count'] == 3 + + # Non-final chunks hold exactly chunk_size rows; final chunk holds the remainder + assert len(captured[0]['metadata']) == 3 + assert len(captured[1]['metadata']) == 3 + assert len(captured[2]['metadata']) == 1 + + # Every table appears exactly once across all payloads + all_names = [ + entry['tables'][0]['name'] for payload in captured for entry in payload['metadata'] if entry.get('tables') + ] + assert sorted(all_names) == sorted(f'tbl_{i}' for i in range(7)) + + # Schema kind is correct on every payload + assert all(p['kind'] == 'clickhouse_databases' for p in captured) + + +def test_cancel_event_aborts_before_query(collector): + cancel_event = threading.Event() + collector._cancel_event = cancel_event + cancel_event.set() + + with pytest.raises(Exception, match="cancelled"): + collector._check_cancelled() diff --git a/clickhouse/tests/test_metadata_integration.py b/clickhouse/tests/test_metadata_integration.py new file mode 100644 index 0000000000000..f5139f1fabe7b --- /dev/null +++ b/clickhouse/tests/test_metadata_integration.py @@ -0,0 +1,317 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from concurrent.futures.thread import ThreadPoolExecutor +from copy import deepcopy + +import clickhouse_connect +import pytest + +from datadog_checks.base.utils.db.utils import DBMAsyncJob +from datadog_checks.clickhouse import ClickhouseCheck + +from .common import CLICKHOUSE_VERSION + +UNSUPPORTED_VERSIONS = {'18', '19', '20', '21.8', '22.7'} +NO_VIEW_REFRESHES_VERSIONS = UNSUPPORTED_VERSIONS | {'23.2', '23.8'} + + +def _is_supported(): + if CLICKHOUSE_VERSION == 'latest': + return True + return CLICKHOUSE_VERSION not in UNSUPPORTED_VERSIONS + + +def _supports_view_refreshes(): + if CLICKHOUSE_VERSION == 'latest': + return True + return CLICKHOUSE_VERSION not in NO_VIEW_REFRESHES_VERSIONS + + +pytestmark = [ + pytest.mark.integration, + pytest.mark.usefixtures('dd_environment'), + pytest.mark.skipif( + not _is_supported(), + reason='metadata collection requires DBM support (ClickHouse 21.8+)', + ), +] + + +@pytest.fixture +def metadata_instance(instance): + instance['dbm'] = True + instance['collect_schemas'] = { + 'enabled': True, + 'run_sync': True, + 'collection_interval': 60, + 'max_tables': 5000, + 'max_columns': 1000, + } + instance['query_metrics'] = {'enabled': False} + instance['query_samples'] = {'enabled': False} + instance['query_completions'] = {'enabled': False} + instance['query_errors'] = {'enabled': False} + instance['parts_and_merges'] = {'enabled': False} + return instance + + +@pytest.fixture(autouse=True) +def stop_orphaned_threads(): + DBMAsyncJob.executor.shutdown(wait=True) + DBMAsyncJob.executor = ThreadPoolExecutor() + + +def _client(instance_config): + return clickhouse_connect.get_client( + host=instance_config['server'], + port=instance_config['port'], + username=instance_config['username'], + password=instance_config['password'], + ) + + +def _catalog_events(aggregator): + return [e for e in aggregator.get_event_platform_events('dbm-metadata') if e.get('kind') == 'clickhouse_databases'] + + +def _databases(catalog_events): + out = [] + for ev in catalog_events: + out.extend(ev.get('metadata') or []) + return out + + +def _merged_database(catalog_events, name): + """Merge every per-row entry for `name` across events into one dict. + + SchemaCollector emits one DatabaseObject per table/view; the same database + name appears multiple times across chunks. The backend dedupes on its side; + tests need to do the same to assert on the full set of tables. Tables and + views share a single `tables` list; views are identified by engine. + """ + tables: list[dict] = [] + found = False + for db in _databases(catalog_events): + if db.get('name') != name: + continue + found = True + tables.extend(db.get('tables') or []) + if not found: + return None + return {'name': name, 'tables': tables} + + +_find_database = _merged_database + + +def test_metadata_payload_emitted(aggregator, metadata_instance, dd_run_check): + client = _client(metadata_instance) + table = 'dd_md_payload_test' + try: + client.command(f'DROP TABLE IF EXISTS default.{table}') + client.command(f'CREATE TABLE default.{table} (id UInt64, ts DateTime) ENGINE = MergeTree ORDER BY id') + + check = ClickhouseCheck('clickhouse', {}, [metadata_instance]) + check.check_id = 'test-collector-id' + dd_run_check(check) + + events = _catalog_events(aggregator) + assert events, 'Expected at least one clickhouse_databases event on dbm-metadata' + + ev = events[-1] + assert ev['dbms'] == 'clickhouse' + assert ev['database_instance'] + assert ev['agent_version'] + assert ev['collection_started_at'] > 0 + assert ev['collection_payloads_count'] == 1 + assert ev['collector_id'] == 'test-collector-id' + assert ev['timestamp'] > 0 + + db = _find_database(events, 'default') + assert db is not None, "Expected the 'default' database to be present in payload" + assert any(t['name'] == table for t in db['tables']), ( + f'Expected table {table} in catalog payload; got: {[t["name"] for t in db["tables"]]}' + ) + finally: + client.command(f'DROP TABLE IF EXISTS default.{table} SYNC') + + +def test_metadata_columns_collected(aggregator, metadata_instance, dd_run_check): + client = _client(metadata_instance) + table = 'dd_md_columns_test' + try: + client.command(f'DROP TABLE IF EXISTS default.{table}') + client.command( + f'CREATE TABLE default.{table} (' + 'id UInt64, ' + 'event_name String, ' + 'created_at DateTime DEFAULT now()' + ') ENGINE = MergeTree ORDER BY id' + ) + + check = ClickhouseCheck('clickhouse', {}, [metadata_instance]) + dd_run_check(check) + + events = _catalog_events(aggregator) + db = _find_database(events, 'default') + assert db is not None + + target = next((t for t in db['tables'] if t['name'] == table), None) + assert target is not None, f'Expected {table} in tables, got {[t["name"] for t in db["tables"]]}' + + col_names = [c['name'] for c in target['columns']] + assert col_names == ['id', 'event_name', 'created_at'], col_names + + types = {c['name']: c['type'] for c in target['columns']} + assert types['id'] == 'UInt64' + assert types['event_name'] == 'String' + + defaults = {c['name']: c['default'] for c in target['columns']} + assert defaults['created_at'], 'DEFAULT expression should round-trip into payload' + finally: + client.command(f'DROP TABLE IF EXISTS default.{table} SYNC') + + +def test_metadata_materialized_view_with_target(aggregator, metadata_instance, dd_run_check): + client = _client(metadata_instance) + src = 'dd_md_mv_src' + target = 'dd_md_mv_target' + mv = 'dd_md_mv_view' + try: + for obj in (mv, target, src): + client.command(f'DROP TABLE IF EXISTS default.{obj}') + + client.command(f'CREATE TABLE default.{src} (id UInt64, val UInt64) ENGINE = MergeTree ORDER BY id') + client.command(f'CREATE TABLE default.{target} (id UInt64, total UInt64) ENGINE = MergeTree ORDER BY id') + client.command( + f'CREATE MATERIALIZED VIEW default.{mv} TO default.{target} AS SELECT id, val AS total FROM default.{src}' + ) + + check = ClickhouseCheck('clickhouse', {}, [metadata_instance]) + dd_run_check(check) + + events = _catalog_events(aggregator) + db = _find_database(events, 'default') + assert db is not None + + view = next((t for t in db['tables'] if t['name'] == mv), None) + assert view is not None, f'Expected view {mv} in payload; got: {[t["name"] for t in db["tables"]]}' + assert view['engine'] == 'MaterializedView' + assert view['create_query'] + assert f'TO default.{target}' in view['create_query'] + assert f'FROM default.{src}' in view['create_query'] + finally: + for obj in (mv, target, src): + client.command(f'DROP TABLE IF EXISTS default.{obj} SYNC') + + +def test_metadata_skips_system_databases(aggregator, metadata_instance, dd_run_check): + check = ClickhouseCheck('clickhouse', {}, [metadata_instance]) + dd_run_check(check) + + db_names = {db['name'] for db in _databases(_catalog_events(aggregator))} + forbidden = {'system', 'INFORMATION_SCHEMA', 'information_schema'} + leaked = db_names & forbidden + assert not leaked, f'System databases leaked into payload: {leaked}' + + +def test_metadata_disabled_emits_no_payload(aggregator, instance, dd_run_check): + instance_config = deepcopy(instance) + instance_config['dbm'] = True + instance_config['collect_schemas'] = {'enabled': False, 'collection_interval': 60} + + check = ClickhouseCheck('clickhouse', {}, [instance_config]) + assert check.metadata is None + dd_run_check(check) + + assert _catalog_events(aggregator) == [], ( + 'Expected no clickhouse_databases payload when collect_schemas is disabled' + ) + + +def test_metadata_exclude_tables_filter(aggregator, metadata_instance, dd_run_check): + client = _client(metadata_instance) + table_keep = 'dd_md_filter_keep' + table_drop = 'dd_md_filter_drop' + try: + for t in (table_keep, table_drop): + client.command(f'DROP TABLE IF EXISTS default.{t}') + client.command(f'CREATE TABLE default.{t} (id UInt64) ENGINE = MergeTree ORDER BY id') + + instance = deepcopy(metadata_instance) + instance['collect_schemas']['exclude_tables'] = [f'^{table_drop}$'] + + check = ClickhouseCheck('clickhouse', {}, [instance]) + dd_run_check(check) + + db = _find_database(_catalog_events(aggregator), 'default') + assert db is not None + table_names = {t['name'] for t in db['tables']} + assert table_keep in table_names + assert table_drop not in table_names + finally: + for t in (table_keep, table_drop): + client.command(f'DROP TABLE IF EXISTS default.{t} SYNC') + + +def test_metadata_include_tables_filter(aggregator, metadata_instance, dd_run_check): + client = _client(metadata_instance) + table_included = 'dd_md_include_target' + table_other = 'dd_md_include_other' + try: + for t in (table_included, table_other): + client.command(f'DROP TABLE IF EXISTS default.{t}') + client.command(f'CREATE TABLE default.{t} (id UInt64) ENGINE = MergeTree ORDER BY id') + + instance = deepcopy(metadata_instance) + instance['collect_schemas']['include_tables'] = [f'^{table_included}$'] + + check = ClickhouseCheck('clickhouse', {}, [instance]) + dd_run_check(check) + + db = _find_database(_catalog_events(aggregator), 'default') + assert db is not None + table_names = {t['name'] for t in db['tables']} + assert table_included in table_names + assert table_other not in table_names + finally: + for t in (table_included, table_other): + client.command(f'DROP TABLE IF EXISTS default.{t} SYNC') + + +@pytest.mark.skipif( + not _supports_view_refreshes(), + reason='system.view_refreshes requires ClickHouse 24.3+', +) +def test_metadata_refreshable_view_status_populated(aggregator, metadata_instance, dd_run_check): + client = _client(metadata_instance) + src = 'dd_md_refresh_src' + target = 'dd_md_refresh_target' + mv = 'dd_md_refreshable_mv' + try: + for obj in (mv, target, src): + client.command(f'DROP TABLE IF EXISTS default.{obj}') + + client.command(f'CREATE TABLE default.{src} (id UInt64, val UInt64) ENGINE = MergeTree ORDER BY id') + client.command(f'CREATE TABLE default.{target} (id UInt64, total UInt64) ENGINE = MergeTree ORDER BY id') + client.command( + f'CREATE MATERIALIZED VIEW default.{mv} REFRESH EVERY 1 HOUR ' + f'TO default.{target} AS SELECT id, val AS total FROM default.{src}', + settings={'allow_experimental_refreshable_materialized_view': 1}, + ) + + check = ClickhouseCheck('clickhouse', {}, [metadata_instance]) + check.check_id = 'test-collector-id' + dd_run_check(check) + + events = _catalog_events(aggregator) + db = _find_database(events, 'default') + assert db is not None + + view = next((t for t in db['tables'] if t['name'] == mv), None) + assert view is not None, f'Expected refreshable view {mv} in payload' + assert view['is_refreshable'] is True + finally: + for obj in (mv, target, src): + client.command(f'DROP TABLE IF EXISTS default.{obj} SYNC') diff --git a/clickhouse/tests/test_unit.py b/clickhouse/tests/test_unit.py index f92811c040fb9..788a3908e0a02 100644 --- a/clickhouse/tests/test_unit.py +++ b/clickhouse/tests/test_unit.py @@ -329,6 +329,21 @@ def test_query_completions_zero_samples_per_hour_defaults(bad_value): assert any('query_completions.samples_per_hour_per_query' in w for w in check._validation_result.warnings) +@pytest.mark.parametrize("bad_value", [0, -1, -100]) +def test_collect_schemas_zero_collection_interval_defaults(bad_value): + """Zero or negative collection_interval must not crash the constructor via ZeroDivisionError.""" + instance = { + 'server': 'localhost', + 'port': 9000, + 'username': 'default', + 'dbm': True, + 'collect_schemas': {'enabled': True, 'collection_interval': bad_value}, + } + check = ClickhouseCheck('clickhouse', {}, [instance]) + assert check._config.collect_schemas.collection_interval > 0 + assert any('collect_schemas.collection_interval' in w for w in check._validation_result.warnings) + + BASE_INSTANCE = {'server': 'myhost.example.com', 'port': 8123, 'username': 'default'} From 97919e60b3d526f40949049927cef42197c37802 Mon Sep 17 00:00:00 2001 From: "dd-octo-sts[bot]" <200755185+dd-octo-sts[bot]@users.noreply.github.com> Date: Wed, 3 Jun 2026 13:04:46 -0400 Subject: [PATCH 3/4] Finalize Agent release 7.79.2 (#23910) * Finalize Agent release 7.79.2 * Apply suggestions from code review Co-authored-by: Sarah Witt --------- Co-authored-by: sarah-witt <33498636+sarah-witt@users.noreply.github.com> Co-authored-by: Sarah Witt --- AGENT_CHANGELOG.md | 5 + AGENT_INTEGRATIONS.md | 261 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) diff --git a/AGENT_CHANGELOG.md b/AGENT_CHANGELOG.md index aa7056d0752a3..1371e61f8083e 100644 --- a/AGENT_CHANGELOG.md +++ b/AGENT_CHANGELOG.md @@ -1,3 +1,8 @@ +## Datadog Agent version [7.79.2](https://github.com/DataDog/datadog-agent/blob/master/CHANGELOG.rst#7792) + +### Integration Updates +* SQL Server [23.0.2](https://github.com/DataDog/integrations-core/blob/master/sqlserver/CHANGELOG.md) + ## Datadog Agent version [7.79.1](https://github.com/DataDog/datadog-agent/blob/master/CHANGELOG.rst#7791) ### Integration Updates diff --git a/AGENT_INTEGRATIONS.md b/AGENT_INTEGRATIONS.md index 6f14f8c0495ac..cfbdc2079fc06 100644 --- a/AGENT_INTEGRATIONS.md +++ b/AGENT_INTEGRATIONS.md @@ -1,3 +1,264 @@ +## Datadog Agent version 7.79.2 + +* datadog-active-directory: 4.6.0 +* datadog-activemq-xml: 5.5.0 +* datadog-activemq: 5.3.0 +* datadog-aerospike: 5.4.1 +* datadog-airflow: 7.4.0 +* datadog-amazon-msk: 7.7.0 +* datadog-ambari: 6.5.0 +* datadog-apache: 7.4.1 +* datadog-appgate-sdp: 2.4.1 +* datadog-arangodb: 4.4.1 +* datadog-arctic-wolf-aurora-endpoint-security: 1.0.0 +* datadog-argo-rollouts: 3.4.1 +* datadog-argo-workflows: 3.4.1 +* datadog-argocd: 4.5.0 +* datadog-aspdotnet: 4.6.0 +* datadog-avi-vantage: 6.4.1 +* datadog-aws-neuron: 3.4.1 +* datadog-azure-iot-edge: 6.5.0 +* datadog-barracuda-secure-edge: 1.1.0 +* datadog-bentoml: 1.5.1 +* datadog-beyondtrust-password-safe: 1.2.0 +* datadog-beyondtrust-privileged-remote-access: 1.0.0 +* datadog-boundary: 4.4.1 +* datadog-btrfs: 4.3.0 +* datadog-cacti: 4.5.0 +* datadog-calico: 5.4.1 +* datadog-cassandra-nodetool: 3.4.0 +* datadog-cassandra: 3.3.1 +* datadog-celery: 2.5.1 +* datadog-ceph: 4.5.1 +* datadog-cert-manager: 6.4.1 +* datadog-checkpoint-harmony-endpoint: 1.2.0 +* datadog-checkpoint-quantum-firewall: 1.3.0 +* datadog-checks-base: 37.35.1 +* datadog-checks-dependency-provider: 3.2.0 +* datadog-checks-downloader: 9.1.0 +* datadog-cilium: 6.4.1 +* datadog-cisco-aci: 4.14.2 +* datadog-cisco-asa: 1.0.0 +* datadog-cisco-secure-client: 1.0.0 +* datadog-cisco-secure-firewall: 1.3.0 +* datadog-cisco-secure-web-appliance: 1.3.0 +* datadog-citrix-hypervisor: 6.4.0 +* datadog-clickhouse: 6.6.0 +* datadog-cloud-foundry-api: 5.6.0 +* datadog-cloudera: 3.6.0 +* datadog-cloudgen-firewall: 1.1.0 +* datadog-cockroachdb: 6.4.1 +* datadog-confluent-platform: 3.3.0 +* datadog-consul: 5.4.0 +* datadog-control-m: 1.1.0 +* datadog-coredns: 6.4.1 +* datadog-couch: 9.4.0 +* datadog-couchbase: 6.5.0 +* datadog-crio: 5.4.0 +* datadog-datadog-cluster-agent: 6.5.0 +* datadog-datadog-csi-driver: 1.5.1 +* datadog-dcgm: 4.4.1 +* datadog-delinea-privilege-manager: 1.2.0 +* datadog-delinea-secret-server: 1.3.0 +* datadog-directory: 4.4.0 +* datadog-disk: 7.5.1 +* datadog-dns-check: 5.5.0 +* datadog-do-query-actions: 1.1.0 +* datadog-dotnetclr: 4.6.0 +* datadog-druid: 5.4.0 +* datadog-duckdb: 1.3.0 +* datadog-ecs-fargate: 7.4.0 +* datadog-eks-fargate: 6.5.0 +* datadog-elastic: 9.5.1 +* datadog-envoy: 6.5.1 +* datadog-eset-protect: 1.2.0 +* datadog-esxi: 4.3.0 +* datadog-etcd: 9.4.0 +* datadog-exchange-server: 4.6.0 +* datadog-external-dns: 6.4.0 +* datadog-falco: 2.4.1 +* datadog-flink: 3.2.0 +* datadog-fluentd: 5.5.1 +* datadog-fluxcd: 3.4.1 +* datadog-fly-io: 3.4.1 +* datadog-forescout: 1.0.0 +* datadog-foundationdb: 3.7.0 +* datadog-gearmand: 5.3.0 +* datadog-gitlab-runner: 7.5.0 +* datadog-gitlab: 10.4.1 +* datadog-glusterfs: 3.4.1 +* datadog-go-expvar: 5.4.0 +* datadog-guarddog: 1.2.0 +* datadog-gunicorn: 4.5.0 +* datadog-haproxy: 8.4.1 +* datadog-harbor: 6.4.0 +* datadog-hazelcast: 6.6.0 +* datadog-hdfs-datanode: 7.4.0 +* datadog-hdfs-namenode: 7.4.0 +* datadog-hive: 2.4.0 +* datadog-hivemq: 2.4.0 +* datadog-http-check: 12.6.2 +* datadog-hudi: 4.3.0 +* datadog-hugging-face-tgi: 1.5.1 +* datadog-hyperv: 3.3.0 +* datadog-ibm-ace: 4.5.1 +* datadog-ibm-db2: 4.3.0 +* datadog-ibm-i: 4.5.0 +* datadog-ibm-mq: 8.9.1 +* datadog-ibm-spectrum-lsf: 1.3.0 +* datadog-ibm-was: 5.5.1 +* datadog-iboss: 1.2.0 +* datadog-ignite: 3.4.0 +* datadog-iis: 5.6.0 +* datadog-impala: 4.4.1 +* datadog-infiniband: 1.6.0 +* datadog-istio: 9.5.1 +* datadog-ivanti-connect-secure: 1.2.0 +* datadog-jboss-wildfly: 3.4.0 +* datadog-journald: 3.2.0 +* datadog-juniper-srx-firewall: 1.3.0 +* datadog-kafka-actions: 2.6.0 +* datadog-kafka-consumer: 7.2.1 +* datadog-kafka: 4.5.0 +* datadog-karpenter: 3.4.1 +* datadog-keda: 2.4.1 +* datadog-keycloak: 1.2.0 +* datadog-kong: 6.4.1 +* datadog-krakend: 1.4.1 +* datadog-kube-apiserver-metrics: 7.5.0 +* datadog-kube-controller-manager: 8.4.1 +* datadog-kube-dns: 7.4.0 +* datadog-kube-metrics-server: 6.4.0 +* datadog-kube-proxy: 9.4.0 +* datadog-kube-scheduler: 7.4.1 +* datadog-kubeflow: 2.4.1 +* datadog-kubelet: 10.4.0 +* datadog-kubernetes-cluster-autoscaler: 3.4.1 +* datadog-kubernetes-state: 10.5.0 +* datadog-kubevirt-api: 2.5.1 +* datadog-kubevirt-controller: 2.4.1 +* datadog-kubevirt-handler: 2.5.1 +* datadog-kuma: 2.4.1 +* datadog-kyototycoon: 4.5.0 +* datadog-kyverno: 3.4.1 +* datadog-lighttpd: 5.5.0 +* datadog-linkerd: 7.4.1 +* datadog-linux-audit-logs: 1.2.0 +* datadog-linux-proc-extras: 4.3.0 +* datadog-litellm: 2.4.1 +* datadog-lustre: 1.5.0 +* datadog-mac-audit-logs: 1.4.1 +* datadog-mapr: 3.4.0 +* datadog-mapreduce: 7.4.0 +* datadog-marathon: 5.4.0 +* datadog-marklogic: 6.5.0 +* datadog-mcache: 6.4.0 +* datadog-mesos-master: 6.4.0 +* datadog-mesos-slave: 6.4.0 +* datadog-microsoft-dns: 1.2.0 +* datadog-microsoft-sysmon: 1.2.0 +* datadog-milvus: 2.5.1 +* datadog-mongo: 10.10.0 +* datadog-mysql: 15.16.1 +* datadog-n8n: 2.0.0 +* datadog-nagios: 3.4.0 +* datadog-network: 5.7.0 +* datadog-nfsstat: 3.5.0 +* datadog-nginx-ingress-controller: 5.4.0 +* datadog-nginx: 9.4.1 +* datadog-nutanix: 1.2.0 +* datadog-nvidia-nim: 2.4.1 +* datadog-nvidia-triton: 3.4.1 +* datadog-octopus-deploy: 2.4.0 +* datadog-openldap: 3.3.0 +* datadog-openmetrics: 7.4.1 +* datadog-openstack-controller: 9.6.1 +* datadog-openstack: 4.3.0 +* datadog-openvpn: 1.2.0 +* datadog-ossec-security: 2.2.0 +* datadog-palo-alto-panorama: 1.2.0 +* datadog-pan-firewall: 3.3.0 +* datadog-pdh-check: 4.6.1 +* datadog-pgbouncer: 8.10.0 +* datadog-php-fpm: 6.4.1 +* datadog-ping-federate: 2.2.0 +* datadog-postfix: 3.4.1 +* datadog-postgres: 23.7.0 +* datadog-powerdns-recursor: 5.4.0 +* datadog-prefect: 1.0.1 +* datadog-presto: 3.4.0 +* datadog-process: 5.5.1 +* datadog-prometheus: 6.3.0 +* datadog-proxmox: 2.5.1 +* datadog-proxysql: 7.7.0 +* datadog-pulsar: 3.6.1 +* datadog-quarkus: 2.4.1 +* datadog-rabbitmq: 8.6.1 +* datadog-ray: 3.4.1 +* datadog-redisdb: 8.8.0 +* datadog-rethinkdb: 5.4.0 +* datadog-riak: 5.5.0 +* datadog-riakcs: 4.13.0 +* datadog-sap-hana: 5.5.0 +* datadog-scylla: 5.4.1 +* datadog-sidekiq: 3.2.0 +* datadog-silk: 4.5.0 +* datadog-silverstripe-cms: 1.8.0 +* datadog-singlestore: 4.6.0 +* datadog-slurm: 2.4.0 +* datadog-snmp: 12.3.2 +* datadog-solr: 2.4.0 +* datadog-sonarqube: 5.6.1 +* datadog-sonatype-nexus: 2.3.0 +* datadog-sonicwall-firewall: 1.3.0 +* datadog-spark: 7.7.1 +* datadog-sqlserver: 23.0.2 +* datadog-squid: 5.4.0 +* datadog-ssh-check: 4.8.0 +* datadog-statsd: 3.3.0 +* datadog-strimzi: 4.4.1 +* datadog-supabase: 2.4.1 +* datadog-supervisord: 4.4.1 +* datadog-suricata: 2.2.0 +* datadog-symantec-endpoint-protection: 1.3.0 +* datadog-system-core: 4.3.0 +* datadog-system-swap: 3.3.0 +* datadog-tcp-check: 6.3.0 +* datadog-teamcity: 7.4.1 +* datadog-tekton: 3.4.1 +* datadog-teleport: 3.4.1 +* datadog-temporal: 4.5.1 +* datadog-tenable: 3.2.0 +* datadog-teradata: 4.3.0 +* datadog-tibco-ems: 2.5.0 +* datadog-tls: 5.6.2 +* datadog-tokumx: 3.6.1 +* datadog-tomcat: 4.3.0 +* datadog-torchserve: 4.4.1 +* datadog-traefik-mesh: 3.5.0 +* datadog-traffic-server: 3.6.0 +* datadog-twemproxy: 3.3.0 +* datadog-twistlock: 6.4.0 +* datadog-varnish: 4.4.1 +* datadog-vault: 7.4.1 +* datadog-velero: 3.4.1 +* datadog-vertica: 6.5.0 +* datadog-vllm: 3.4.1 +* datadog-voltdb: 6.4.1 +* datadog-vsphere: 9.4.1 +* datadog-watchguard-firebox: 1.2.0 +* datadog-wazuh: 1.3.0 +* datadog-weaviate: 4.4.1 +* datadog-weblogic: 3.3.0 +* datadog-win32-event-log: 5.6.0 +* datadog-windows-performance-counters: 3.4.0 +* datadog-windows-service: 6.7.1 +* datadog-wmi-check: 4.1.1 +* datadog-yarn: 8.4.1 +* datadog-zeek: 1.2.0 +* datadog-zk: 6.5.1 +* datadog-zscaler-private-access: 1.1.0 + ## Datadog Agent version 7.79.1 * datadog-active-directory: 4.6.0 From 22e54b1592b65cb46b36eedbe92cab351ccf6557 Mon Sep 17 00:00:00 2001 From: Sarah Witt Date: Wed, 3 Jun 2026 13:07:53 -0400 Subject: [PATCH 4/4] Fix kerberos tests (#23923) * fix kerberos tests * rename image --- .../tests/compose/kerberos/kerberos-agent.yaml | 2 +- .../tests/compose/kerberos/kerberos-nginx/Dockerfile | 7 ++++--- datadog_checks_base/tests/compose/kerberos/kerberos.yaml | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/datadog_checks_base/tests/compose/kerberos/kerberos-agent.yaml b/datadog_checks_base/tests/compose/kerberos/kerberos-agent.yaml index a3a082b917372..4316159793f94 100644 --- a/datadog_checks_base/tests/compose/kerberos/kerberos-agent.yaml +++ b/datadog_checks_base/tests/compose/kerberos/kerberos-agent.yaml @@ -54,7 +54,7 @@ services: - "464:8464" web: - image: kerberos-nginx:1.20.2 + image: kerberos-nginx:1.26.2 build: ./kerberos-nginx environment: KRB5_KEYTAB: ${KRB5_KEYTAB} diff --git a/datadog_checks_base/tests/compose/kerberos/kerberos-nginx/Dockerfile b/datadog_checks_base/tests/compose/kerberos/kerberos-nginx/Dockerfile index 6c63410f96c49..0bce0e6dcdf8c 100644 --- a/datadog_checks_base/tests/compose/kerberos/kerberos-nginx/Dockerfile +++ b/datadog_checks_base/tests/compose/kerberos/kerberos-nginx/Dockerfile @@ -1,4 +1,4 @@ -FROM nginx:1.20.2 +FROM nginx:1.26.3 ENV DEBIAN_FRONTEND=noninteractive @@ -15,11 +15,12 @@ RUN apt-get update -y -qq && apt-get install -y --no-install-recommends \ git RUN cd /usr/src && mkdir nginx \ - && curl -kfSL https://nginx.org/download/nginx-1.20.2.tar.gz -o nginx.tar.gz \ + && curl -kfSL https://nginx.org/download/nginx-1.26.3.tar.gz -o nginx.tar.gz \ && tar -xzf nginx.tar.gz -C nginx --strip-components=1 RUN cd /usr/src/nginx \ - && git clone http://github.com/stnoonan/spnego-http-auth-nginx-module.git + && git clone http://github.com/stnoonan/spnego-http-auth-nginx-module.git \ + && git -C spnego-http-auth-nginx-module checkout 7fa3864a86d5 RUN cd /usr/src/nginx \ && ./configure --with-compat --add-dynamic-module=spnego-http-auth-nginx-module \ diff --git a/datadog_checks_base/tests/compose/kerberos/kerberos.yaml b/datadog_checks_base/tests/compose/kerberos/kerberos.yaml index d8007a4bd4c11..c6c89a352cc60 100644 --- a/datadog_checks_base/tests/compose/kerberos/kerberos.yaml +++ b/datadog_checks_base/tests/compose/kerberos/kerberos.yaml @@ -26,7 +26,7 @@ services: - "464:8464" web: - image: kerberos-nginx:1.20.2 + image: kerberos-nginx:1.26.2 build: ./kerberos-nginx container_name: kerberos-nginx environment: