From 43bf5ca68c280bd593b46f303ca0291b5640526d Mon Sep 17 00:00:00 2001
From: NouemanKHAL <noueman.khalikine@datadoghq.com>
Date: Wed, 3 Jun 2026 16:27:07 +0200
Subject: [PATCH 1/4] glusterfs: require trusted provider for gstatus_path
 (#23881)

* feat(glusterfs): require trusted provider for gstatus_path

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ddev validate all --fix

* fix 3rd party licenses

* chore(glusterfs): add changelog entry for #23881

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* chore(glusterfs): add changelog for security validation onboarding

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ddev validate licenses

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/config/labeler.yml             | 16 ++++++++--------
 glusterfs/assets/configuration/spec.yaml         |  1 +
 glusterfs/changelog.d/23881.added                |  1 +
 .../glusterfs/config_models/shared.py            |  8 ++++++++
 4 files changed, 18 insertions(+), 8 deletions(-)
 create mode 100644 glusterfs/changelog.d/23881.added
diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml
index 0363017ec6cf1..de7ece88baa76 100644
--- a/.github/workflows/config/labeler.yml
+++ b/.github/workflows/config/labeler.yml
@@ -897,10 +897,6 @@ integration/langchain:
 - changed-files:
   - any-glob-to-any-file:
     - langchain/**/*
-integration/lparstats:
-- changed-files:
-  - any-glob-to-any-file:
-    - lparstats/**/*
 integration/lastpass:
 - changed-files:
   - any-glob-to-any-file:
@@ -925,6 +921,10 @@ integration/litellm:
 - changed-files:
   - any-glob-to-any-file:
     - litellm/**/*
+integration/lparstats:
+- changed-files:
+  - any-glob-to-any-file:
+    - lparstats/**/*
 integration/lustre:
 - changed-files:
   - any-glob-to-any-file:
@@ -1009,10 +1009,6 @@ integration/nagios:
 - changed-files:
   - any-glob-to-any-file:
     - nagios/**/*
-integration/nifi:
-- changed-files:
-  - any-glob-to-any-file:
-    - nifi/**/*
 integration/network:
 - changed-files:
   - any-glob-to-any-file:
@@ -1033,6 +1029,10 @@ integration/nginx_ingress_controller:
 - changed-files:
   - any-glob-to-any-file:
     - nginx_ingress_controller/**/*
+integration/nifi:
+- changed-files:
+  - any-glob-to-any-file:
+    - nifi/**/*
 integration/ntp:
 - changed-files:
   - any-glob-to-any-file:
diff --git a/glusterfs/assets/configuration/spec.yaml b/glusterfs/assets/configuration/spec.yaml
index 6855e26b76116..9c2da7e1a592d 100644
--- a/glusterfs/assets/configuration/spec.yaml
+++ b/glusterfs/assets/configuration/spec.yaml
@@ -15,6 +15,7 @@ files:
       value:
         type: string
         example: <datadog-agent>/embedded/sbin/gstatus
+        require_trusted_provider: true
     - template: init_config/default
   - template: instances
     options:
diff --git a/glusterfs/changelog.d/23881.added b/glusterfs/changelog.d/23881.added
new file mode 100644
index 0000000000000..691a43d8733d5
--- /dev/null
+++ b/glusterfs/changelog.d/23881.added
@@ -0,0 +1 @@
+Add support for security validation in models for the `gstatus_path` configuration option.
diff --git a/glusterfs/datadog_checks/glusterfs/config_models/shared.py b/glusterfs/datadog_checks/glusterfs/config_models/shared.py
index 73eab6388015c..cd84acb1a3ee7 100644
--- a/glusterfs/datadog_checks/glusterfs/config_models/shared.py
+++ b/glusterfs/datadog_checks/glusterfs/config_models/shared.py
@@ -19,6 +19,9 @@
 from . import defaults, validators
 
 
+SECURE_FIELD_NAMES = frozenset(['gstatus_path'])
+
+
 class SharedConfig(BaseModel):
     model_config = ConfigDict(
         validate_default=True,
@@ -38,6 +41,11 @@ def _validate(cls, value, info):
         field_name = field.alias or info.field_name
         if field_name in info.context['configured_fields']:
             value = getattr(validators, f'shared_{info.field_name}', identity)(value, field=field)
+
+            if info.field_name in SECURE_FIELD_NAMES:
+                validation.security.check_field_trusted_provider(
+                    info.field_name, value, info.context.get('security_config')
+                )
         else:
             value = getattr(defaults, f'shared_{info.field_name}', lambda: value)()
 

From 17f0700ec8f08178656b1d604279055913a83db9 Mon Sep 17 00:00:00 2001
From: Sangeeta Shivaji Rao <sangeeta.shivajirao@datadoghq.com>
Date: Wed, 3 Jun 2026 13:03:04 -0400
Subject: [PATCH 2/4] clickhouse: add schema collection (collect_schemas)
 (#23899)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add ClickHouse schema collection (collect_schemas)

Adds catalog metadata collection for DBM Schema Explorer: databases,
tables, views, and columns via new ClickhouseMetadata job class.
Introduces collect_schemas config block with include/exclude regex
filters for databases and tables.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* test(clickhouse): remove per-table size gauge test from schema-collection PR

That test checks clickhouse.table.rows/bytes which are emitted by
ClickhouseTableMetrics — a feature in the schema-metrics PR, not here.
Also remove the schema_metrics fixture config that has no effect in this branch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(clickhouse): rename changelog entry to match PR number 23899

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* test(clickhouse): remove view refresh metric assertions from schema-collection test

clickhouse.view.refresh.status and next_time are emitted by
_collect_view_refresh_metrics(), which lives in the schema-metrics PR.
Keep only the is_refreshable catalog payload assertion here.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(clickhouse): sync CollectSchemas model formatting with spec.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(clickhouse): validate collect_schemas interval and correct default-off docs

- Normalize collect_schemas.collection_interval in _apply_validated_defaults
  so a 0/negative value is downgraded to the default with a warning instead
  of raising ZeroDivisionError at rate_limit = 1 / collection_interval.
- Fix spec.yaml enabled example (true -> false) and regenerate
  conf.yaml.example so docs match the actual default-off behavior.
- Add regression test for zero/negative collection_interval.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(clickhouse): emit tables and views in a single tables list

Collapse the per-database schema payload to one `tables` list instead of
separate `tables`/`views` arrays, matching the canonical DBM schema shape
used by postgres/mysql/sqlserver (whose backend structs carry only Tables).
Views remain identifiable via the `engine` field (View, MaterializedView,
LiveView, WindowView) and `is_refreshable`.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(clickhouse): bind schema-filter regex patterns as query parameters

User-supplied include/exclude regex patterns were interpolated into the
schema-collection SQL with manual quote-escaping. Bind them as
clickhouse-connect query parameters ({name:String}) instead, eliminating the
SQL-injection surface. Only trusted structural pieces (system-table
identifiers, validated integer limits, the constant system-database list)
remain interpolated.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* Remove redundant int() casts — typed config already guarantees int

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* Remove redundant tuple() and or () guards — defaults applied in config.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* Remove redundant collect_schemas null check — always initialized with defaults

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 clickhouse/assets/configuration/spec.yaml     |  93 +++
 clickhouse/changelog.d/23899.added            |   1 +
 .../datadog_checks/clickhouse/clickhouse.py   |  19 +
 .../datadog_checks/clickhouse/config.py       |  11 +
 .../clickhouse/config_models/dict_defaults.py |  15 +
 .../clickhouse/config_models/instance.py      |  18 +
 .../clickhouse/data/conf.yaml.example         |  69 ++
 .../datadog_checks/clickhouse/features.py     |   2 +
 .../datadog_checks/clickhouse/metadata.py     |  52 ++
 .../datadog_checks/clickhouse/schemas.py      | 240 +++++++
 clickhouse/tests/test_config_defaults.py      |  13 +
 clickhouse/tests/test_metadata.py             | 621 ++++++++++++++++++
 clickhouse/tests/test_metadata_integration.py | 317 +++++++++
 clickhouse/tests/test_unit.py                 |  15 +
 14 files changed, 1486 insertions(+)
 create mode 100644 clickhouse/changelog.d/23899.added
 create mode 100644 clickhouse/datadog_checks/clickhouse/metadata.py
 create mode 100644 clickhouse/datadog_checks/clickhouse/schemas.py
 create mode 100644 clickhouse/tests/test_metadata.py
 create mode 100644 clickhouse/tests/test_metadata_integration.py

diff --git a/clickhouse/assets/configuration/spec.yaml b/clickhouse/assets/configuration/spec.yaml
index 4ee3b90e08490..24dff8cb6f90b 100644
--- a/clickhouse/assets/configuration/spec.yaml
+++ b/clickhouse/assets/configuration/spec.yaml
@@ -345,6 +345,99 @@ files:
             type: boolean
             example: false
           fleet_configurable: false
+    - name: collect_schemas
+      description: |
+        Configure collection of ClickHouse catalog metadata (databases,
+        tables, views, columns) for Database Monitoring's Schema Explorer.
+        Requires `dbm: true`.
+      options:
+        - name: enabled
+          description: |
+            Enable collection of catalog metadata. When enabled, the agent
+            polls `system.tables`, `system.columns`, and
+            `system.view_refreshes` (ClickHouse 24.3+) and emits one
+            `kind=clickhouse_databases` payload per cycle.
+          value:
+            type: boolean
+            example: false
+        - name: collection_interval
+          description: |
+            Set the schema collection interval (in seconds). Catalog data
+            changes slowly; 600s (10 min) is a reasonable default.
+          value:
+            type: number
+            example: 600
+        - name: max_tables
+          description: |
+            Maximum number of tables plus views to collect per cycle
+            across all databases.
+          value:
+            type: integer
+            example: 300
+        - name: max_columns
+          description: |
+            Maximum number of columns to collect per table/view.
+          value:
+            type: integer
+            example: 1000
+        - name: max_query_duration
+          description: |
+            Maximum duration of the schema collection queries in seconds.
+            Applied to each query via the `max_execution_time` ClickHouse setting.
+          value:
+            type: integer
+            example: 60
+        - name: include_databases
+          description: |
+            A list of regex patterns to include databases. Any database whose
+            name matches any one of these patterns will be included. If empty,
+            all databases (other than those excluded) are included.
+          value:
+            type: array
+            items:
+              type: string
+            example:
+              - "mydb"
+        - name: exclude_databases
+          description: |
+            A list of regex patterns to exclude databases. Any database whose
+            name matches any one of these patterns will be excluded. The
+            ClickHouse system databases (`system`, `INFORMATION_SCHEMA`,
+            `information_schema`) are always excluded regardless of this setting.
+          value:
+            type: array
+            items:
+              type: string
+            example:
+              - "tmp_.*"
+        - name: include_tables
+          description: |
+            A list of regex patterns to include tables. Any table whose name
+            matches any one of these patterns will be included. If empty, all
+            tables (other than those excluded) are included.
+          value:
+            type: array
+            items:
+              type: string
+            example:
+              - "events.*"
+        - name: exclude_tables
+          description: |
+            A list of regex patterns to exclude tables. Any table whose name
+            matches any one of these patterns will be excluded.
+          value:
+            type: array
+            items:
+              type: string
+            example:
+              - ".*_tmp"
+        - name: run_sync
+          hidden: true
+          description: |
+            Run the metadata collection synchronously. For testing only.
+          value:
+            type: boolean
+            example: false
     - name: parts_and_merges
       description: Configure parts and merges monitoring
       options:
diff --git a/clickhouse/changelog.d/23899.added b/clickhouse/changelog.d/23899.added
new file mode 100644
index 0000000000000..dc8712e95841d
--- /dev/null
+++ b/clickhouse/changelog.d/23899.added
@@ -0,0 +1 @@
+Add ClickHouse schema collection: catalog payload (databases, tables, views, columns) under `collect_schemas` with include/exclude regex filters for databases and tables.
diff --git a/clickhouse/datadog_checks/clickhouse/clickhouse.py b/clickhouse/datadog_checks/clickhouse/clickhouse.py
index 3f5e66416995d..9f022a5887f7e 100644
--- a/clickhouse/datadog_checks/clickhouse/clickhouse.py
+++ b/clickhouse/datadog_checks/clickhouse/clickhouse.py
@@ -17,6 +17,7 @@
 from .__about__ import __version__
 from .config import build_config, sanitize
 from .health import ClickhouseHealth, HealthEvent, HealthStatus
+from .metadata import ClickhouseMetadata
 from .parts_and_merges import ClickhousePartsAndMerges
 from .query_completions import ClickhouseQueryCompletions
 from .query_errors import ClickhouseQueryErrors
@@ -122,6 +123,12 @@ def _init_dbm_components(self):
         else:
             self.query_errors = None
 
+        # Initialize schema collection (catalog metadata for Schema Explorer)
+        if self._config.dbm and self._config.collect_schemas.enabled:
+            self.metadata = ClickhouseMetadata(self)
+        else:
+            self.metadata = None
+
         # Initialize parts and merges monitoring (from system.parts, merges, mutations, replication_queue)
         if self._config.dbm and self._config.parts_and_merges.enabled:
             self.parts_and_merges = ClickhousePartsAndMerges(self, self._config.parts_and_merges)
@@ -260,6 +267,10 @@ def check(self, _):
         if self.query_errors:
             self.query_errors.run_job_loop(self.tags)
 
+        # Run schema collection if enabled
+        if self.metadata:
+            self.metadata.run_job_loop(self.tags)
+
         # Run parts and merges monitoring if enabled
         if self.parts_and_merges:
             self.parts_and_merges.run_job_loop(self.tags)
@@ -364,6 +375,10 @@ def database_identifier(self) -> str:
             self._database_identifier = template.safe_substitute(**tag_dict)
         return self._database_identifier
 
+    @property
+    def dbms(self) -> str:
+        return "clickhouse"
+
     @property
     def dbms_version(self) -> str:
         """Get the ClickHouse server version."""
@@ -525,6 +540,8 @@ def cancel(self):
             self.query_completions.cancel()
         if self.query_errors:
             self.query_errors.cancel()
+        if self.metadata:
+            self.metadata.cancel()
         if self.parts_and_merges:
             self.parts_and_merges.cancel()
 
@@ -537,6 +554,8 @@ def cancel(self):
             self.query_completions._job_loop_future.result()
         if self.query_errors and self.query_errors._job_loop_future:
             self.query_errors._job_loop_future.result()
+        if self.metadata and self.metadata._job_loop_future:
+            self.metadata._job_loop_future.result()
         if self.parts_and_merges and self.parts_and_merges._job_loop_future:
             self.parts_and_merges._job_loop_future.result()
 
diff --git a/clickhouse/datadog_checks/clickhouse/config.py b/clickhouse/datadog_checks/clickhouse/config.py
index e20b63832c8ed..b425b2936e2a5 100644
--- a/clickhouse/datadog_checks/clickhouse/config.py
+++ b/clickhouse/datadog_checks/clickhouse/config.py
@@ -128,6 +128,10 @@ def build_config(check: ClickhouseCheck) -> Tuple[InstanceConfig, ValidationResu
                 **dict_defaults.instance_parts_and_merges().model_dump(),
                 **(instance.get('parts_and_merges', {})),
             },
+            "collect_schemas": {
+                **dict_defaults.instance_collect_schemas().model_dump(),
+                **(instance.get('collect_schemas', {})),
+            },
             # Tags - ensure we have a list, not None
             "tags": list(instance.get('tags', [])),
             # Other settings
@@ -224,6 +228,13 @@ def _apply_validated_defaults(args: dict, instance: dict, validation_result: Val
             f"parts_and_merges.collection_interval must be greater than 0, defaulting to {default_value} seconds."
         )
 
+    if _safefloat(args.get('collect_schemas', {}).get('collection_interval')) <= 0:
+        default_value = dict_defaults.instance_collect_schemas().collection_interval
+        args['collect_schemas']['collection_interval'] = default_value
+        validation_result.add_warning(
+            f"collect_schemas.collection_interval must be greater than 0, defaulting to {default_value} seconds."
+        )
+
     _pm_defaults = dict_defaults.instance_parts_and_merges()
     for _field in (
         'max_parts_rows',
diff --git a/clickhouse/datadog_checks/clickhouse/config_models/dict_defaults.py b/clickhouse/datadog_checks/clickhouse/config_models/dict_defaults.py
index 3082d0cc60f2d..e7fe5bbec7b5e 100644
--- a/clickhouse/datadog_checks/clickhouse/config_models/dict_defaults.py
+++ b/clickhouse/datadog_checks/clickhouse/config_models/dict_defaults.py
@@ -57,6 +57,21 @@ def instance_query_errors():
     )
 
 
+def instance_collect_schemas():
+    return instance.CollectSchemas(
+        enabled=False,
+        collection_interval=600,
+        max_tables=300,
+        max_columns=1000,
+        max_query_duration=60,
+        include_databases=(),
+        exclude_databases=(),
+        include_tables=(),
+        exclude_tables=(),
+        run_sync=False,
+    )
+
+
 def instance_parts_and_merges():
     return instance.PartsAndMerges(
         enabled=True,
diff --git a/clickhouse/datadog_checks/clickhouse/config_models/instance.py b/clickhouse/datadog_checks/clickhouse/config_models/instance.py
index 4bec26f4cd289..3bc35a160aa02 100644
--- a/clickhouse/datadog_checks/clickhouse/config_models/instance.py
+++ b/clickhouse/datadog_checks/clickhouse/config_models/instance.py
@@ -23,6 +23,23 @@
 SECURE_FIELD_NAMES = frozenset(['tls_ca_cert'])
 
 
+class CollectSchemas(BaseModel):
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        frozen=True,
+    )
+    collection_interval: Optional[float] = None
+    enabled: Optional[bool] = None
+    exclude_databases: Optional[tuple[str, ...]] = None
+    exclude_tables: Optional[tuple[str, ...]] = None
+    include_databases: Optional[tuple[str, ...]] = None
+    include_tables: Optional[tuple[str, ...]] = None
+    max_columns: Optional[int] = None
+    max_query_duration: Optional[int] = None
+    max_tables: Optional[int] = None
+    run_sync: Optional[bool] = None
+
+
 class CustomQuery(BaseModel):
     model_config = ConfigDict(
         arbitrary_types_allowed=True,
@@ -127,6 +144,7 @@ class InstanceConfig(BaseModel):
         arbitrary_types_allowed=True,
         frozen=True,
     )
+    collect_schemas: Optional[CollectSchemas] = None
     compression: Optional[str] = None
     connect_timeout: Optional[int] = None
     custom_queries: Optional[tuple[CustomQuery, ...]] = None
diff --git a/clickhouse/datadog_checks/clickhouse/data/conf.yaml.example b/clickhouse/datadog_checks/clickhouse/data/conf.yaml.example
index 279e5d96ec433..6d189b4615304 100644
--- a/clickhouse/datadog_checks/clickhouse/data/conf.yaml.example
+++ b/clickhouse/datadog_checks/clickhouse/data/conf.yaml.example
@@ -214,6 +214,75 @@ instances:
         #
         # samples_per_hour_per_query: 60
 
+    ## Configure collection of ClickHouse catalog metadata (databases,
+    ## tables, views, columns) for Database Monitoring's Schema Explorer.
+    ## Requires `dbm: true`.
+    #
+    # collect_schemas:
+
+        ## @param enabled - boolean - optional - default: false
+        ## Enable collection of catalog metadata. When enabled, the agent
+        ## polls `system.tables`, `system.columns`, and
+        ## `system.view_refreshes` (ClickHouse 24.3+) and emits one
+        ## `kind=clickhouse_databases` payload per cycle.
+        #
+        # enabled: false
+
+        ## @param collection_interval - number - optional - default: 600
+        ## Set the schema collection interval (in seconds). Catalog data
+        ## changes slowly; 600s (10 min) is a reasonable default.
+        #
+        # collection_interval: 600
+
+        ## @param max_tables - integer - optional - default: 300
+        ## Maximum number of tables plus views to collect per cycle
+        ## across all databases.
+        #
+        # max_tables: 300
+
+        ## @param max_columns - integer - optional - default: 1000
+        ## Maximum number of columns to collect per table/view.
+        #
+        # max_columns: 1000
+
+        ## @param max_query_duration - integer - optional - default: 60
+        ## Maximum duration of the schema collection queries in seconds.
+        ## Applied to each query via the `max_execution_time` ClickHouse setting.
+        #
+        # max_query_duration: 60
+
+        ## @param include_databases - list of strings - optional
+        ## A list of regex patterns to include databases. Any database whose
+        ## name matches any one of these patterns will be included. If empty,
+        ## all databases (other than those excluded) are included.
+        #
+        # include_databases:
+        #   - mydb
+
+        ## @param exclude_databases - list of strings - optional
+        ## A list of regex patterns to exclude databases. Any database whose
+        ## name matches any one of these patterns will be excluded. The
+        ## ClickHouse system databases (`system`, `INFORMATION_SCHEMA`,
+        ## `information_schema`) are always excluded regardless of this setting.
+        #
+        # exclude_databases:
+        #   - tmp_.*
+
+        ## @param include_tables - list of strings - optional
+        ## A list of regex patterns to include tables. Any table whose name
+        ## matches any one of these patterns will be included. If empty, all
+        ## tables (other than those excluded) are included.
+        #
+        # include_tables:
+        #   - events.*
+
+        ## @param exclude_tables - list of strings - optional
+        ## A list of regex patterns to exclude tables. Any table whose name
+        ## matches any one of these patterns will be excluded.
+        #
+        # exclude_tables:
+        #   - .*_tmp
+
     ## Configure parts and merges monitoring
     #
     # parts_and_merges:
diff --git a/clickhouse/datadog_checks/clickhouse/features.py b/clickhouse/datadog_checks/clickhouse/features.py
index 5fd61bdae93cf..6eb705a785d0c 100644
--- a/clickhouse/datadog_checks/clickhouse/features.py
+++ b/clickhouse/datadog_checks/clickhouse/features.py
@@ -23,6 +23,7 @@ class FeatureKey(Enum):
     QUERY_COMPLETIONS = "query_completions"
     EXPLAIN_PLANS = "explain_plans"
     QUERY_ERRORS = "query_errors"
+    COLLECT_SCHEMAS = "collect_schemas"
     PARTS_AND_MERGES = "parts_and_merges"
     SINGLE_ENDPOINT_MODE = "single_endpoint_mode"
 
@@ -34,6 +35,7 @@ class FeatureKey(Enum):
     FeatureKey.QUERY_COMPLETIONS: 'Query Completions',
     FeatureKey.QUERY_ERRORS: 'Query Errors',
     FeatureKey.EXPLAIN_PLANS: 'Explain Plans',
+    FeatureKey.COLLECT_SCHEMAS: 'Collect Schemas',
     FeatureKey.PARTS_AND_MERGES: 'Parts and Merges',
     FeatureKey.SINGLE_ENDPOINT_MODE: 'Single Endpoint Mode',
 }
diff --git a/clickhouse/datadog_checks/clickhouse/metadata.py b/clickhouse/datadog_checks/clickhouse/metadata.py
new file mode 100644
index 0000000000000..6b09a4f0072f6
--- /dev/null
+++ b/clickhouse/datadog_checks/clickhouse/metadata.py
@@ -0,0 +1,52 @@
+# (C) Datadog, Inc. 2026-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from clickhouse_connect.driver.exceptions import DatabaseError
+
+if TYPE_CHECKING:
+    from datadog_checks.clickhouse import ClickhouseCheck
+
+from datadog_checks.base.utils.db.utils import DBMAsyncJob
+from datadog_checks.base.utils.tracking import tracked_method
+
+from .schemas import ClickhouseSchemaCollector
+
+
+def agent_check_getter(self):
+    return self._check
+
+
+class ClickhouseMetadata(DBMAsyncJob):
+    """DBM async job that drives the schema collector, rate-limited by `collect_schemas.collection_interval`."""
+
+    def __init__(self, check: ClickhouseCheck):
+        schemas_config = check._config.collect_schemas
+        super().__init__(
+            check,
+            rate_limit=1 / schemas_config.collection_interval,
+            run_sync=schemas_config.run_sync,
+            enabled=schemas_config.enabled,
+            dbms='clickhouse',
+            min_collection_interval=check._config.min_collection_interval,
+            expected_db_exceptions=(DatabaseError,),
+            job_name='clickhouse-metadata',
+        )
+        self._check = check
+        self._collection_interval = schemas_config.collection_interval
+        self._schema_collector = ClickhouseSchemaCollector(check)
+        self._schema_collector._cancel_event = self._cancel_event
+
+    def cancel(self):
+        super().cancel()
+        self._schema_collector.close()
+
+    @tracked_method(agent_check_getter=agent_check_getter)
+    def run_job(self):
+        try:
+            self._schema_collector.collect_schemas()
+        except Exception:
+            self._log.exception("Schema collection failed")
diff --git a/clickhouse/datadog_checks/clickhouse/schemas.py b/clickhouse/datadog_checks/clickhouse/schemas.py
new file mode 100644
index 0000000000000..0512fa8217cb0
--- /dev/null
+++ b/clickhouse/datadog_checks/clickhouse/schemas.py
@@ -0,0 +1,240 @@
+# (C) Datadog, Inc. 2026-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from __future__ import annotations
+
+import contextlib
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from datadog_checks.clickhouse import ClickhouseCheck
+
+from datadog_checks.base.utils.db.schemas import SchemaCollector, SchemaCollectorConfig
+
+_SYSTEM_DATABASE_NAMES = ("'system'", "'INFORMATION_SCHEMA'", "'information_schema'")
+
+# Single stub so the base-class loop runs exactly once; actual database names
+# come from the `database` column of each cursor row.
+_CLUSTER_STUB = {'name': '_cluster_'}
+
+_TABLES_COLUMNS_QUERY = """\
+WITH
+tables AS (
+    SELECT
+        database,
+        name,
+        engine,
+        toString(uuid) AS uuid,
+        create_table_query,
+        sorting_key,
+        partition_key,
+        primary_key,
+        sampling_key,
+        toInt64(toUnixTimestamp(metadata_modification_time)) AS metadata_modified_at
+    FROM {tables_table}
+    WHERE database NOT IN ({system_dbs})
+      {db_filters}
+      {table_filters}
+    ORDER BY database, name
+    LIMIT 1 BY (database, name)
+    LIMIT {max_tables}
+),
+columns AS (
+    SELECT database, table, name, type, default_expression, comment, toInt32(position) AS position
+    FROM {columns_table}
+    WHERE (database, table) IN (SELECT database, name FROM tables)
+    ORDER BY database, table, position
+    LIMIT 1 BY (database, table, name)
+    LIMIT {limit_columns}
+)
+SELECT
+    t.database,
+    t.name,
+    t.engine,
+    t.uuid,
+    t.create_table_query,
+    t.sorting_key,
+    t.partition_key,
+    t.primary_key,
+    t.sampling_key,
+    t.metadata_modified_at,
+    groupArrayIf({max_columns})(
+        tuple(c.name, c.type, c.default_expression, c.comment, c.position),
+        c.name IS NOT NULL
+    ) AS columns
+FROM tables t
+LEFT JOIN columns c ON t.database = c.database AND t.name = c.table
+GROUP BY
+    t.database, t.name, t.engine, t.uuid, t.create_table_query,
+    t.sorting_key, t.partition_key, t.primary_key, t.sampling_key,
+    t.metadata_modified_at
+ORDER BY t.database, t.name
+"""
+
+
+class ClickhouseSchemaCollectorConfig(SchemaCollectorConfig):
+    max_tables: int
+    max_columns: int
+    max_query_duration: int
+    include_databases: tuple[str, ...]
+    exclude_databases: tuple[str, ...]
+    include_tables: tuple[str, ...]
+    exclude_tables: tuple[str, ...]
+
+
+class ClickhouseSchemaCollector(SchemaCollector):
+    """Collects ClickHouse schema metadata via a single CTE query per cycle."""
+
+    _check: ClickhouseCheck
+    _config: ClickhouseSchemaCollectorConfig
+
+    def __init__(self, check: ClickhouseCheck):
+        """Copy the check's `collect_schemas` options onto a collector config, then init the base collector."""
+        schemas_config = check._config.collect_schemas
+        config = ClickhouseSchemaCollectorConfig()
+        config.collection_interval = schemas_config.collection_interval
+        config.max_tables = schemas_config.max_tables
+        config.max_columns = schemas_config.max_columns
+        config.max_query_duration = schemas_config.max_query_duration
+        config.include_databases = schemas_config.include_databases
+        config.exclude_databases = schemas_config.exclude_databases
+        config.include_tables = schemas_config.include_tables
+        config.exclude_tables = schemas_config.exclude_tables
+        super().__init__(check, config)
+        self._db_client = self._cancel_event = None
+
+    @property
+    def kind(self) -> str:
+        return 'clickhouse_databases'
+
+    @property
+    def base_event(self) -> dict[str, Any]:
+        event = super().base_event
+        event['collector_id'] = self._check.check_id
+        return event
+
+    def close(self) -> None:
+        if self._db_client:
+            try:
+                self._db_client.close()
+            except Exception as e:
+                self._log.debug("Error closing schema collector client: %s", e)
+            self._db_client = None
+
+    def _check_cancelled(self) -> None:
+        if self._cancel_event is not None and self._cancel_event.is_set():
+            raise Exception("Job loop cancelled. Aborting query.")
+
+    def _get_databases(self) -> list[dict[str, str]]:
+        return [_CLUSTER_STUB]
+
+    @contextlib.contextmanager
+    def _get_cursor(self, _database_name: str):
+        """Yield a row stream for the tables/columns query; the client is closed when the context exits."""
+        self._db_client = self._check.create_dbm_client()
+        try:
+            # Configure the client inside the try so a failure here still reaches close() below.
+            self._db_client.set_client_setting('max_execution_time', self._config.max_query_duration)
+            db_filters, db_params = _build_match_clauses(
+                'database', self._config.include_databases, self._config.exclude_databases, 'db'
+            )
+            table_filters, table_params = _build_match_clauses(
+                'name', self._config.include_tables, self._config.exclude_tables, 'table'
+            )
+
+            # Only structural pieces (trusted system-table identifiers and validated integer
+            # limits) are interpolated; user-supplied regex patterns are bound as query parameters.
+            fmt = {
+                'tables_table': self._check.get_system_table('tables'),
+                'columns_table': self._check.get_system_table('columns'),
+                'system_dbs': ", ".join(_SYSTEM_DATABASE_NAMES),
+                'max_tables': self._config.max_tables,
+                'max_columns': self._config.max_columns,
+                'limit_columns': self._config.max_tables * self._config.max_columns,
+                'db_filters': db_filters,
+                'table_filters': table_filters,
+            }
+            query_parameters = {**db_params, **table_params}
+            self._check_cancelled()
+            query = _TABLES_COLUMNS_QUERY.format(**fmt)
+            with self._db_client.query_rows_stream(query, parameters=query_parameters) as stream:
+                yield stream
+        finally:
+            self.close()
+
+    def _get_next(self, cursor) -> tuple | None:
+        return next(cursor, None)  # None once the row stream is exhausted
+
+    def _map_row(self, _database: dict[str, str], cursor_row: tuple) -> dict[str, Any]:
+        # Each row becomes a one-table database entry named after the row's own
+        # database column (index 0), not the `_database` argument. Views are not
+        # split out: they share the `tables` list with regular tables, matching
+        # the canonical DBM schema payload shape (postgres/mysql/sqlserver), and
+        # are told apart by `engine` (View, MaterializedView, LiveView, WindowView).
+        return {'name': cursor_row[0], 'tables': [self._build_item(cursor_row)]}
+
+    def _build_item(self, row: tuple) -> dict[str, Any]:
+        (
+            database,
+            name,
+            engine,
+            uuid_str,
+            create_query,
+            sorting_key,
+            partition_key,
+            primary_key,
+            sampling_key,
+            metadata_modified_at,
+            raw_columns,
+        ) = row  # exactly eleven positional fields; any other length raises ValueError
+        cols = [
+            {
+                'name': col[0],
+                'type': col[1],
+                'default': col[2] or '',
+                'comment': col[3] or '',
+                'position': int(col[4] or 0),
+            }
+            for col in (raw_columns or [])  # a missing columns array yields an empty list
+        ]
+        return {  # is_refreshable: MaterializedView whose DDL has REFRESH as a whole word, not a substring
+            'name': name,
+            'engine': engine,
+            'uuid': uuid_str,
+            'sorting_key': sorting_key or '',
+            'partition_key': partition_key or '',
+            'primary_key': primary_key or '',
+            'sampling_key': sampling_key or '',
+            'create_query': create_query,
+            'columns': cols,
+            'metadata_modified_at': int(metadata_modified_at or 0),
+            'is_refreshable': engine == 'MaterializedView' and 'REFRESH' in (create_query or '').upper().split(),
+        }
+
+
+def _build_match_clauses(
+    column: str,
+    include_patterns: tuple[str, ...],
+    exclude_patterns: tuple[str, ...],
+    param_prefix: str,
+) -> tuple[str, dict[str, str]]:
+    """Render include/exclude regex filters as SQL using bound query parameters.
+
+    Each exclude pattern becomes its own ``AND NOT match(...)`` clause. Include
+    patterns are OR-ed together inside one ``AND (...)`` group that follows the
+    excludes. Pattern text is never placed in the SQL; it travels only in the
+    returned dict, keyed by the ``{name:String}`` placeholder it fills.
+
+    Returns ``(clause_text, parameters)``; clauses are joined with a newline and
+    two spaces, and both values are empty when no patterns are given.
+    """
+    # Keys are generated per pattern index; dict insertion order fixes the clause order.
+    excluded = {f'{param_prefix}_exclude_{i}': pattern for i, pattern in enumerate(exclude_patterns)}
+    included = {f'{param_prefix}_include_{i}': pattern for i, pattern in enumerate(include_patterns or ())}
+
+    clauses = [f"AND NOT match({column}, {{{key}:String}})" for key in excluded]
+    if included:
+        any_of = ' OR '.join(f"match({column}, {{{key}:String}})" for key in included)
+        # Includes are alternatives: matching any single pattern keeps the row.
+        clauses.append(f"AND ({any_of})")
+    return "\n  ".join(clauses), {**excluded, **included}
diff --git a/clickhouse/tests/test_config_defaults.py b/clickhouse/tests/test_config_defaults.py
index 40929dd30e726..9a600f1d1f39b 100644
--- a/clickhouse/tests/test_config_defaults.py
+++ b/clickhouse/tests/test_config_defaults.py
@@ -70,6 +70,19 @@
         'max_samples_per_collection': 1000,
         'run_sync': False,
     },
+    # === DBM: Schema collector ===
+    'collect_schemas': {
+        'enabled': False,
+        'collection_interval': 600,
+        'max_tables': 300,
+        'max_columns': 1000,
+        'max_query_duration': 60,
+        'include_databases': (),
+        'exclude_databases': (),
+        'include_tables': (),
+        'exclude_tables': (),
+        'run_sync': False,
+    },
     # === DBM: Parts and merges ===
     'parts_and_merges': {
         'enabled': True,
diff --git a/clickhouse/tests/test_metadata.py b/clickhouse/tests/test_metadata.py
new file mode 100644
index 0000000000000..1e05dd37997be
--- /dev/null
+++ b/clickhouse/tests/test_metadata.py
@@ -0,0 +1,621 @@
+# (C) Datadog, Inc. 2026-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+import contextlib
+import json
+import threading
+from unittest import mock
+
+import pytest
+
+from datadog_checks.clickhouse import ClickhouseCheck
+from datadog_checks.clickhouse.metadata import ClickhouseMetadata
+from datadog_checks.clickhouse.schemas import (
+    ClickhouseSchemaCollector,
+    _build_match_clauses,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _column_row(name='id', type_='UInt64', default='', comment='', position=1):
+    """Column tuple (name, type, default, comment, position) as returned by groupArrayIf in the combined CTE query."""
+    return (name, type_, default, comment, position)
+
+
+def _table_row(
+    database='default',
+    name='events',
+    engine='MergeTree',
+    uuid_str='uuid-1',
+    create_query='CREATE TABLE default.events (id UInt64) ENGINE = MergeTree ORDER BY id',
+    sorting_key='id',
+    partition_key='',
+    primary_key='id',
+    sampling_key='',
+    metadata_modified_at=1700000000,
+    columns=None,  # None is replaced by an empty list below
+):
+    return (  # same field order that ClickhouseSchemaCollector._build_item unpacks
+        database,
+        name,
+        engine,
+        uuid_str,
+        create_query,
+        sorting_key,
+        partition_key,
+        primary_key,
+        sampling_key,
+        metadata_modified_at,
+        columns or [],
+    )
+
+
+def _view_row(
+    database='default',
+    name='events_mv',
+    engine='MaterializedView',  # callers override this for View / LiveView / WindowView rows
+    uuid_str='uuid-2',
+    create_query='CREATE MATERIALIZED VIEW default.events_mv TO default.events_target AS SELECT * FROM default.events',
+    metadata_modified_at=1700001000,
+    columns=None,
+):
+    return _table_row(  # a view row is a table row whose four key fields are all empty strings
+        database=database,
+        name=name,
+        engine=engine,
+        uuid_str=uuid_str,
+        create_query=create_query,
+        sorting_key='',
+        partition_key='',
+        primary_key='',
+        sampling_key='',
+        metadata_modified_at=metadata_modified_at,
+        columns=columns,
+    )
+
+
+@pytest.fixture
+def collect_schemas_instance():
+    return {
+        'server': 'localhost',
+        'port': 9000,
+        'username': 'default',
+        'password': '',
+        'db': 'default',
+        'dbm': True,
+        'collect_schemas': {
+            'enabled': True,
+            'collection_interval': 600,
+            'max_tables': 5000,
+            'max_columns': 1000,
+            'run_sync': True,
+        },
+        'tags': ['test:clickhouse'],
+    }
+
+
+@pytest.fixture
+def check(collect_schemas_instance):
+    return ClickhouseCheck('clickhouse', {}, [collect_schemas_instance])
+
+
+@pytest.fixture
+def collector(check) -> ClickhouseSchemaCollector:
+    return check.metadata._schema_collector
+
+
+def _make_query_result(rows):
+    materialized = list(rows)
+    return mock.MagicMock(
+        result_set=materialized,
+        result_rows=materialized,
+        column_names=('c0',),
+        column_types=(),
+        summary={},
+    )
+
+
+@contextlib.contextmanager
+def _patch_query(collector, table_rows=None):
+    """Swap the check's DBM client factory for a mock whose row stream yields ``table_rows``."""
+    rows = table_rows or []
+
+    @contextlib.contextmanager
+    def fake_stream(query, *args, **kwargs):
+        yield iter(rows)
+
+    client = mock.MagicMock()
+    client.query_rows_stream.side_effect = fake_stream
+    patcher = mock.patch.object(collector._check, 'create_dbm_client', return_value=client)
+    with patcher:
+        yield client
+
+
+def _capture_payloads(check):
+    captured: list[dict] = []
+    check.database_monitoring_metadata = lambda raw: captured.append(json.loads(raw))  # record decoded payloads
+    check.gauge = lambda *a, **kw: None  # discard gauge submissions
+    return captured
+
+
+def _run_collect(check, table_rows=None):
+    payloads, schema_collector = _capture_payloads(check), check.metadata._schema_collector
+    with _patch_query(schema_collector, table_rows):
+        schema_collector.collect_schemas()
+    return payloads
+
+
+@contextlib.contextmanager
+def _capture_all_queries(collector):
+    """Records every SQL string sent through the DBM client."""
+    seen: list[str] = []
+
+    def fake_client_query(query, *args, **kwargs):
+        seen.append(query)
+        return _make_query_result([])  # plain `query` calls get an empty result
+
+    @contextlib.contextmanager
+    def fake_stream(query, *args, **kwargs):
+        seen.append(query)
+        yield iter([])  # streaming calls get an empty row iterator
+
+    mock_client = mock.MagicMock()
+    mock_client.query.side_effect = fake_client_query
+    mock_client.query_rows_stream.side_effect = fake_stream
+
+    with mock.patch.object(collector._check, 'create_dbm_client', return_value=mock_client):
+        yield seen  # the list is filled as queries run inside the `with` body
+
+
+@contextlib.contextmanager
+def _capture_query_params(collector):
+    """Records (query, parameters) for each query_rows_stream call."""
+    calls: list[tuple[str, dict]] = []
+
+    @contextlib.contextmanager
+    def fake_stream(query, *args, **kwargs):
+        calls.append((query, kwargs.get('parameters') or {}))  # absent or None `parameters` is recorded as {}
+        yield iter([])
+
+    mock_client = mock.MagicMock()
+    mock_client.query_rows_stream.side_effect = fake_stream
+
+    with mock.patch.object(collector._check, 'create_dbm_client', return_value=mock_client):
+        yield calls
+
+
+def test_initialization(check):
+    assert isinstance(check.metadata, ClickhouseMetadata)
+    assert isinstance(check.metadata._schema_collector, ClickhouseSchemaCollector)
+    assert check.metadata._collection_interval == 600
+    assert check.metadata._schema_collector._config.max_tables == 5000
+    assert check.metadata._schema_collector._config.max_columns == 1000
+
+
+def test_kind(collector):
+    assert collector.kind == 'clickhouse_databases'
+
+
+def test_init_collection_interval_omitted_uses_default():
+    check = ClickhouseCheck(
+        'clickhouse',
+        {},
+        [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': True}}],
+    )
+    assert check.metadata is not None
+    assert check.metadata._collection_interval == 600
+
+
+def test_init_max_tables_omitted_uses_default():
+    check = ClickhouseCheck(
+        'clickhouse',
+        {},
+        [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': True}}],
+    )
+    assert check.metadata._schema_collector._config.max_tables == 300
+
+
+def test_init_max_query_duration_omitted_uses_default():
+    check = ClickhouseCheck(
+        'clickhouse',
+        {},
+        [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': True}}],
+    )
+    assert check.metadata._schema_collector._config.max_query_duration == 60
+
+
+def test_init_filters_omitted_default_to_empty_tuples():
+    check = ClickhouseCheck(
+        'clickhouse',
+        {},
+        [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': True}}],
+    )
+    cfg = check.metadata._schema_collector._config
+    assert cfg.include_databases == ()
+    assert cfg.exclude_databases == ()
+    assert cfg.include_tables == ()
+    assert cfg.exclude_tables == ()
+
+
+def test_disabled_when_dbm_off():
+    check = ClickhouseCheck(
+        'clickhouse',
+        {},
+        [{'server': 'localhost', 'dbm': False, 'collect_schemas': {'enabled': True, 'collection_interval': 600}}],
+    )
+    assert check.metadata is None
+
+
+def test_disabled_by_default_when_dbm_on():
+    check = ClickhouseCheck('clickhouse', {}, [{'server': 'localhost', 'dbm': True}])
+    assert check.metadata is None
+
+
+def test_disabled_when_explicitly_opted_out():
+    check = ClickhouseCheck(
+        'clickhouse',
+        {},
+        [{'server': 'localhost', 'dbm': True, 'collect_schemas': {'enabled': False, 'collection_interval': 600}}],
+    )
+    assert check.metadata is None
+
+
+def test_collect_emits_single_payload_when_small(check):
+    payloads = _run_collect(
+        check,
+        table_rows=[
+            _table_row(name='events', columns=[_column_row(name='id')]),
+            _view_row(name='events_mv'),
+        ],
+    )
+    assert len(payloads) == 1
+    p = payloads[0]
+    assert p['kind'] == 'clickhouse_databases'
+    assert p['dbms'] == 'clickhouse'
+    assert p['collection_payloads_count'] == 1
+    assert p['collection_started_at'] > 0
+    assert 'host' in p
+    assert 'collector_id' in p
+
+
+def test_collect_payload_tables_list_includes_views(check):
+    payloads = _run_collect(
+        check,
+        table_rows=[
+            _table_row(name='events'),
+            _view_row(
+                name='events_mv',
+                create_query=(
+                    'CREATE MATERIALIZED VIEW default.events_mv'
+                    ' REFRESH EVERY 1 HOUR TO default.events_target'
+                    ' AS SELECT * FROM default.events'
+                ),
+            ),
+        ],
+    )
+    dbs = payloads[0]['metadata']
+    # Tables and views share a single `tables` list; views are identified by engine.
+    items = {t['name']: t for db in dbs for t in db['tables']}
+    assert 'events' in items
+    assert 'events_mv' in items
+    assert items['events_mv']['engine'] == 'MaterializedView'
+    assert items['events_mv']['is_refreshable'] is True
+
+
+@pytest.mark.parametrize('engine', ['View', 'LiveView', 'WindowView'])
+def test_collect_view_engines_appear_in_tables_list(check, engine):
+    payloads = _run_collect(check, table_rows=[_view_row(name='some_view', engine=engine)])
+    dbs = payloads[0]['metadata']
+    items = [t for db in dbs for t in db['tables']]
+    assert [t['name'] for t in items] == ['some_view']
+    assert items[0]['engine'] == engine
+
+
+def test_collect_dedupes_replica_rows_via_sql(check):
+    payloads = _run_collect(check, table_rows=[_table_row(name='events')])
+    dbs = payloads[0]['metadata']  # the mocked stream stands in for SQL output that is already deduplicated
+    table_names = [t['name'] for db in dbs for t in db['tables']]
+    assert table_names == ['events']
+
+
+def test_collect_emits_empty_snapshot_marker_when_no_tables(check):
+    # An empty run still emits one terminal payload so the backend receives a
+    # snapshot marker (collection_payloads_count) and can clear stale state.
+    payloads = _run_collect(check, table_rows=[])
+    assert len(payloads) == 1
+    assert payloads[0]['metadata'] == []
+    assert payloads[0]['collection_payloads_count'] == 1
+
+
+def test_collect_marks_view_refreshable_based_on_create_query(check):
+    payloads = _run_collect(
+        check,
+        table_rows=[
+            _view_row(
+                name='refreshable_mv',
+                create_query=(
+                    'CREATE MATERIALIZED VIEW default.refreshable_mv'
+                    ' REFRESH EVERY 1 HOUR TO default.target'
+                    ' AS SELECT * FROM default.src'
+                ),
+            ),
+            _view_row(
+                name='vanilla_view',
+                engine='View',
+                create_query='CREATE VIEW default.vanilla_view AS SELECT 1',
+            ),
+        ],
+    )
+    by_name = {t['name']: t for db in payloads[0]['metadata'] for t in db['tables']}
+    assert by_name['refreshable_mv']['is_refreshable'] is True
+    assert by_name['vanilla_view']['is_refreshable'] is False
+
+
+def test_collect_columns_attached_to_correct_parent(check):
+    payloads = _run_collect(
+        check,
+        table_rows=[
+            _table_row(name='events', columns=[_column_row(name='id')]),
+            _view_row(name='events_mv', columns=[_column_row(name='count', type_='UInt64')]),
+        ],
+    )
+    dbs = payloads[0]['metadata']
+    table = next(t for db in dbs for t in db['tables'] if t['name'] == 'events')
+    view = next(t for db in dbs for t in db['tables'] if t['name'] == 'events_mv')
+    assert [c['name'] for c in table['columns']] == ['id']
+    assert [c['name'] for c in view['columns']] == ['count']
+
+
+def test_collect_chunks_when_payload_chunk_size_exceeded(check):
+    check.metadata._schema_collector._config.payload_chunk_size = 5
+    rows = [_table_row(name=f'big_{i}') for i in range(12)]
+
+    payloads = _run_collect(check, table_rows=rows)
+
+    assert len(payloads) >= 2
+    emitted_total = sum(len(db['tables']) for p in payloads for db in p['metadata'])
+    assert emitted_total == 12
+
+
+def test_collect_collection_payloads_count_only_on_last(check):
+    check.metadata._schema_collector._config.payload_chunk_size = 5
+    rows = [_table_row(name=f'big_{i}') for i in range(12)]
+
+    payloads = _run_collect(check, table_rows=rows)
+
+    for intermediate in payloads[:-1]:
+        assert 'collection_payloads_count' not in intermediate
+    assert payloads[-1]['collection_payloads_count'] == len(payloads)
+
+
+def test_collect_all_chunks_share_collection_started_at(check):
+    check.metadata._schema_collector._config.payload_chunk_size = 5
+    rows = [_table_row(name=f'big_{i}') for i in range(12)]
+
+    payloads = _run_collect(check, table_rows=rows)
+
+    started_ats = {p['collection_started_at'] for p in payloads}
+    assert len(started_ats) == 1
+
+
+def test_cancel_closes_db_client(check):
+    fake_client = mock.MagicMock()
+    check.metadata._schema_collector._db_client = fake_client
+
+    check.metadata.cancel()
+
+    assert check.metadata._schema_collector._db_client is None
+    fake_client.close.assert_called_once()
+
+
+def test_combined_query_dedupes_replicas_before_limit(check):
+    _capture_payloads(check)
+    with _capture_all_queries(check.metadata._schema_collector) as seen_queries:
+        check.metadata._schema_collector.collect_schemas()
+
+    combined_query = next(q for q in seen_queries if 'FROM system.tables' in q)
+    dedup_idx = combined_query.find('LIMIT 1 BY (database, name)')
+    outer_limit_idx = combined_query.find('LIMIT 5000')
+    assert dedup_idx >= 0
+    assert outer_limit_idx >= 0
+    assert dedup_idx < outer_limit_idx
+
+
+def test_combined_query_joins_columns_and_caps_per_table(check):
+    _capture_payloads(check)
+    with _capture_all_queries(check.metadata._schema_collector) as seen_queries:
+        check.metadata._schema_collector.collect_schemas()
+
+    combined_query = next(q for q in seen_queries if 'FROM system.columns' in q)
+    assert '(database, table) IN (' in combined_query
+    assert 'FROM system.tables' in combined_query
+    assert 'LIMIT 1 BY (database, name)' in combined_query
+    assert 'LIMIT 1 BY (database, table, name)' in combined_query
+    # limit_columns = max_tables * max_columns = 5000 * 1000
+    assert 'LIMIT 5000000' in combined_query
+    # per-table cap via groupArrayIf
+    assert 'groupArrayIf(1000)' in combined_query
+
+
+def test_collect_routes_through_cluster_all_replicas_in_single_endpoint_mode(collect_schemas_instance):
+    collect_schemas_instance['single_endpoint_mode'] = True
+    check = ClickhouseCheck('clickhouse', {}, [collect_schemas_instance])
+    _capture_payloads(check)
+    with _capture_all_queries(check.metadata._schema_collector) as seen_queries:
+        check.metadata._schema_collector.collect_schemas()
+
+    joined = '\n'.join(seen_queries)
+    assert "clusterAllReplicas('default', system.tables)" in joined
+    assert "clusterAllReplicas('default', system.columns)" in joined
+
+
+def test_build_match_clauses_empty_returns_empty_string():
+    assert _build_match_clauses('database', (), (), 'db') == ('', {})
+
+
+def test_build_match_clauses_excludes_only():
+    out, params = _build_match_clauses('database', (), ('tmp_.*', 'shadow_.*'), 'db')
+    assert "AND NOT match(database, {db_exclude_0:String})" in out
+    assert "AND NOT match(database, {db_exclude_1:String})" in out
+    assert params == {'db_exclude_0': 'tmp_.*', 'db_exclude_1': 'shadow_.*'}
+
+
+def test_build_match_clauses_includes_become_or_disjunction():
+    out, params = _build_match_clauses('name', ('events.*', 'orders.*'), (), 'table')
+    assert "AND (match(name, {table_include_0:String}) OR match(name, {table_include_1:String}))" in out
+    assert params == {'table_include_0': 'events.*', 'table_include_1': 'orders.*'}
+
+
+def test_build_match_clauses_single_include_pattern():
+    out, params = _build_match_clauses('name', ('only_one.*',), (), 'table')
+    assert "AND (match(name, {table_include_0:String}))" in out
+    assert " OR " not in out
+    assert params == {'table_include_0': 'only_one.*'}
+
+
+def test_build_match_clauses_excludes_appear_before_includes():
+    out, _ = _build_match_clauses('database', ('keep_.*',), ('drop_.*',), 'db')
+    exclude_idx = out.find("AND NOT match(database, {db_exclude_0:String})")
+    include_idx = out.find("AND (match(database, {db_include_0:String}))")
+    assert exclude_idx >= 0 and include_idx >= 0
+    assert exclude_idx < include_idx
+
+
+def test_build_match_clauses_combines_includes_and_excludes():
+    out, params = _build_match_clauses('database', ('keep_.*',), ('drop_.*',), 'db')
+    assert "AND NOT match(database, {db_exclude_0:String})" in out
+    assert "AND (match(database, {db_include_0:String}))" in out
+    assert params == {'db_exclude_0': 'drop_.*', 'db_include_0': 'keep_.*'}
+
+
+def test_build_match_clauses_passes_pattern_verbatim_as_parameter():
+    # SQL-injection guard: a pattern containing a quote is bound as a parameter
+    # value, not escaped/interpolated into the SQL text.
+    out, params = _build_match_clauses('database', (), ("o'reilly_.*",), 'db')
+    assert out == "AND NOT match(database, {db_exclude_0:String})"
+    assert params == {'db_exclude_0': "o'reilly_.*"}
+
+
+def test_database_filters_appear_in_combined_query(collect_schemas_instance):
+    collect_schemas_instance['collect_schemas']['exclude_databases'] = ['tmp_.*']
+    collect_schemas_instance['collect_schemas']['include_databases'] = ['keep_.*']
+    check = ClickhouseCheck('clickhouse', {}, [collect_schemas_instance])
+    _capture_payloads(check)
+    with _capture_query_params(check.metadata._schema_collector) as calls:
+        check.metadata._schema_collector.collect_schemas()
+
+    combined_query, params = next((q, p) for q, p in calls if 'FROM system.tables' in q)
+    assert "AND NOT match(database, {db_exclude_0:String})" in combined_query
+    assert "AND (match(database, {db_include_0:String}))" in combined_query
+    assert params['db_exclude_0'] == 'tmp_.*'
+    assert params['db_include_0'] == 'keep_.*'
+
+
+def test_table_filters_appear_in_combined_query(collect_schemas_instance):
+    collect_schemas_instance['collect_schemas']['include_tables'] = ['events.*']
+    collect_schemas_instance['collect_schemas']['exclude_tables'] = ['tmp_.*']
+    check = ClickhouseCheck('clickhouse', {}, [collect_schemas_instance])
+    _capture_payloads(check)
+    with _capture_query_params(check.metadata._schema_collector) as calls:
+        check.metadata._schema_collector.collect_schemas()
+
+    combined_query, params = next((q, p) for q, p in calls if 'FROM system.tables' in q)
+    assert "AND NOT match(name, {table_exclude_0:String})" in combined_query
+    assert "AND (match(name, {table_include_0:String}))" in combined_query
+    assert params['table_exclude_0'] == 'tmp_.*'
+    assert params['table_include_0'] == 'events.*'
+
+
+def test_all_cluster_fanout_queries_dedupe_replica_rows(check):
+    """Every query that hits a system table needs LIMIT 1 BY to prevent replica fan-out duplicates."""
+    _capture_payloads(check)
+    with _capture_all_queries(check.metadata._schema_collector) as seen_queries:
+        check.metadata._schema_collector.collect_schemas()
+
+    combined_query = next(q for q in seen_queries if 'FROM system.tables' in q)
+
+    assert 'LIMIT 1 BY (database, name)' in combined_query
+    assert 'LIMIT 1 BY (database, table, name)' in combined_query
+
+
+def test_system_databases_excluded_from_all_queries(collect_schemas_instance):
+    """Queries touching system.tables / system.columns must filter out ClickHouse's internal databases."""
+    clickhouse_check = ClickhouseCheck('clickhouse', {}, [collect_schemas_instance])
+    _capture_payloads(clickhouse_check)
+    with _capture_all_queries(clickhouse_check.metadata._schema_collector) as seen_queries:
+        clickhouse_check.metadata._schema_collector.collect_schemas()
+
+    for system_table in ('system.tables', 'system.columns'):
+        first_match = next(sql for sql in seen_queries if system_table in sql)
+        assert "database NOT IN (" in first_match
+
+
+def test_collect_uses_local_system_tables_in_direct_mode(check):
+    _capture_payloads(check)
+    with _capture_all_queries(check.metadata._schema_collector) as seen_queries:
+        check.metadata._schema_collector.collect_schemas()
+
+    joined = '\n'.join(seen_queries)
+    assert 'clusterAllReplicas' not in joined
+    assert 'FROM system.tables' in joined
+    assert 'FROM system.columns' in joined
+
+
+def test_max_execution_time_set_on_client(collector):
+    _capture_payloads(collector._check)
+    with _patch_query(collector) as mock_client:
+        collector.collect_schemas()
+
+    mock_client.set_client_setting.assert_called_once_with('max_execution_time', collector._config.max_query_duration)
+
+
+def test_main_query_failure_closes_client(collector):
+    mock_client = mock.MagicMock()  # its stream context manager raises on entry, simulating a failed query
+    mock_client.query_rows_stream.return_value.__enter__.side_effect = Exception("main query failed")
+
+    _capture_payloads(collector._check)
+    with mock.patch.object(collector._check, 'create_dbm_client', return_value=mock_client):
+        with pytest.raises(Exception, match="main query failed"):
+            collector.collect_schemas()
+
+    mock_client.close.assert_called_once()  # the failure path must still release the client
+    assert collector._db_client is None
+
+
+def test_payload_chunking(check, collector):
+    # Set a small chunk size so 7 tables produce 3 separate payloads.
+    collector._config.payload_chunk_size = 3
+    table_rows = [_table_row(name=f'tbl_{i}') for i in range(7)]
+    captured = _run_collect(check, table_rows=table_rows)
+
+    # Three payloads: rows 0-2, rows 3-5, row 6
+    assert len(captured) == 3
+
+    # Only the last payload carries collection_payloads_count (snapshot marker)
+    assert 'collection_payloads_count' not in captured[0]
+    assert 'collection_payloads_count' not in captured[1]
+    assert captured[2]['collection_payloads_count'] == 3
+
+    # Non-final chunks hold exactly chunk_size rows; final chunk holds the remainder
+    assert len(captured[0]['metadata']) == 3
+    assert len(captured[1]['metadata']) == 3
+    assert len(captured[2]['metadata']) == 1
+
+    # Every table appears exactly once across all payloads
+    all_names = [
+        entry['tables'][0]['name'] for payload in captured for entry in payload['metadata'] if entry.get('tables')
+    ]
+    assert sorted(all_names) == sorted(f'tbl_{i}' for i in range(7))
+
+    # Schema kind is correct on every payload
+    assert all(p['kind'] == 'clickhouse_databases' for p in captured)
+
+
+def test_cancel_event_aborts_before_query(collector):
+    collector._cancel_event = threading.Event()
+    collector._cancel_event.set()  # cancellation is already signalled when the cursor is opened
+    with _patch_query(collector) as mock_client, pytest.raises(Exception, match="cancelled"):
+        with collector._get_cursor('default'):
+            pass
+    mock_client.query_rows_stream.assert_not_called()  # the abort must come before the query is issued
diff --git a/clickhouse/tests/test_metadata_integration.py b/clickhouse/tests/test_metadata_integration.py
new file mode 100644
index 0000000000000..f5139f1fabe7b
--- /dev/null
+++ b/clickhouse/tests/test_metadata_integration.py
@@ -0,0 +1,317 @@
+# (C) Datadog, Inc. 2026-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from concurrent.futures.thread import ThreadPoolExecutor
+from copy import deepcopy
+
+import clickhouse_connect
+import pytest
+
+from datadog_checks.base.utils.db.utils import DBMAsyncJob
+from datadog_checks.clickhouse import ClickhouseCheck
+
+from .common import CLICKHOUSE_VERSION
+
+UNSUPPORTED_VERSIONS = {'18', '19', '20', '21.8', '22.7'}
+NO_VIEW_REFRESHES_VERSIONS = UNSUPPORTED_VERSIONS | {'23.2', '23.8'}
+
+
+def _is_supported():
+    if CLICKHOUSE_VERSION == 'latest':
+        return True
+    return CLICKHOUSE_VERSION not in UNSUPPORTED_VERSIONS
+
+
+def _supports_view_refreshes():
+    if CLICKHOUSE_VERSION == 'latest':
+        return True
+    return CLICKHOUSE_VERSION not in NO_VIEW_REFRESHES_VERSIONS
+
+
+pytestmark = [
+    pytest.mark.integration,
+    pytest.mark.usefixtures('dd_environment'),
+    pytest.mark.skipif(
+        not _is_supported(),
+        reason='metadata collection requires DBM support (not available on ClickHouse 22.7 and older)',
+    ),
+]
+
+
+@pytest.fixture
+def metadata_instance(instance):
+    instance['dbm'] = True
+    instance['collect_schemas'] = {
+        'enabled': True,
+        'run_sync': True,
+        'collection_interval': 60,
+        'max_tables': 5000,
+        'max_columns': 1000,
+    }
+    instance['query_metrics'] = {'enabled': False}
+    instance['query_samples'] = {'enabled': False}
+    instance['query_completions'] = {'enabled': False}
+    instance['query_errors'] = {'enabled': False}
+    instance['parts_and_merges'] = {'enabled': False}
+    return instance
+
+
+@pytest.fixture(autouse=True)
+def stop_orphaned_threads():
+    DBMAsyncJob.executor.shutdown(wait=True)
+    DBMAsyncJob.executor = ThreadPoolExecutor()
+
+
+def _client(instance_config):
+    return clickhouse_connect.get_client(
+        host=instance_config['server'],
+        port=instance_config['port'],
+        username=instance_config['username'],
+        password=instance_config['password'],
+    )
+
+
+def _catalog_events(aggregator):
+    return [e for e in aggregator.get_event_platform_events('dbm-metadata') if e.get('kind') == 'clickhouse_databases']
+
+
+def _databases(catalog_events):
+    out = []
+    for ev in catalog_events:
+        out.extend(ev.get('metadata') or [])
+    return out
+
+
+def _merged_database(catalog_events, name):
+    """Merge every per-row entry for `name` across events into one dict.
+
+    SchemaCollector emits one DatabaseObject per table/view; the same database
+    name appears multiple times across chunks. The backend dedupes on its side;
+    tests need to do the same to assert on the full set of tables. Tables and
+    views share a single `tables` list; views are identified by engine.
+    """
+    tables: list[dict] = []
+    found = False
+    for db in _databases(catalog_events):
+        if db.get('name') != name:
+            continue
+        found = True
+        tables.extend(db.get('tables') or [])
+    if not found:
+        return None
+    return {'name': name, 'tables': tables}
+
+
+_find_database = _merged_database
+
+
+def test_metadata_payload_emitted(aggregator, metadata_instance, dd_run_check):
+    client = _client(metadata_instance)
+    table = 'dd_md_payload_test'
+    try:
+        client.command(f'DROP TABLE IF EXISTS default.{table}')
+        client.command(f'CREATE TABLE default.{table} (id UInt64, ts DateTime) ENGINE = MergeTree ORDER BY id')
+
+        check = ClickhouseCheck('clickhouse', {}, [metadata_instance])
+        check.check_id = 'test-collector-id'
+        dd_run_check(check)
+
+        events = _catalog_events(aggregator)
+        assert events, 'Expected at least one clickhouse_databases event on dbm-metadata'
+
+        ev = events[-1]
+        assert ev['dbms'] == 'clickhouse'
+        assert ev['database_instance']
+        assert ev['agent_version']
+        assert ev['collection_started_at'] > 0
+        assert ev['collection_payloads_count'] == 1
+        assert ev['collector_id'] == 'test-collector-id'
+        assert ev['timestamp'] > 0
+
+        db = _find_database(events, 'default')
+        assert db is not None, "Expected the 'default' database to be present in payload"
+        assert any(t['name'] == table for t in db['tables']), (
+            f'Expected table {table} in catalog payload; got: {[t["name"] for t in db["tables"]]}'
+        )
+    finally:
+        client.command(f'DROP TABLE IF EXISTS default.{table} SYNC')
+
+
+def test_metadata_columns_collected(aggregator, metadata_instance, dd_run_check):
+    client = _client(metadata_instance)
+    table = 'dd_md_columns_test'
+    try:
+        client.command(f'DROP TABLE IF EXISTS default.{table}')
+        client.command(
+            f'CREATE TABLE default.{table} ('
+            'id UInt64, '
+            'event_name String, '
+            'created_at DateTime DEFAULT now()'
+            ') ENGINE = MergeTree ORDER BY id'
+        )
+
+        check = ClickhouseCheck('clickhouse', {}, [metadata_instance])
+        dd_run_check(check)
+
+        events = _catalog_events(aggregator)
+        db = _find_database(events, 'default')
+        assert db is not None
+
+        target = next((t for t in db['tables'] if t['name'] == table), None)
+        assert target is not None, f'Expected {table} in tables, got {[t["name"] for t in db["tables"]]}'
+
+        col_names = [c['name'] for c in target['columns']]
+        assert col_names == ['id', 'event_name', 'created_at'], col_names
+
+        types = {c['name']: c['type'] for c in target['columns']}
+        assert types['id'] == 'UInt64'
+        assert types['event_name'] == 'String'
+
+        defaults = {c['name']: c['default'] for c in target['columns']}
+        assert defaults['created_at'], 'DEFAULT expression should round-trip into payload'
+    finally:
+        client.command(f'DROP TABLE IF EXISTS default.{table} SYNC')
+
+
+def test_metadata_materialized_view_with_target(aggregator, metadata_instance, dd_run_check):
+    client = _client(metadata_instance)
+    src = 'dd_md_mv_src'
+    target = 'dd_md_mv_target'
+    mv = 'dd_md_mv_view'
+    try:
+        for obj in (mv, target, src):
+            client.command(f'DROP TABLE IF EXISTS default.{obj}')
+
+        client.command(f'CREATE TABLE default.{src} (id UInt64, val UInt64) ENGINE = MergeTree ORDER BY id')
+        client.command(f'CREATE TABLE default.{target} (id UInt64, total UInt64) ENGINE = MergeTree ORDER BY id')
+        client.command(
+            f'CREATE MATERIALIZED VIEW default.{mv} TO default.{target} AS SELECT id, val AS total FROM default.{src}'
+        )
+
+        check = ClickhouseCheck('clickhouse', {}, [metadata_instance])
+        dd_run_check(check)
+
+        events = _catalog_events(aggregator)
+        db = _find_database(events, 'default')
+        assert db is not None
+
+        view = next((t for t in db['tables'] if t['name'] == mv), None)
+        assert view is not None, f'Expected view {mv} in payload; got: {[t["name"] for t in db["tables"]]}'
+        assert view['engine'] == 'MaterializedView'
+        assert view['create_query']
+        assert f'TO default.{target}' in view['create_query']
+        assert f'FROM default.{src}' in view['create_query']
+    finally:
+        for obj in (mv, target, src):
+            client.command(f'DROP TABLE IF EXISTS default.{obj} SYNC')
+
+
+def test_metadata_skips_system_databases(aggregator, metadata_instance, dd_run_check):
+    check = ClickhouseCheck('clickhouse', {}, [metadata_instance])
+    dd_run_check(check)
+
+    db_names = {db['name'] for db in _databases(_catalog_events(aggregator))}
+    forbidden = {'system', 'INFORMATION_SCHEMA', 'information_schema'}
+    leaked = db_names & forbidden
+    assert not leaked, f'System databases leaked into payload: {leaked}'
+
+
+def test_metadata_disabled_emits_no_payload(aggregator, instance, dd_run_check):
+    instance_config = deepcopy(instance)
+    instance_config['dbm'] = True
+    instance_config['collect_schemas'] = {'enabled': False, 'collection_interval': 60}
+
+    check = ClickhouseCheck('clickhouse', {}, [instance_config])
+    assert check.metadata is None
+    dd_run_check(check)
+
+    assert _catalog_events(aggregator) == [], (
+        'Expected no clickhouse_databases payload when collect_schemas is disabled'
+    )
+
+
+def test_metadata_exclude_tables_filter(aggregator, metadata_instance, dd_run_check):
+    client = _client(metadata_instance)
+    table_keep = 'dd_md_filter_keep'
+    table_drop = 'dd_md_filter_drop'
+    try:
+        for t in (table_keep, table_drop):
+            client.command(f'DROP TABLE IF EXISTS default.{t}')
+            client.command(f'CREATE TABLE default.{t} (id UInt64) ENGINE = MergeTree ORDER BY id')
+
+        instance = deepcopy(metadata_instance)
+        instance['collect_schemas']['exclude_tables'] = [f'^{table_drop}$']
+
+        check = ClickhouseCheck('clickhouse', {}, [instance])
+        dd_run_check(check)
+
+        db = _find_database(_catalog_events(aggregator), 'default')
+        assert db is not None
+        table_names = {t['name'] for t in db['tables']}
+        assert table_keep in table_names
+        assert table_drop not in table_names
+    finally:
+        for t in (table_keep, table_drop):
+            client.command(f'DROP TABLE IF EXISTS default.{t} SYNC')
+
+
+def test_metadata_include_tables_filter(aggregator, metadata_instance, dd_run_check):
+    client = _client(metadata_instance)
+    table_included = 'dd_md_include_target'
+    table_other = 'dd_md_include_other'
+    try:
+        for t in (table_included, table_other):
+            client.command(f'DROP TABLE IF EXISTS default.{t}')
+            client.command(f'CREATE TABLE default.{t} (id UInt64) ENGINE = MergeTree ORDER BY id')
+
+        instance = deepcopy(metadata_instance)
+        instance['collect_schemas']['include_tables'] = [f'^{table_included}$']
+
+        check = ClickhouseCheck('clickhouse', {}, [instance])
+        dd_run_check(check)
+
+        db = _find_database(_catalog_events(aggregator), 'default')
+        assert db is not None
+        table_names = {t['name'] for t in db['tables']}
+        assert table_included in table_names
+        assert table_other not in table_names
+    finally:
+        for t in (table_included, table_other):
+            client.command(f'DROP TABLE IF EXISTS default.{t} SYNC')
+
+
+@pytest.mark.skipif(
+    not _supports_view_refreshes(),
+    reason='system.view_refreshes requires ClickHouse 24.3+',
+)
+def test_metadata_refreshable_view_status_populated(aggregator, metadata_instance, dd_run_check):
+    client = _client(metadata_instance)
+    src = 'dd_md_refresh_src'
+    target = 'dd_md_refresh_target'
+    mv = 'dd_md_refreshable_mv'
+    try:
+        for obj in (mv, target, src):
+            client.command(f'DROP TABLE IF EXISTS default.{obj}')
+
+        client.command(f'CREATE TABLE default.{src} (id UInt64, val UInt64) ENGINE = MergeTree ORDER BY id')
+        client.command(f'CREATE TABLE default.{target} (id UInt64, total UInt64) ENGINE = MergeTree ORDER BY id')
+        client.command(
+            f'CREATE MATERIALIZED VIEW default.{mv} REFRESH EVERY 1 HOUR '
+            f'TO default.{target} AS SELECT id, val AS total FROM default.{src}',
+            settings={'allow_experimental_refreshable_materialized_view': 1},
+        )
+
+        check = ClickhouseCheck('clickhouse', {}, [metadata_instance])
+        check.check_id = 'test-collector-id'
+        dd_run_check(check)
+
+        events = _catalog_events(aggregator)
+        db = _find_database(events, 'default')
+        assert db is not None
+
+        view = next((t for t in db['tables'] if t['name'] == mv), None)
+        assert view is not None, f'Expected refreshable view {mv} in payload'
+        assert view['is_refreshable'] is True
+    finally:
+        for obj in (mv, target, src):
+            client.command(f'DROP TABLE IF EXISTS default.{obj} SYNC')
diff --git a/clickhouse/tests/test_unit.py b/clickhouse/tests/test_unit.py
index f92811c040fb9..788a3908e0a02 100644
--- a/clickhouse/tests/test_unit.py
+++ b/clickhouse/tests/test_unit.py
@@ -329,6 +329,21 @@ def test_query_completions_zero_samples_per_hour_defaults(bad_value):
     assert any('query_completions.samples_per_hour_per_query' in w for w in check._validation_result.warnings)
 
 
+@pytest.mark.parametrize("bad_value", [0, -1, -100])
+def test_collect_schemas_zero_collection_interval_defaults(bad_value):
+    """Zero or negative collection_interval must not crash the constructor via ZeroDivisionError."""
+    instance = {
+        'server': 'localhost',
+        'port': 9000,
+        'username': 'default',
+        'dbm': True,
+        'collect_schemas': {'enabled': True, 'collection_interval': bad_value},
+    }
+    check = ClickhouseCheck('clickhouse', {}, [instance])
+    assert check._config.collect_schemas.collection_interval > 0
+    assert any('collect_schemas.collection_interval' in w for w in check._validation_result.warnings)
+
+
 BASE_INSTANCE = {'server': 'myhost.example.com', 'port': 8123, 'username': 'default'}
 
 

From 97919e60b3d526f40949049927cef42197c37802 Mon Sep 17 00:00:00 2001
From: "dd-octo-sts[bot]" <200755185+dd-octo-sts[bot]@users.noreply.github.com>
Date: Wed, 3 Jun 2026 13:04:46 -0400
Subject: [PATCH 3/4] Finalize Agent release 7.79.2 (#23910)

* Finalize Agent release 7.79.2

* Apply suggestions from code review

Co-authored-by: Sarah Witt <sarah.witt@datadoghq.com>

---------

Co-authored-by: sarah-witt <33498636+sarah-witt@users.noreply.github.com>
Co-authored-by: Sarah Witt <sarah.witt@datadoghq.com>
---
 AGENT_CHANGELOG.md    |   5 +
 AGENT_INTEGRATIONS.md | 261 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 266 insertions(+)

diff --git a/AGENT_CHANGELOG.md b/AGENT_CHANGELOG.md
index aa7056d0752a3..1371e61f8083e 100644
--- a/AGENT_CHANGELOG.md
+++ b/AGENT_CHANGELOG.md
@@ -1,3 +1,8 @@
+## Datadog Agent version [7.79.2](https://github.com/DataDog/datadog-agent/blob/master/CHANGELOG.rst#7792)
+
+### Integration Updates
+* SQL Server [23.0.2](https://github.com/DataDog/integrations-core/blob/master/sqlserver/CHANGELOG.md)
+
 ## Datadog Agent version [7.79.1](https://github.com/DataDog/datadog-agent/blob/master/CHANGELOG.rst#7791)
 
 ### Integration Updates
diff --git a/AGENT_INTEGRATIONS.md b/AGENT_INTEGRATIONS.md
index 6f14f8c0495ac..cfbdc2079fc06 100644
--- a/AGENT_INTEGRATIONS.md
+++ b/AGENT_INTEGRATIONS.md
@@ -1,3 +1,264 @@
+## Datadog Agent version 7.79.2
+
+* datadog-active-directory: 4.6.0
+* datadog-activemq-xml: 5.5.0
+* datadog-activemq: 5.3.0
+* datadog-aerospike: 5.4.1
+* datadog-airflow: 7.4.0
+* datadog-amazon-msk: 7.7.0
+* datadog-ambari: 6.5.0
+* datadog-apache: 7.4.1
+* datadog-appgate-sdp: 2.4.1
+* datadog-arangodb: 4.4.1
+* datadog-arctic-wolf-aurora-endpoint-security: 1.0.0
+* datadog-argo-rollouts: 3.4.1
+* datadog-argo-workflows: 3.4.1
+* datadog-argocd: 4.5.0
+* datadog-aspdotnet: 4.6.0
+* datadog-avi-vantage: 6.4.1
+* datadog-aws-neuron: 3.4.1
+* datadog-azure-iot-edge: 6.5.0
+* datadog-barracuda-secure-edge: 1.1.0
+* datadog-bentoml: 1.5.1
+* datadog-beyondtrust-password-safe: 1.2.0
+* datadog-beyondtrust-privileged-remote-access: 1.0.0
+* datadog-boundary: 4.4.1
+* datadog-btrfs: 4.3.0
+* datadog-cacti: 4.5.0
+* datadog-calico: 5.4.1
+* datadog-cassandra-nodetool: 3.4.0
+* datadog-cassandra: 3.3.1
+* datadog-celery: 2.5.1
+* datadog-ceph: 4.5.1
+* datadog-cert-manager: 6.4.1
+* datadog-checkpoint-harmony-endpoint: 1.2.0
+* datadog-checkpoint-quantum-firewall: 1.3.0
+* datadog-checks-base: 37.35.1
+* datadog-checks-dependency-provider: 3.2.0
+* datadog-checks-downloader: 9.1.0
+* datadog-cilium: 6.4.1
+* datadog-cisco-aci: 4.14.2
+* datadog-cisco-asa: 1.0.0
+* datadog-cisco-secure-client: 1.0.0
+* datadog-cisco-secure-firewall: 1.3.0
+* datadog-cisco-secure-web-appliance: 1.3.0
+* datadog-citrix-hypervisor: 6.4.0
+* datadog-clickhouse: 6.6.0
+* datadog-cloud-foundry-api: 5.6.0
+* datadog-cloudera: 3.6.0
+* datadog-cloudgen-firewall: 1.1.0
+* datadog-cockroachdb: 6.4.1
+* datadog-confluent-platform: 3.3.0
+* datadog-consul: 5.4.0
+* datadog-control-m: 1.1.0
+* datadog-coredns: 6.4.1
+* datadog-couch: 9.4.0
+* datadog-couchbase: 6.5.0
+* datadog-crio: 5.4.0
+* datadog-datadog-cluster-agent: 6.5.0
+* datadog-datadog-csi-driver: 1.5.1
+* datadog-dcgm: 4.4.1
+* datadog-delinea-privilege-manager: 1.2.0
+* datadog-delinea-secret-server: 1.3.0
+* datadog-directory: 4.4.0
+* datadog-disk: 7.5.1
+* datadog-dns-check: 5.5.0
+* datadog-do-query-actions: 1.1.0
+* datadog-dotnetclr: 4.6.0
+* datadog-druid: 5.4.0
+* datadog-duckdb: 1.3.0
+* datadog-ecs-fargate: 7.4.0
+* datadog-eks-fargate: 6.5.0
+* datadog-elastic: 9.5.1
+* datadog-envoy: 6.5.1
+* datadog-eset-protect: 1.2.0
+* datadog-esxi: 4.3.0
+* datadog-etcd: 9.4.0
+* datadog-exchange-server: 4.6.0
+* datadog-external-dns: 6.4.0
+* datadog-falco: 2.4.1
+* datadog-flink: 3.2.0
+* datadog-fluentd: 5.5.1
+* datadog-fluxcd: 3.4.1
+* datadog-fly-io: 3.4.1
+* datadog-forescout: 1.0.0
+* datadog-foundationdb: 3.7.0
+* datadog-gearmand: 5.3.0
+* datadog-gitlab-runner: 7.5.0
+* datadog-gitlab: 10.4.1
+* datadog-glusterfs: 3.4.1
+* datadog-go-expvar: 5.4.0
+* datadog-guarddog: 1.2.0
+* datadog-gunicorn: 4.5.0
+* datadog-haproxy: 8.4.1
+* datadog-harbor: 6.4.0
+* datadog-hazelcast: 6.6.0
+* datadog-hdfs-datanode: 7.4.0
+* datadog-hdfs-namenode: 7.4.0
+* datadog-hive: 2.4.0
+* datadog-hivemq: 2.4.0
+* datadog-http-check: 12.6.2
+* datadog-hudi: 4.3.0
+* datadog-hugging-face-tgi: 1.5.1
+* datadog-hyperv: 3.3.0
+* datadog-ibm-ace: 4.5.1
+* datadog-ibm-db2: 4.3.0
+* datadog-ibm-i: 4.5.0
+* datadog-ibm-mq: 8.9.1
+* datadog-ibm-spectrum-lsf: 1.3.0
+* datadog-ibm-was: 5.5.1
+* datadog-iboss: 1.2.0
+* datadog-ignite: 3.4.0
+* datadog-iis: 5.6.0
+* datadog-impala: 4.4.1
+* datadog-infiniband: 1.6.0
+* datadog-istio: 9.5.1
+* datadog-ivanti-connect-secure: 1.2.0
+* datadog-jboss-wildfly: 3.4.0
+* datadog-journald: 3.2.0
+* datadog-juniper-srx-firewall: 1.3.0
+* datadog-kafka-actions: 2.6.0
+* datadog-kafka-consumer: 7.2.1
+* datadog-kafka: 4.5.0
+* datadog-karpenter: 3.4.1
+* datadog-keda: 2.4.1
+* datadog-keycloak: 1.2.0
+* datadog-kong: 6.4.1
+* datadog-krakend: 1.4.1
+* datadog-kube-apiserver-metrics: 7.5.0
+* datadog-kube-controller-manager: 8.4.1
+* datadog-kube-dns: 7.4.0
+* datadog-kube-metrics-server: 6.4.0
+* datadog-kube-proxy: 9.4.0
+* datadog-kube-scheduler: 7.4.1
+* datadog-kubeflow: 2.4.1
+* datadog-kubelet: 10.4.0
+* datadog-kubernetes-cluster-autoscaler: 3.4.1
+* datadog-kubernetes-state: 10.5.0
+* datadog-kubevirt-api: 2.5.1
+* datadog-kubevirt-controller: 2.4.1
+* datadog-kubevirt-handler: 2.5.1
+* datadog-kuma: 2.4.1
+* datadog-kyototycoon: 4.5.0
+* datadog-kyverno: 3.4.1
+* datadog-lighttpd: 5.5.0
+* datadog-linkerd: 7.4.1
+* datadog-linux-audit-logs: 1.2.0
+* datadog-linux-proc-extras: 4.3.0
+* datadog-litellm: 2.4.1
+* datadog-lustre: 1.5.0
+* datadog-mac-audit-logs: 1.4.1
+* datadog-mapr: 3.4.0
+* datadog-mapreduce: 7.4.0
+* datadog-marathon: 5.4.0
+* datadog-marklogic: 6.5.0
+* datadog-mcache: 6.4.0
+* datadog-mesos-master: 6.4.0
+* datadog-mesos-slave: 6.4.0
+* datadog-microsoft-dns: 1.2.0
+* datadog-microsoft-sysmon: 1.2.0
+* datadog-milvus: 2.5.1
+* datadog-mongo: 10.10.0
+* datadog-mysql: 15.16.1
+* datadog-n8n: 2.0.0
+* datadog-nagios: 3.4.0
+* datadog-network: 5.7.0
+* datadog-nfsstat: 3.5.0
+* datadog-nginx-ingress-controller: 5.4.0
+* datadog-nginx: 9.4.1
+* datadog-nutanix: 1.2.0
+* datadog-nvidia-nim: 2.4.1
+* datadog-nvidia-triton: 3.4.1
+* datadog-octopus-deploy: 2.4.0
+* datadog-openldap: 3.3.0
+* datadog-openmetrics: 7.4.1
+* datadog-openstack-controller: 9.6.1
+* datadog-openstack: 4.3.0
+* datadog-openvpn: 1.2.0
+* datadog-ossec-security: 2.2.0
+* datadog-palo-alto-panorama: 1.2.0
+* datadog-pan-firewall: 3.3.0
+* datadog-pdh-check: 4.6.1
+* datadog-pgbouncer: 8.10.0
+* datadog-php-fpm: 6.4.1
+* datadog-ping-federate: 2.2.0
+* datadog-postfix: 3.4.1
+* datadog-postgres: 23.7.0
+* datadog-powerdns-recursor: 5.4.0
+* datadog-prefect: 1.0.1
+* datadog-presto: 3.4.0
+* datadog-process: 5.5.1
+* datadog-prometheus: 6.3.0
+* datadog-proxmox: 2.5.1
+* datadog-proxysql: 7.7.0
+* datadog-pulsar: 3.6.1
+* datadog-quarkus: 2.4.1
+* datadog-rabbitmq: 8.6.1
+* datadog-ray: 3.4.1
+* datadog-redisdb: 8.8.0
+* datadog-rethinkdb: 5.4.0
+* datadog-riak: 5.5.0
+* datadog-riakcs: 4.13.0
+* datadog-sap-hana: 5.5.0
+* datadog-scylla: 5.4.1
+* datadog-sidekiq: 3.2.0
+* datadog-silk: 4.5.0
+* datadog-silverstripe-cms: 1.8.0
+* datadog-singlestore: 4.6.0
+* datadog-slurm: 2.4.0
+* datadog-snmp: 12.3.2
+* datadog-solr: 2.4.0
+* datadog-sonarqube: 5.6.1
+* datadog-sonatype-nexus: 2.3.0
+* datadog-sonicwall-firewall: 1.3.0
+* datadog-spark: 7.7.1
+* datadog-sqlserver: 23.0.2
+* datadog-squid: 5.4.0
+* datadog-ssh-check: 4.8.0
+* datadog-statsd: 3.3.0
+* datadog-strimzi: 4.4.1
+* datadog-supabase: 2.4.1
+* datadog-supervisord: 4.4.1
+* datadog-suricata: 2.2.0
+* datadog-symantec-endpoint-protection: 1.3.0
+* datadog-system-core: 4.3.0
+* datadog-system-swap: 3.3.0
+* datadog-tcp-check: 6.3.0
+* datadog-teamcity: 7.4.1
+* datadog-tekton: 3.4.1
+* datadog-teleport: 3.4.1
+* datadog-temporal: 4.5.1
+* datadog-tenable: 3.2.0
+* datadog-teradata: 4.3.0
+* datadog-tibco-ems: 2.5.0
+* datadog-tls: 5.6.2
+* datadog-tokumx: 3.6.1
+* datadog-tomcat: 4.3.0
+* datadog-torchserve: 4.4.1
+* datadog-traefik-mesh: 3.5.0
+* datadog-traffic-server: 3.6.0
+* datadog-twemproxy: 3.3.0
+* datadog-twistlock: 6.4.0
+* datadog-varnish: 4.4.1
+* datadog-vault: 7.4.1
+* datadog-velero: 3.4.1
+* datadog-vertica: 6.5.0
+* datadog-vllm: 3.4.1
+* datadog-voltdb: 6.4.1
+* datadog-vsphere: 9.4.1
+* datadog-watchguard-firebox: 1.2.0
+* datadog-wazuh: 1.3.0
+* datadog-weaviate: 4.4.1
+* datadog-weblogic: 3.3.0
+* datadog-win32-event-log: 5.6.0
+* datadog-windows-performance-counters: 3.4.0
+* datadog-windows-service: 6.7.1
+* datadog-wmi-check: 4.1.1
+* datadog-yarn: 8.4.1
+* datadog-zeek: 1.2.0
+* datadog-zk: 6.5.1
+* datadog-zscaler-private-access: 1.1.0
+
 ## Datadog Agent version 7.79.1
 
 * datadog-active-directory: 4.6.0

From 22e54b1592b65cb46b36eedbe92cab351ccf6557 Mon Sep 17 00:00:00 2001
From: Sarah Witt <sarah.witt@datadoghq.com>
Date: Wed, 3 Jun 2026 13:07:53 -0400
Subject: [PATCH 4/4] Fix kerberos tests (#23923)

* fix kerberos tests

* rename image
---
 .../tests/compose/kerberos/kerberos-agent.yaml             | 2 +-
 .../tests/compose/kerberos/kerberos-nginx/Dockerfile       | 7 ++++---
 datadog_checks_base/tests/compose/kerberos/kerberos.yaml   | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/datadog_checks_base/tests/compose/kerberos/kerberos-agent.yaml b/datadog_checks_base/tests/compose/kerberos/kerberos-agent.yaml
index a3a082b917372..4316159793f94 100644
--- a/datadog_checks_base/tests/compose/kerberos/kerberos-agent.yaml
+++ b/datadog_checks_base/tests/compose/kerberos/kerberos-agent.yaml
@@ -54,7 +54,7 @@ services:
       - "464:8464"
 
   web:
-    image: kerberos-nginx:1.20.2
+    image: kerberos-nginx:1.26.3
     build: ./kerberos-nginx
     environment:
       KRB5_KEYTAB: ${KRB5_KEYTAB}
diff --git a/datadog_checks_base/tests/compose/kerberos/kerberos-nginx/Dockerfile b/datadog_checks_base/tests/compose/kerberos/kerberos-nginx/Dockerfile
index 6c63410f96c49..0bce0e6dcdf8c 100644
--- a/datadog_checks_base/tests/compose/kerberos/kerberos-nginx/Dockerfile
+++ b/datadog_checks_base/tests/compose/kerberos/kerberos-nginx/Dockerfile
@@ -1,4 +1,4 @@
-FROM nginx:1.20.2
+FROM nginx:1.26.3
 
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -15,11 +15,12 @@ RUN apt-get update -y -qq && apt-get install -y --no-install-recommends \
   git
 
 RUN cd /usr/src && mkdir nginx \
-  && curl -kfSL https://nginx.org/download/nginx-1.20.2.tar.gz -o nginx.tar.gz \
+  && curl -kfSL https://nginx.org/download/nginx-1.26.3.tar.gz -o nginx.tar.gz \
   && tar -xzf nginx.tar.gz -C nginx --strip-components=1
 
 RUN cd /usr/src/nginx \
-  && git clone http://github.com/stnoonan/spnego-http-auth-nginx-module.git
+  && git clone https://github.com/stnoonan/spnego-http-auth-nginx-module.git \
+  && git -C spnego-http-auth-nginx-module checkout 7fa3864a86d5
 
 RUN cd /usr/src/nginx \
   && ./configure --with-compat --add-dynamic-module=spnego-http-auth-nginx-module \
diff --git a/datadog_checks_base/tests/compose/kerberos/kerberos.yaml b/datadog_checks_base/tests/compose/kerberos/kerberos.yaml
index d8007a4bd4c11..c6c89a352cc60 100644
--- a/datadog_checks_base/tests/compose/kerberos/kerberos.yaml
+++ b/datadog_checks_base/tests/compose/kerberos/kerberos.yaml
@@ -26,7 +26,7 @@ services:
       - "464:8464"
 
   web:
-    image: kerberos-nginx:1.20.2
+    image: kerberos-nginx:1.26.3
     build: ./kerberos-nginx
     container_name: kerberos-nginx
     environment: