Commit 8045ade

feat: add full ClickHouse support - remove all skip_targets markers (CORE-397) (#934)
* chore: remove all skip_targets(["clickhouse"]) markers from test files

  Remove ClickHouse from skip_targets in all integration test files to enable full
  ClickHouse support testing. For multi-target skip lists (e.g. in test_schema_changes.py
  and test_exposure_schema_validity.py), only 'clickhouse' was removed while keeping the
  other targets. Also remove now-unused 'import pytest' statements in files where pytest
  was only imported for the skip_targets decorator. (CORE-397)

* ci: temporarily limit CI matrix to clickhouse-only for iteration

  Reduce the warehouse-type matrix to only [clickhouse] to enable fast iteration on
  ClickHouse test fixes. Will be restored to the full matrix once all ClickHouse tests
  pass. (CORE-397)

* fix: use NOT IN instead of LEFT JOIN IS NULL for ClickHouse compatibility

  ClickHouse LEFT OUTER JOIN produces default values (e.g. 1970-01-01 for DateTime)
  instead of NULL for unmatched rows, causing the anti-join pattern to fail. Changed the
  missing_bucket_starts CTE to use NOT IN, which works correctly on all databases.

* fix: ClickHouse Nullable(Float32) cast + HTTP API seed null fix

  - Use Nullable(Float32) in clickhouse__standard_deviation and clickhouse__variance to
    handle CASE expressions that return NULL
  - Add _fix_clickhouse_seed_nulls() to rebuild seed tables with proper Nullable types
    using the ClickHouse HTTP API with the nullIf() function
  - Configure ClickHouse Docker with join_use_nulls=1 and mutations_sync=1
  - Fix an unused-variable lint warning in dbt_project.py

* fix: address CodeRabbit review + revert NOT IN back to LEFT JOIN

  - Revert the NOT IN subquery back to LEFT JOIN IS NULL (join_use_nulls=1 handles NULLs)
  - Add _fix_seed_if_needed to seed_context for the ClickHouse NULL fix
  - Add try/finally for cleanup in the table-rebuild sequence
  - Handle Nullable wrapping to avoid Nullable(Nullable(...))
  - Handle FixedString/LowCardinality string variants
  - Add a warning when cols_result is empty
  - Backtick-quote column names in ClickHouse ALTER statements

* fix: address CodeRabbit review round 2 - env vars, timeout, SQL injection guard,
  mutations_sync

* fix: ClickHouse full_names adapter.dispatch, seasonality macros, event freshness
  Nullable cast

* fix: ClickHouse event_freshness timediff NULL handling + list_concat Nullable
  dimension cast

* fix: dynamically resolve the ClickHouse schema from dbt profiles.yml instead of
  hardcoding 'default'

* ci: restore full CI matrix with all warehouse types

* refactor: extract ClickHouse seed repair utils + dispatch empty-string NULL macro

* refactor: remove unused clickhouse__ dispatch from replace_empty_strings_with_nulls

  The macro is only called for BigQuery fusion seeds. ClickHouse seed NULL repair is
  handled by fix_clickhouse_seed_nulls() in clickhouse_utils.py via the HTTP API
  (covers all column types, not just strings).

* ci: retrigger CI to verify flaky test_seed_group_attribute failure

* refactor: replace clickhouse_utils.py with ClickHouseDirectSeeder

  - Add ClickHouseDirectSeeder to data_seeder.py: creates tables with Nullable(String)
    columns directly via the dbt adapter, bypassing dbt seed and eliminating the need
    for post-hoc NULL repair
  - Add execute_sql() and a schema_name property to AdapterQueryRunner
  - DbtProject._create_seeder() auto-selects ClickHouseDirectSeeder when the target is
    'clickhouse'
  - Delete clickhouse_utils.py (the HTTP API is no longer needed for seeding)
  - Update the replace_empty_strings_with_nulls.sql comment

* fix: add type inference to ClickHouseDirectSeeder

  Infer ClickHouse column types from Python values instead of using Nullable(String)
  for all columns. This preserves proper numeric types (Int64, Float64) so that
  Elementary's numeric monitors (average, zero_count) and schema change detection
  (type_changed) work correctly.

  - _infer_column_type(): examines Python types (bool → UInt8, int → Int64,
    float → Float64, str → String), all wrapped in Nullable()
  - _escape(): returns unquoted literals for numeric/boolean types
  - seed(): logs the inferred column types for debugging

* fix: treat booleans as strings in ClickHouseDirectSeeder

  dbt seed writes Python True/False as 'True'/'False' strings in CSV, so ClickHouse
  stores them as String columns. Match this behavior in the direct seeder so
  count_true/count_false monitors work correctly.

  - Remove Nullable(UInt8) inference for booleans (fall through to String)
  - Escape True/False as quoted strings 'True'/'False'

* fix: use Nullable(Bool) for boolean columns in ClickHouseDirectSeeder

  dbt seed infers True/False CSV values as boolean; dbt-clickhouse maps this to Bool
  (an alias for UInt8). Match this behavior so count_true and count_false monitors
  work correctly.

  - Infer Nullable(Bool) for all-boolean columns
  - Escape True/False as ClickHouse Bool literals (true/false)

* fix: write CSV for dbt node discovery in ClickHouseDirectSeeder

  The direct seeder bypasses dbt seed but still needs a CSV file on disk so that dbt
  can discover the seed node for {{ ref() }} resolution during run_operation. Without
  it, queries referencing the seed table fail with 'node not found'.

  - Add seeds_dir_path to ClickHouseDirectSeeder.__init__
  - Write the CSV before creating the table; delete it in a finally block
  - Pass seeds_dir_path from DbtProject._create_seeder()

* refactor: remove run_operation retry logic from the run_query path

  The retry masked non-transient errors (e.g. 'node not found') by retrying them
  pointlessly. Since most queries now use the direct adapter path (AdapterQueryRunner),
  the retry is no longer needed. If the log-capture issue resurfaces, we can add a
  proper fix that distinguishes transient from non-transient failures.

* docs: add a comment explaining why clickhouse__has_temp_table_support returns false

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Itamar Hartstein <haritamar@gmail.com>
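The boolean-handling commits above hinge on a Python quirk: `bool` is a subclass of `int`, so a naive `isinstance(v, int)` check classifies `True` and `False` as integers. A minimal sketch of the check ordering (hypothetical helper name, mirroring the logic the commit message describes, not the real seeder):

```python
def infer_type(values):
    """Classify a column's values (hypothetical helper, illustrating why
    booleans must be checked before integers)."""
    non_null = [v for v in values if v is not None]
    if not non_null:
        return "String"
    # bool is a subclass of int in Python, so check it first; otherwise
    # True/False would be classified as integers.
    if all(isinstance(v, bool) for v in non_null):
        return "Bool"
    if all(isinstance(v, int) and not isinstance(v, bool) for v in non_null):
        return "Int64"
    return "String"

# isinstance(True, int) is True, which is exactly the trap being avoided.
print(infer_type([True, False, None]))  # Bool, not Int64
```

Reversing the first two checks would make every all-boolean column come out as `Int64`, which is why the diff below orders them this way.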
1 parent 3111418 commit 8045ade

34 files changed

Lines changed: 323 additions & 265 deletions

integration_tests/dbt_project/macros/replace_empty_strings_with_nulls.sql

Lines changed: 7 additions & 3 deletions
@@ -1,16 +1,20 @@
+{# This macro is only used for BigQuery fusion seeds (see dbt_project.py _fix_seed_if_needed).
+   ClickHouse uses ClickHouseDirectSeeder (data_seeder.py) which creates Nullable(String)
+   columns directly, so no post-hoc repair is needed. #}
 {% macro replace_empty_strings_with_nulls(table_name) %}
   {% set relation = ref(table_name) %}
   {% set columns = adapter.get_columns_in_relation(relation) %}

   {% for col in columns %}
     {% set data_type = elementary.get_column_data_type(col) %}
     {% set normalized_data_type = elementary.normalize_data_type(data_type) %}
-
+
     {% if normalized_data_type == "string" %}
+      {% set quoted_col = adapter.quote(col["name"]) %}
       {% set update_query %}
         update {{ relation }}
-        set {{ col["name"] }} = NULL
-        where {{ col["name"] }} = ''
+        set {{ quoted_col }} = NULL
+        where {{ quoted_col }} = ''
       {% endset %}
       {% do elementary.run_query(update_query) %}
     {% endif %}

integration_tests/docker-compose.yml

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ services:
       - "9000:9000"
     volumes:
       - clickhouse:/var/lib/clickhouse
+      - ./docker/clickhouse/users.xml:/etc/clickhouse-server/users.d/elementary.xml
     environment:
      CLICKHOUSE_DB: default
      CLICKHOUSE_USER: default
integration_tests/docker/clickhouse/users.xml

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+<clickhouse>
+    <profiles>
+        <default>
+            <join_use_nulls>1</join_use_nulls>
+            <mutations_sync>1</mutations_sync>
+        </default>
+    </profiles>
+</clickhouse>

integration_tests/tests/adapter_query_runner.py

Lines changed: 10 additions & 0 deletions
@@ -239,6 +239,16 @@ def has_non_ref_jinja(query: str) -> bool:
         stripped = _SOURCE_PATTERN.sub("", stripped)
         return bool(_JINJA_EXPR_PATTERN.search(stripped))

+    def execute_sql(self, sql: str) -> None:
+        """Execute a SQL statement that does not return results (DDL/DML)."""
+        with self._adapter.connection_named("execute_sql"):
+            self._adapter.execute(sql, fetch=False)
+
+    @property
+    def schema_name(self) -> str:
+        """Return the base schema name from the adapter credentials."""
+        return self._adapter.config.credentials.schema
+
     def run_query(self, prerendered_query: str) -> List[Dict[str, Any]]:
         """Render Jinja refs/sources and execute a query, returning rows as dicts.

integration_tests/tests/data_seeder.py

Lines changed: 126 additions & 2 deletions
@@ -1,12 +1,13 @@
 import csv
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Generator, List
+from typing import TYPE_CHECKING, Generator, List

 from elementary.clients.dbt.base_dbt_runner import BaseDbtRunner
 from logger import get_logger

-# TODO: Write more performant data seeders per adapter.
+if TYPE_CHECKING:
+    from adapter_query_runner import AdapterQueryRunner

 logger = get_logger(__name__)

@@ -48,3 +49,126 @@ def seed(self, data: List[dict], table_name: str) -> Generator[None, None, None]
             yield
         finally:
             seed_path.unlink()
+
+
+# Maximum number of rows per INSERT VALUES statement.
+_INSERT_BATCH_SIZE = 500
+
+
+class ClickHouseDirectSeeder:
+    """Fast seeder for ClickHouse: executes CREATE TABLE + INSERT directly.
+
+    Bypasses the ``dbt seed`` *subprocess* (and its post-hoc NULL repair),
+    but still writes a CSV file to the seeds directory so that dbt can
+    discover the seed node for ``{{ ref() }}`` resolution during
+    ``run_operation``.
+
+    Column types are inferred from the Python values in the seed data and
+    wrapped in ``Nullable()`` so that NULL values are preserved correctly
+    (ClickHouse columns are non-Nullable by default).
+    """
+
+    def __init__(
+        self,
+        query_runner: "AdapterQueryRunner",
+        schema: str,
+        seeds_dir_path: Path,
+    ) -> None:
+        self._query_runner = query_runner
+        self._schema = schema
+        self._seeds_dir_path = seeds_dir_path
+
+    @staticmethod
+    def _infer_column_type(values: List[object]) -> str:
+        """Infer a ClickHouse column type from a list of Python values.
+
+        Examines non-None, non-empty-string values and returns a
+        ``Nullable(...)`` type string. Falls back to ``Nullable(String)``
+        when all values are None/empty or when types are mixed.
+        """
+        non_null = [v for v in values if v is not None and v != ""]
+        if not non_null:
+            return "Nullable(String)"
+
+        # bool is a subclass of int in Python, so check it first.
+        # dbt seed infers "True"/"False" CSV values as boolean; dbt-clickhouse
+        # maps this to Bool (alias for UInt8).
+        if all(isinstance(v, bool) for v in non_null):
+            return "Nullable(Bool)"
+        if all(isinstance(v, int) and not isinstance(v, bool) for v in non_null):
+            return "Nullable(Int64)"
+        if all(
+            isinstance(v, (int, float)) and not isinstance(v, bool) for v in non_null
+        ):
+            return "Nullable(Float64)"
+        return "Nullable(String)"
+
+    @staticmethod
+    def _escape(value: object) -> str:
+        """Escape a value for a ClickHouse SQL literal.
+
+        Returns ``NULL`` for None / empty-string, unquoted literals for
+        numeric / boolean types, and a quoted+escaped string otherwise.
+        """
+        if value is None or (isinstance(value, str) and value == ""):
+            return "NULL"
+        # Booleans → ClickHouse Bool literals (true/false).
+        if isinstance(value, bool):
+            return "true" if value else "false"
+        if isinstance(value, (int, float)):
+            return str(value)
+        text = str(value)
+        text = text.replace("\\", "\\\\")
+        text = text.replace("'", "\\'")
+        return f"'{text}'"
+
+    @contextmanager
+    def seed(self, data: List[dict], table_name: str) -> Generator[None, None, None]:
+        """Create a table with correctly-typed Nullable columns and insert data.
+
+        A CSV file is written to the seeds directory so that dbt can
+        discover the seed node for ``{{ ref() }}`` resolution. The file
+        is removed when the context manager exits.
+        """
+        columns = list(data[0].keys())
+        col_types = {
+            col: self._infer_column_type([row.get(col) for row in data])
+            for col in columns
+        }
+        col_defs = ", ".join(f"`{col}` {col_types[col]}" for col in columns)
+        fq_table = f"`{self._schema}`.`{table_name}`"
+
+        # Write a CSV so dbt discovers the seed node (needed for {{ ref() }}).
+        seed_path = self._seeds_dir_path / f"{table_name}.csv"
+        with seed_path.open("w") as f:
+            writer = csv.DictWriter(f, fieldnames=columns)
+            writer.writeheader()
+            writer.writerows(data)
+
+        try:
+            self._query_runner.execute_sql(f"DROP TABLE IF EXISTS {fq_table}")
+            self._query_runner.execute_sql(
+                f"CREATE TABLE {fq_table} ({col_defs}) "
+                f"ENGINE = MergeTree() ORDER BY tuple()"
+            )
+
+            for batch_start in range(0, len(data), _INSERT_BATCH_SIZE):
+                batch = data[batch_start : batch_start + _INSERT_BATCH_SIZE]
+                rows_sql = ", ".join(
+                    "(" + ", ".join(self._escape(row.get(c)) for c in columns) + ")"
+                    for row in batch
+                )
+                self._query_runner.execute_sql(
+                    f"INSERT INTO {fq_table} VALUES {rows_sql}"
+                )
+
+            logger.info(
+                "ClickHouseDirectSeeder: loaded %d rows into %s (%s)",
+                len(data),
+                fq_table,
+                ", ".join(f"{c}: {t}" for c, t in col_types.items()),
+            )
+
+            yield
+        finally:
+            seed_path.unlink(missing_ok=True)
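The `_escape` helper in the diff above replaces backslashes before single quotes. That ordering matters: escaping quotes first would leave the backslash introduced by the quote-escaping exposed to the backslash-doubling pass. A standalone sketch of the same two-step escaping (re-stated here for illustration, not imported from the test suite):

```python
def escape_string_literal(text: str) -> str:
    """Escape a string for use as a single-quoted SQL literal.

    Backslashes are doubled first, then quotes are escaped; reversing
    the order would also double the backslash that the quote-escaping
    itself inserts, corrupting the literal.
    """
    text = text.replace("\\", "\\\\")
    text = text.replace("'", "\\'")
    return f"'{text}'"

print(escape_string_literal("it's"))  # 'it\'s'
```

Numeric and boolean values skip this path entirely in the seeder, since they are emitted as unquoted literals.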

integration_tests/tests/dbt_project.py

Lines changed: 24 additions & 58 deletions
@@ -7,29 +7,16 @@
 from uuid import uuid4

 from adapter_query_runner import AdapterQueryRunner, UnsupportedJinjaError
-from data_seeder import DbtDataSeeder
+from data_seeder import ClickHouseDirectSeeder, DbtDataSeeder
 from dbt_utils import get_database_and_schema_properties
 from elementary.clients.dbt.base_dbt_runner import BaseDbtRunner
 from elementary.clients.dbt.factory import RunnerMethod, create_dbt_runner
 from logger import get_logger
 from ruamel.yaml import YAML
-from tenacity import (
-    RetryCallState,
-    retry,
-    retry_if_result,
-    stop_after_attempt,
-    wait_fixed,
-)

 PYTEST_XDIST_WORKER = os.environ.get("PYTEST_XDIST_WORKER", None)
 SCHEMA_NAME_SUFFIX = f"_{PYTEST_XDIST_WORKER}" if PYTEST_XDIST_WORKER else ""

-# Retry settings for the run_operation fallback path. run_operation() can
-# intermittently return an empty list when the MACRO_RESULT_PATTERN log line
-# is not captured from dbt's output.
-_RUN_QUERY_MAX_RETRIES = 3
-_RUN_QUERY_RETRY_DELAY_SECONDS = 0.5
-
 _DEFAULT_VARS = {
     "disable_dbt_invocation_autoupload": True,
     "disable_dbt_artifacts_autoupload": True,

@@ -92,51 +79,21 @@ def run_query(self, prerendered_query: str):
         except UnsupportedJinjaError:
             logger.debug("Query contains complex Jinja; falling back to run_operation")

-        # Slow path: full Jinja rendering via run_operation (with retry).
+        # Slow path: full Jinja rendering via run_operation.
         return self._run_query_with_run_operation(prerendered_query)

-    @staticmethod
-    def _log_retry(retry_state: RetryCallState) -> None:
-        """Tenacity before_sleep callback — logs each retry with attempt number."""
-        logger.warning(
-            "run_operation('elementary.render_run_query') returned no output; "
-            "retry %d/%d in %.1fs",
-            retry_state.attempt_number,
-            _RUN_QUERY_MAX_RETRIES,
-            _RUN_QUERY_RETRY_DELAY_SECONDS,
-        )
-
-    @retry(
-        retry=retry_if_result(lambda r: r is None),
-        stop=stop_after_attempt(_RUN_QUERY_MAX_RETRIES),
-        wait=wait_fixed(_RUN_QUERY_RETRY_DELAY_SECONDS),
-        before_sleep=_log_retry.__func__,
-        reraise=True,
-    )
-    def _run_operation_with_retry(self, prerendered_query: str) -> Optional[list]:
-        """Call run_operation and return the parsed result, or None to trigger retry."""
+    def _run_query_with_run_operation(self, prerendered_query: str):
+        """Execute a query via run_operation."""
         run_operation_results = self.dbt_runner.run_operation(
             "elementary.render_run_query",
             macro_args={"prerendered_query": prerendered_query},
         )
-        if run_operation_results:
-            return json.loads(run_operation_results[0])
-        return None
-
-    def _run_query_with_run_operation(self, prerendered_query: str):
-        """Execute a query via run_operation with retry on empty output.
-
-        run_operation() can intermittently return an empty list when the
-        MACRO_RESULT_PATTERN log line is not captured from dbt's output.
-        """
-        result = self._run_operation_with_retry(prerendered_query)
-        if result is None:
+        if not run_operation_results:
             raise RuntimeError(
-                f"run_operation('elementary.render_run_query') returned no output "
-                f"after {_RUN_QUERY_MAX_RETRIES} attempts. "
+                f"run_operation('elementary.render_run_query') returned no output. "
                 f"Query: {prerendered_query!r}"
             )
-        return result
+        return json.loads(run_operation_results[0])

     @staticmethod
     def read_table_query(

@@ -326,15 +283,25 @@ def test(
         }
         return [test_result] if multiple_results else test_result

-    def seed(self, data: List[dict], table_name: str):
-        with DbtDataSeeder(
+    def _create_seeder(
+        self,
+    ) -> Union[DbtDataSeeder, "ClickHouseDirectSeeder"]:
+        """Return the appropriate seeder for the current target."""
+        if self.target == "clickhouse":
+            runner = self._get_query_runner()
+            schema = runner.schema_name + SCHEMA_NAME_SUFFIX
+            return ClickHouseDirectSeeder(runner, schema, self.seeds_dir_path)
+        return DbtDataSeeder(
             self.dbt_runner, self.project_dir_path, self.seeds_dir_path
-        ).seed(data, table_name):
+        )
+
+    def seed(self, data: List[dict], table_name: str):
+        with self._create_seeder().seed(data, table_name):
             self._fix_seed_if_needed(table_name)

-    def _fix_seed_if_needed(self, table_name: str):
+    def _fix_seed_if_needed(self, table_name: str) -> None:
         # Hack for BigQuery - seems like we get empty strings instead of nulls in seeds, so we
-        # fix them here
+        # fix them here.
         if self.runner_method == RunnerMethod.FUSION and self.target == "bigquery":
             self.dbt_runner.run_operation(
                 "elementary_tests.replace_empty_strings_with_nulls",

@@ -345,9 +312,8 @@ def _fix_seed_if_needed(self, table_name: str):
     def seed_context(
         self, data: List[dict], table_name: str
     ) -> Generator[None, None, None]:
-        with DbtDataSeeder(
-            self.dbt_runner, self.project_dir_path, self.seeds_dir_path
-        ).seed(data, table_name):
+        with self._create_seeder().seed(data, table_name):
+            self._fix_seed_if_needed(table_name)
             yield

     @contextmanager
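The `_create_seeder` change above routes seeding by target name: ClickHouse gets the direct-adapter seeder, everything else keeps the dbt-seed path. A minimal sketch of this dispatch pattern (class bodies are stand-ins, not the real test harness):

```python
class DbtDataSeeder:
    """Stand-in for the generic dbt-seed path."""
    name = "dbt"


class ClickHouseDirectSeeder:
    """Stand-in for the direct-adapter path."""
    name = "clickhouse-direct"


def create_seeder(target: str):
    # Only ClickHouse needs the direct seeder (to get Nullable columns
    # and typed inserts); every other target falls back to dbt seed.
    if target == "clickhouse":
        return ClickHouseDirectSeeder()
    return DbtDataSeeder()
```

Keeping the selection in one factory method means both `seed()` and `seed_context()` pick up the right seeder without duplicating the target check.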

integration_tests/tests/test_all_columns_anomalies.py

Lines changed: 0 additions & 10 deletions
@@ -13,8 +13,6 @@
 }


-# Anomalies currently not supported on ClickHouse
-@pytest.mark.skip_targets(["clickhouse"])
 def test_anomalyless_all_columns_anomalies(test_id: str, dbt_project: DbtProject):
     utc_today = datetime.utcnow().date()
     data: List[Dict[str, Any]] = [

@@ -31,8 +29,6 @@ def test_anomalyless_all_columns_anomalies(test_id: str, dbt_project: DbtProject
     assert all([res["status"] == "pass" for res in test_results])


-# Anomalies currently not supported on ClickHouse
-@pytest.mark.skip_targets(["clickhouse"])
 def test_anomalous_all_columns_anomalies(test_id: str, dbt_project: DbtProject):
     utc_today = datetime.utcnow().date()
     test_date, *training_dates = generate_dates(base_date=utc_today - timedelta(1))

@@ -57,8 +53,6 @@ def test_anomalous_all_columns_anomalies(test_id: str, dbt_project: DbtProject):
     assert col_to_status == {"superhero": "fail", TIMESTAMP_COLUMN: "pass"}


-# Anomalies currently not supported on ClickHouse
-@pytest.mark.skip_targets(["clickhouse"])
 def test_all_columns_anomalies_with_where_parameter(
     test_id: str, dbt_project: DbtProject
 ):

@@ -128,8 +122,6 @@ def test_all_columns_anomalies_with_where_parameter(
 }


-# Anomalies currently not supported on ClickHouse
-@pytest.mark.skip_targets(["clickhouse"])
 def test_anomalyless_all_columns_anomalies_all_monitors_sanity(
     test_id: str, dbt_project: DbtProject
 ):

@@ -155,8 +147,6 @@ def test_anomalyless_all_columns_anomalies_all_monitors_sanity(
     assert all([res["status"] == "pass" for res in test_results])


-# Anomalies currently not supported on ClickHouse
-@pytest.mark.skip_targets(["clickhouse"])
 @pytest.mark.parametrize(
     "exclude_detection,expected_status",
     [
