From 9e35d352a153787a08f9639786bc99b486446826 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Sun, 15 Mar 2026 20:04:50 +0100 Subject: [PATCH 01/24] feat: add variable substitution support for check definitions --- src/databricks/labs/dqx/engine.py | 38 ++- src/databricks/labs/dqx/utils.py | 129 ++++++++++ tests/integration/test_apply_checks.py | 159 ++++++++++++ .../test_apply_checks_and_save_in_table.py | 57 +++++ tests/unit/test_utils.py | 237 ++++++++++++++++++ 5 files changed, 614 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index 792fa5328..88c3d17ea 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -47,7 +47,7 @@ from databricks.labs.dqx.telemetry import telemetry_logger, log_telemetry, log_dataframe_telemetry from databricks.sdk import WorkspaceClient from databricks.labs.dqx.errors import InvalidCheckError, InvalidConfigError, InvalidParameterError -from databricks.labs.dqx.utils import list_tables, safe_strip_file_from_path +from databricks.labs.dqx.utils import list_tables, safe_strip_file_from_path, apply_variables from databricks.labs.dqx.io import is_one_time_trigger logger = logging.getLogger(__name__) @@ -220,6 +220,7 @@ def apply_checks_by_metadata( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, + variables: dict[str, Any] | None = None, ) -> DataFrame | tuple[DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame. @@ -232,11 +233,14 @@ def apply_checks_by_metadata( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. + variables: Optional mapping of placeholder names to replacement values. 
Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation and deserialization. Returns: A DataFrame with errors and warnings result columns and an optional Observation which tracks data quality summary metrics. Summary metrics are returned by any `DQEngine` with an `observer` specified. """ + checks = apply_variables(checks, variables) dq_rule_checks = deserialize_checks(checks, custom_check_functions) return self.apply_checks(df, dq_rule_checks, ref_dfs) @@ -247,6 +251,7 @@ def apply_checks_by_metadata_and_split( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, + variables: dict[str, Any] | None = None, ) -> tuple[DataFrame, DataFrame] | tuple[DataFrame, DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame and split the results into two DataFrames ("good" and "bad"). @@ -260,6 +265,8 @@ def apply_checks_by_metadata_and_split( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation and deserialization. Returns: A tuple of two DataFrames: "good" (may include rows with warnings but no result columns) and "bad" (rows @@ -269,6 +276,7 @@ def apply_checks_by_metadata_and_split( Raises: InvalidCheckError: If any of the checks are invalid. 
""" + checks = apply_variables(checks, variables) dq_rule_checks = deserialize_checks(checks, custom_check_functions) good_df, bad_df, *observations = self.apply_checks_and_split(df, dq_rule_checks, ref_dfs) @@ -283,6 +291,7 @@ def validate_checks( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, validate_custom_check_functions: bool = True, + variables: dict[str, Any] | None = None, ) -> ChecksValidationStatus: """ Validate checks defined as metadata to ensure they conform to the expected structure and types. @@ -294,10 +303,13 @@ def validate_checks( checks: List of checks to apply to the DataFrame. Each check should be a dictionary. custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). validate_custom_check_functions: If True, validate custom check functions. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation. Returns: ChecksValidationStatus indicating the validation result. """ + checks = apply_variables(checks, variables) return ChecksValidator.validate_checks(checks, custom_check_functions, validate_custom_check_functions) def get_invalid(self, df: DataFrame) -> DataFrame: @@ -614,6 +626,7 @@ def apply_checks_by_metadata( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, + variables: dict[str, Any] | None = None, ) -> DataFrame | tuple[DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame. @@ -626,13 +639,15 @@ def apply_checks_by_metadata( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. + variables: Optional mapping of placeholder names to replacement values. 
Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation and deserialization. Returns: A DataFrame with errors and warnings result columns and an optional Observation which tracks data quality summary metrics. Summary metrics are returned by any `DQEngine` with an `observer` specified. """ log_dataframe_telemetry(self.ws, self.spark, df) - return self._engine.apply_checks_by_metadata(df, checks, custom_check_functions, ref_dfs) + return self._engine.apply_checks_by_metadata(df, checks, custom_check_functions, ref_dfs, variables) @telemetry_logger("engine", "apply_checks_by_metadata_and_split") def apply_checks_by_metadata_and_split( @@ -641,6 +656,7 @@ def apply_checks_by_metadata_and_split( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, + variables: dict[str, Any] | None = None, ) -> tuple[DataFrame, DataFrame] | tuple[DataFrame, DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame and split the results into two DataFrames ("good" and "bad"). @@ -654,6 +670,8 @@ def apply_checks_by_metadata_and_split( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation and deserialization. Returns: A tuple of two DataFrames: "good" (may include rows with warnings but no result columns) and "bad" (rows @@ -661,7 +679,7 @@ def apply_checks_by_metadata_and_split( quality summary metrics. Summary metrics are returned by any `DQEngine` with an `observer` specified. 
""" log_dataframe_telemetry(self.ws, self.spark, df) - return self._engine.apply_checks_by_metadata_and_split(df, checks, custom_check_functions, ref_dfs) + return self._engine.apply_checks_by_metadata_and_split(df, checks, custom_check_functions, ref_dfs, variables) @telemetry_logger("engine", "apply_checks_and_save_in_table") def apply_checks_and_save_in_table( @@ -758,6 +776,7 @@ def apply_checks_by_metadata_and_save_in_table( custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, checks_location: str | None = None, + variables: dict[str, Any] | None = None, ) -> None: """ Apply metadata-defined data quality checks to input data and save results. @@ -782,6 +801,8 @@ def apply_checks_by_metadata_and_save_in_table( to callables/modules (e.g., globals()). ref_dfs: Optional reference DataFrames used by checks. checks_location: Optional location of the checks. Used for reporting in the summary metrics table only. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation and deserialization. 
""" logger.info(f"Applying checks to {input_config.location}") @@ -792,7 +813,9 @@ def apply_checks_by_metadata_and_save_in_table( quarantine_streaming_query = None if quarantine_config: - check_result = self.apply_checks_by_metadata_and_split(df, checks, custom_check_functions, ref_dfs) + check_result = self.apply_checks_by_metadata_and_split( + df, checks, custom_check_functions, ref_dfs, variables + ) if self._engine.observer: good_df, bad_df, batch_observation = check_result else: @@ -801,7 +824,7 @@ def apply_checks_by_metadata_and_save_in_table( quarantine_streaming_query = save_dataframe_as_table(bad_df, quarantine_config) target_streaming_query = quarantine_streaming_query else: - check_result = self.apply_checks_by_metadata(df, checks, custom_check_functions, ref_dfs) + check_result = self.apply_checks_by_metadata(df, checks, custom_check_functions, ref_dfs, variables) if self._engine.observer: checked_df, batch_observation = check_result else: @@ -958,6 +981,7 @@ def validate_checks( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, validate_custom_check_functions: bool = True, + variables: dict[str, Any] | None = None, ) -> ChecksValidationStatus: """ Validate checks defined as metadata to ensure they conform to the expected structure and types. @@ -969,11 +993,13 @@ def validate_checks( checks: List of checks to apply to the DataFrame. Each check should be a dictionary. custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). validate_custom_check_functions: If True, validate custom check functions. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation. Returns: ChecksValidationStatus indicating the validation result. 
""" - return DQEngineCore.validate_checks(checks, custom_check_functions, validate_custom_check_functions) + return DQEngineCore.validate_checks(checks, custom_check_functions, validate_custom_check_functions, variables) def get_invalid(self, df: DataFrame) -> DataFrame: """ diff --git a/src/databricks/labs/dqx/utils.py b/src/databricks/labs/dqx/utils.py index 5ff8fa168..f8e13bb2e 100644 --- a/src/databricks/labs/dqx/utils.py +++ b/src/databricks/labs/dqx/utils.py @@ -5,6 +5,7 @@ import re from decimal import Decimal from importlib.util import find_spec +from collections.abc import Callable, Generator from typing import Any from fnmatch import fnmatch from pathlib import Path @@ -30,6 +31,8 @@ COLUMN_NORMALIZE_EXPRESSION = re.compile("[^a-zA-Z0-9]+") COLUMN_PATTERN = re.compile(r"Column<'(.*?)(?: AS (\w+))?'>$", re.DOTALL) INVALID_COLUMN_NAME_PATTERN = re.compile(r"[\s,;{}\(\)\n\t=]+") +_UNRESOLVED_PLACEHOLDER_PATTERN = re.compile(r"\{\{.*?\}\}") +_SCALAR_VARIABLE_TYPES = (str, int, float, bool, Decimal) def get_column_name_or_alias( @@ -527,6 +530,132 @@ def missing_required_packages(packages: list[str]) -> bool: return not all(find_spec(spec) for spec in packages) +def _literal_replacer(val: str) -> Callable[[re.Match], str]: + """Return a ``re.sub`` replacer that always returns *val* literally.""" + + def replacer(_: re.Match) -> str: + return val + + return replacer + + +def _replace_template(text: str, variables: dict[str, str]) -> str: + """Replace ``{{ key }}`` placeholders in *text* with values from *variables*. + + Tolerates whitespace inside braces (e.g. ``{{ key }}``, ``{{key}}``). + Uses a lambda replacement to avoid backslash interpretation in values. + + Args: + text: Input string potentially containing ``{{ key }}`` placeholders. + variables: Pre-stringified mapping of placeholder names to values. + + Returns: + String with all matching placeholders replaced. 
+ """ + for key, val in variables.items(): + pattern = r"\{\{\s*" + re.escape(key) + r"\s*\}\}" + text = re.sub(pattern, _literal_replacer(val), text) + return text + + +def _substitute_variables(obj: Any, variables: dict[str, str]) -> Any: + """Recursively replace ``{{ key }}`` placeholders in all string values within *obj*. + + Traverses dicts, lists, and strings. Non-string/non-collection values are + returned unchanged. Dict keys are not substituted. + + Args: + obj: A string, dict, list, or other value to process. + variables: Pre-stringified mapping of placeholder names to values. + + Returns: + A new object with all string values having placeholders replaced. + """ + if isinstance(obj, str): + return _replace_template(obj, variables) + if isinstance(obj, dict): + return {k: _substitute_variables(v, variables) for k, v in obj.items()} + if isinstance(obj, list): + return [_substitute_variables(item, variables) for item in obj] + return obj + + +def _validate_variable_types(variables: dict[str, Any]) -> None: + """Raise :class:`InvalidParameterError` if any variable value is not a supported scalar type.""" + for key, val in variables.items(): + if not isinstance(val, _SCALAR_VARIABLE_TYPES): + raise InvalidParameterError( + f"Variable '{key}' has unsupported type '{type(val).__name__}'. " + f"Only scalar types are supported: str, int, float, bool, Decimal." + ) + + +def apply_variables(checks: list[dict], variables: dict[str, Any] | None) -> list[dict]: + """Apply variable substitution to check definitions. + + Replaces ``{{ key }}`` placeholders in all string values of *checks* with the + corresponding values from *variables*. The original *checks* list is never mutated. + + Variable values must be scalar types (``str``, ``int``, ``float``, ``bool``, + ``Decimal``). Non-string scalars are converted via ``str()`` — for example, + ``{"threshold": 10}`` becomes ``"10"`` in the substituted string. Collection + types (``list``, ``dict``, ``set``, etc.) 
are rejected with + :class:`~databricks.labs.dqx.errors.InvalidParameterError` because their + ``str()`` representation is rarely meaningful in SQL or column expressions. + + Logs a warning for any ``{{ ... }}`` placeholders that remain unresolved after + substitution (e.g. misspelled variable names). + + Args: + checks: List of check definition dictionaries (metadata format). + variables: Mapping of placeholder names to scalar replacement values. + If ``None`` or empty the checks are returned unchanged. + + Returns: + A new list of check dicts with placeholders resolved, or the original list + when no substitution is needed. + + Raises: + InvalidParameterError: If any variable value is not a supported scalar type. + """ + if not variables: + return checks + + _validate_variable_types(variables) + str_variables = {k: str(v) for k, v in variables.items()} + resolved: list[dict] = _substitute_variables(checks, str_variables) + + # Warn about any remaining unresolved placeholders + for check_def in resolved: + for value in _iter_strings(check_def): + if _UNRESOLVED_PLACEHOLDER_PATTERN.search(value): + logger.warning(f"Unresolved placeholder found after variable substitution: '{value}'") + + return resolved + + +def _iter_strings(obj: Any) -> Generator[str, None, None]: + """Yield all string values found recursively in *obj*. + + Traverses dicts (values only) and lists. Non-string leaf values are skipped. + Used to scan resolved check definitions for unresolved ``{{ ... }}`` placeholders. + + Args: + obj: A string, dict, list, or other value to traverse. + + Yields: + Every string value found in the nested structure. + """ + if isinstance(obj, str): + yield obj + elif isinstance(obj, dict): + for value in obj.values(): + yield from _iter_strings(value) + elif isinstance(obj, list): + for item in obj: + yield from _iter_strings(item) + + def get_file_extension(file_path: str | os.PathLike) -> str: """ Extract file extension from a file path. 
diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 3c6065e3c..022395f5e 100755 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -9555,3 +9555,162 @@ def test_apply_checks_by_metadata_skip_checks_with_missing_columns(ws, spark): SCHEMA + complex_cols_schema + REPORTING_COLUMNS, ) assert_df_equality(checked, expected, ignore_nullable=True) + + +def test_apply_checks_by_metadata_with_variables(ws, spark): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) + + checks = [ + { + "criticality": "error", + "check": { + "function": "is_not_null_and_not_empty", + "arguments": {"column": "{{ col }}"}, + }, + }, + ] + variables = {"col": "b"} + + checked = dq_engine.apply_checks_by_metadata(test_df, checks, variables=variables) + + expected = spark.createDataFrame( + [ + [1, 3, 3, None, None], + [ + 2, + None, + 4, + [ + { + "name": "b_is_null_or_empty", + "message": "Column 'b' value is null or empty", + "columns": ["b"], + "filter": None, + "function": "is_not_null_and_not_empty", + "run_time": RUN_TIME, + "run_id": RUN_ID, + "user_metadata": {}, + } + ], + None, + ], + [None, 4, None, None, None], + ], + EXPECTED_SCHEMA, + ) + assert_df_equality(checked, expected, ignore_nullable=True) + + +def test_apply_checks_by_metadata_and_split_with_variables(ws, spark): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) + + checks = [ + { + "criticality": "error", + "name": "{{ col }}_null_check", + "check": { + "function": "is_not_null_and_not_empty", + "arguments": {"column": "{{ col }}"}, + }, + }, + { + "criticality": "warn", + "check": { + "function": "sql_expression", + "arguments": {"expression": "{{ expr_col }} > {{ threshold }}"}, + }, + }, + ] + variables = {"col": "b", 
"expr_col": "a", "threshold": 1} + + good, bad = dq_engine.apply_checks_by_metadata_and_split(test_df, checks, variables=variables) + + # Row [1, 3, 3]: b is not null, a > 1 passes -> good only + # Row [2, None, 4]: b is null (error), a > 1 passes -> bad only + # Row [None, 4, None]: b is not null, a is null so "a > 1" fails (warn) -> both good and bad + assert good.count() == 2 + assert bad.count() == 2 + + +def test_apply_checks_by_metadata_with_variables_name_and_filter(ws, spark): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) + + checks = [ + { + "criticality": "error", + "name": "{{ col }}_greater_than_{{ threshold }}", + "check": { + "function": "sql_expression", + "arguments": {"expression": "{{ col }} > {{ threshold }}"}, + }, + "filter": "{{ filter_col }} IS NOT NULL", + }, + ] + variables = {"col": "a", "threshold": 1, "filter_col": "a"} + + checked = dq_engine.apply_checks_by_metadata(test_df, checks, variables=variables) + + # Row with a=1 should have an error since a > 1 is false + result_rows = checked.collect() + row_a1 = [r for r in result_rows if r["a"] == 1][0] + assert row_a1["_errors"] is not None + assert len(row_a1["_errors"]) == 1 + assert row_a1["_errors"][0]["name"] == "a_greater_than_1" + + # Row with a=2 should have no errors + row_a2 = [r for r in result_rows if r["a"] == 2][0] + assert row_a2["_errors"] is None + + # Row with a=None should have no errors (filtered out) + row_null = [r for r in result_rows if r["a"] is None][0] + assert row_null["_errors"] is None + + +def test_validate_checks_with_variables(ws): + checks = [ + { + "criticality": "{{ crit }}", + "check": { + "function": "is_not_null", + "arguments": {"column": "{{ col }}"}, + }, + }, + ] + variables = {"crit": "error", "col": "b"} + + status = DQEngine.validate_checks(checks, variables=variables) + assert not status.has_errors + + +def 
test_validate_checks_with_variables_invalid_after_substitution(ws): + checks = [ + { + "criticality": "{{ crit }}", + "check": { + "function": "is_not_null", + "arguments": {"column": "b"}, + }, + }, + ] + variables = {"crit": "not_a_valid_criticality"} + + status = DQEngine.validate_checks(checks, variables=variables) + assert status.has_errors + + +def test_validate_checks_without_variables_fails_on_placeholders(ws): + checks = [ + { + "criticality": "{{ crit }}", + "check": { + "function": "is_not_null", + "arguments": {"column": "b"}, + }, + }, + ] + + status = DQEngine.validate_checks(checks) + assert status.has_errors diff --git a/tests/integration/test_apply_checks_and_save_in_table.py b/tests/integration/test_apply_checks_and_save_in_table.py index c6b8b1cca..ee9b7ec85 100644 --- a/tests/integration/test_apply_checks_and_save_in_table.py +++ b/tests/integration/test_apply_checks_and_save_in_table.py @@ -2084,3 +2084,60 @@ def test_apply_checks_and_save_in_tables_with_patterns_and_ref_df(ws, spark, mak schema=expected_schema, ) assert_df_equality(actual_df, expected_df, ignore_nullable=True) + + +def test_apply_checks_by_metadata_and_save_in_table_with_variables(ws, spark, make_schema, make_random): + catalog_name = TEST_CATALOG + schema = make_schema(catalog_name=catalog_name) + input_table = f"{catalog_name}.{schema.name}.{make_random(8).lower()}" + output_table = f"{catalog_name}.{schema.name}.{make_random(8).lower()}" + + test_schema = "a: int, b: int, c: string" + test_df = spark.createDataFrame([[1, 2, "valid"], [None, 3, "error"], [4, None, "warn"]], test_schema) + test_df.write.format("delta").mode("overwrite").saveAsTable(input_table) + + checks = [ + { + "name": "{{ col }}_is_null", + "criticality": "{{ crit }}", + "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}, + }, + ] + variables = {"col": "a", "crit": "error"} + + engine = DQEngine(ws, spark=spark, extra_params=EXTRA_PARAMS) + 
engine.apply_checks_by_metadata_and_save_in_table( + checks=checks, + input_config=InputConfig(location=input_table), + output_config=OutputConfig(location=output_table, mode="overwrite"), + variables=variables, + ) + + actual_df = spark.table(output_table) + expected_schema = test_schema + REPORTING_COLUMNS + expected_df = spark.createDataFrame( + [ + [1, 2, "valid", None, None], + [ + None, + 3, + "error", + [ + { + "name": "a_is_null", + "message": "Column 'a' value is null", + "columns": ["a"], + "filter": None, + "function": "is_not_null", + "run_time": RUN_TIME, + "run_id": RUN_ID, + "user_metadata": {}, + } + ], + None, + ], + [4, None, "warn", None, None], + ], + schema=expected_schema, + ) + assert_df_equality(actual_df, expected_df, ignore_nullable=True) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 2537a8181..1b50a6be7 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,4 +1,6 @@ +import logging from datetime import date, datetime +from decimal import Decimal from typing import Any from pathlib import Path from unittest.mock import Mock @@ -17,6 +19,7 @@ safe_strip_file_from_path, missing_required_packages, get_file_extension, + apply_variables, ) from databricks.labs.dqx.rule import normalize_bound_args from databricks.labs.dqx.errors import InvalidParameterError, InvalidConfigError @@ -414,3 +417,237 @@ def test_get_file_extension_with_path_object(): """Test get_file_extension function with Path object.""" file_path = Path("/path/to/file.json") assert get_file_extension(file_path) == ".json" + + +def test_apply_variables_replaces_all_string_fields(): + checks = [ + { + "criticality": "error", + "name": "{{ col }}_not_null", + "check": { + "function": "is_not_null", + "arguments": {"column": "{{ col }}"}, + }, + "filter": "{{ filter_col }} = 'active'", + } + ] + variables = {"col": "email", "filter_col": "status"} + result = apply_variables(checks, variables) + + assert result[0]["name"] == 
"email_not_null" + assert result[0]["check"]["arguments"]["column"] == "email" + assert result[0]["filter"] == "status = 'active'" + + +def test_apply_variables_none_variables(): + checks = [{"name": "{{ x }}"}] + result = apply_variables(checks, None) + assert result is checks # same object, no copy + assert result[0]["name"] == "{{ x }}" + + +def test_apply_variables_empty_variables(): + checks = [{"name": "{{ x }}"}] + result = apply_variables(checks, {}) + assert result is checks # same object, no copy + assert result[0]["name"] == "{{ x }}" + + +def test_apply_variables_non_string_values_converted(): + checks = [ + { + "check": { + "function": "sql_expression", + "arguments": {"expression": "{{ col }} > {{ threshold }}"}, + }, + } + ] + variables = {"col": "age", "threshold": 18} + result = apply_variables(checks, variables) + assert result[0]["check"]["arguments"]["expression"] == "age > 18" + + +def test_apply_variables_does_not_mutate_original(): + checks = [ + { + "name": "{{ col }}_check", + "check": { + "function": "is_not_null", + "arguments": {"column": "{{ col }}"}, + }, + } + ] + variables = {"col": "name"} + apply_variables(checks, variables) + + # Original must be unchanged + assert checks[0]["name"] == "{{ col }}_check" + assert checks[0]["check"]["arguments"]["column"] == "{{ col }}" + + +def test_apply_variables_nested_dicts(): + checks = [ + { + "check": { + "function": "sql_expression", + "arguments": { + "expression": "{{ col }} IS NOT NULL", + }, + }, + "user_metadata": {"owner": "{{ team }}"}, + } + ] + variables = {"col": "id", "team": "data-eng"} + result = apply_variables(checks, variables) + + assert result[0]["check"]["arguments"]["expression"] == "id IS NOT NULL" + assert result[0]["user_metadata"]["owner"] == "data-eng" + + +def test_apply_variables_partial_replacement(): + checks = [{"name": "{{ p1 }}_greater_than_{{ threshold }}"}] + variables = {"p1": "column1", "threshold": 10} + result = apply_variables(checks, variables) + 
assert result[0]["name"] == "column1_greater_than_10" + + +def test_apply_variables_unresolved_placeholder_warning(caplog): + checks = [{"name": "{{ resolved }}_{{ unresolved }}"}] + variables = {"resolved": "ok"} + with caplog.at_level(logging.WARNING, logger="databricks.labs.dqx.utils"): + result = apply_variables(checks, variables) + + assert result[0]["name"] == "ok_{{ unresolved }}" + assert any("Unresolved placeholder" in msg for msg in caplog.messages) + + +def test_apply_variables_whitespace_tolerance(): + checks = [ + {"a": "{{x}}", "b": "{{ x }}", "c": "{{ x }}"}, + ] + variables = {"x": "val"} + result = apply_variables(checks, variables) + assert result[0]["a"] == "val" + assert result[0]["b"] == "val" + assert result[0]["c"] == "val" + + +def test_apply_variables_non_string_dict_values_untouched(): + checks = [ + { + "criticality": "error", + "check": { + "function": "is_in_list", + "arguments": {"column": "{{ col }}", "allowed": [1, 2, 3]}, + }, + } + ] + variables = {"col": "status"} + result = apply_variables(checks, variables) + assert result[0]["check"]["arguments"]["column"] == "status" + assert result[0]["check"]["arguments"]["allowed"] == [1, 2, 3] + assert result[0]["criticality"] == "error" + + +def test_apply_variables_for_each_column(): + checks = [ + { + "criticality": "error", + "check": { + "function": "is_not_null", + "for_each_column": ["{{ col1 }}", "{{ col2 }}"], + }, + } + ] + variables = {"col1": "first_name", "col2": "last_name"} + result = apply_variables(checks, variables) + assert result[0]["check"]["for_each_column"] == ["first_name", "last_name"] + + +def test_apply_variables_multiple_checks(): + checks = [ + { + "name": "{{ col }}_not_null", + "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}, + }, + { + "name": "{{ col2 }}_not_empty", + "check": {"function": "is_not_empty", "arguments": {"column": "{{ col2 }}"}}, + }, + ] + variables = {"col": "a", "col2": "b"} + result = apply_variables(checks, 
variables) + assert result[0]["name"] == "a_not_null" + assert result[0]["check"]["arguments"]["column"] == "a" + assert result[1]["name"] == "b_not_empty" + assert result[1]["check"]["arguments"]["column"] == "b" + + +def test_apply_variables_empty_checks_list(): + result = apply_variables([], {"col": "x"}) + assert result == [] + + +def test_apply_variables_empty_string_value(): + checks = [{"name": "prefix_{{ col }}_suffix"}] + result = apply_variables(checks, {"col": ""}) + assert result[0]["name"] == "prefix__suffix" + + +def test_apply_variables_value_contains_braces(): + """Variable value itself contains {{ }} — should NOT be re-expanded.""" + checks = [{"expr": "{{ col }}"}] + result = apply_variables(checks, {"col": "{{ other }}"}) + assert result[0]["expr"] == "{{ other }}" + + +def test_apply_variables_key_with_regex_special_chars(): + """Variable keys with regex metacharacters must be escaped properly.""" + checks = [{"name": "{{ col.name }}_check", "filter": "{{ col+1 }} > 0"}] + variables = {"col.name": "revenue", "col+1": "amount"} + result = apply_variables(checks, variables) + assert result[0]["name"] == "revenue_check" + assert result[0]["filter"] == "amount > 0" + + +def test_apply_variables_same_placeholder_repeated_in_string(): + checks = [{"expr": "{{ x }} + {{ x }}"}] + result = apply_variables(checks, {"x": "col"}) + assert result[0]["expr"] == "col + col" + + +def test_apply_variables_deeply_nested(): + checks = [{"a": {"b": {"c": {"d": "{{ v }}"}}}}] + result = apply_variables(checks, {"v": "deep"}) + assert result[0]["a"]["b"]["c"]["d"] == "deep" + + +def test_apply_variables_value_with_backslash(): + """Backslashes in values should be treated literally (no regex group refs).""" + checks = [{"path": "{{ p }}"}] + result = apply_variables(checks, {"p": r"C:\Users\test"}) + assert result[0]["path"] == r"C:\Users\test" + + +def test_apply_variables_rejects_list_value(): + checks = [{"check": {"arguments": {"column": "{{ col }}"}}}] + with 
pytest.raises(InvalidParameterError, match="unsupported type 'list'"): + apply_variables(checks, {"col": ["a", "b"]}) + + +def test_apply_variables_rejects_dict_value(): + checks = [{"check": {"arguments": {"column": "{{ col }}"}}}] + with pytest.raises(InvalidParameterError, match="unsupported type 'dict'"): + apply_variables(checks, {"col": {"nested": "value"}}) + + +def test_apply_variables_accepts_decimal_value(): + checks = [{"expr": "col > {{ threshold }}"}] + result = apply_variables(checks, {"threshold": Decimal("3.14")}) + assert result[0]["expr"] == "col > 3.14" + + +def test_apply_variables_accepts_bool_value(): + checks = [{"expr": "{{ flag }}"}] + result = apply_variables(checks, {"flag": True}) + assert result[0]["expr"] == "True" From ecc09c2067c1185a6e6171a98a95b6ee0b7db76c Mon Sep 17 00:00:00 2001 From: Federico Fiorio <45632804+fedeflowers@users.noreply.github.com> Date: Wed, 18 Mar 2026 21:59:39 +0100 Subject: [PATCH 02/24] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/databricks/labs/dqx/engine.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index 88c3d17ea..b02128d2c 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -647,7 +647,9 @@ def apply_checks_by_metadata( summary metrics. Summary metrics are returned by any `DQEngine` with an `observer` specified. 
""" log_dataframe_telemetry(self.ws, self.spark, df) - return self._engine.apply_checks_by_metadata(df, checks, custom_check_functions, ref_dfs, variables) + return self._engine.apply_checks_by_metadata( + df, checks, custom_check_functions, ref_dfs, variables=variables + ) @telemetry_logger("engine", "apply_checks_by_metadata_and_split") def apply_checks_by_metadata_and_split( @@ -679,7 +681,9 @@ def apply_checks_by_metadata_and_split( quality summary metrics. Summary metrics are returned by any `DQEngine` with an `observer` specified. """ log_dataframe_telemetry(self.ws, self.spark, df) - return self._engine.apply_checks_by_metadata_and_split(df, checks, custom_check_functions, ref_dfs, variables) + return self._engine.apply_checks_by_metadata_and_split( + df, checks, custom_check_functions, ref_dfs, variables=variables + ) @telemetry_logger("engine", "apply_checks_and_save_in_table") def apply_checks_and_save_in_table( @@ -814,7 +818,11 @@ def apply_checks_by_metadata_and_save_in_table( if quarantine_config: check_result = self.apply_checks_by_metadata_and_split( - df, checks, custom_check_functions, ref_dfs, variables + df, + checks=checks, + custom_check_functions=custom_check_functions, + ref_dfs=ref_dfs, + variables=variables, ) if self._engine.observer: good_df, bad_df, batch_observation = check_result @@ -824,7 +832,13 @@ def apply_checks_by_metadata_and_save_in_table( quarantine_streaming_query = save_dataframe_as_table(bad_df, quarantine_config) target_streaming_query = quarantine_streaming_query else: - check_result = self.apply_checks_by_metadata(df, checks, custom_check_functions, ref_dfs, variables) + check_result = self.apply_checks_by_metadata( + df, + checks=checks, + custom_check_functions=custom_check_functions, + ref_dfs=ref_dfs, + variables=variables, + ) if self._engine.observer: checked_df, batch_observation = check_result else: @@ -999,7 +1013,12 @@ def validate_checks( Returns: ChecksValidationStatus indicating the validation result. 
""" - return DQEngineCore.validate_checks(checks, custom_check_functions, validate_custom_check_functions, variables) + return DQEngineCore.validate_checks( + checks, + custom_check_functions, + validate_custom_check_functions, + variables=variables, + ) def get_invalid(self, df: DataFrame) -> DataFrame: """ From 870afe6d971cec3d280ca4e88d31b321db458be3 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Wed, 18 Mar 2026 22:09:56 +0100 Subject: [PATCH 03/24] add variables param in contracts apply_checks_by_metadata, apply_checks_by_metadata_and_split, validate_checks to be consistent with the downstream implementation --- src/databricks/labs/dqx/base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/base.py b/src/databricks/labs/dqx/base.py index 7a4fdf8f7..24a85bfca 100644 --- a/src/databricks/labs/dqx/base.py +++ b/src/databricks/labs/dqx/base.py @@ -1,7 +1,7 @@ import abc from collections.abc import Callable from functools import cached_property -from typing import final +from typing import Any, final from pyspark.sql import DataFrame, Observation from databricks.labs.dqx.checks_validator import ChecksValidationStatus from databricks.labs.dqx.rule import DQRule @@ -81,6 +81,7 @@ def apply_checks_by_metadata( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, + variables: dict[str, Any] | None = None, ) -> DataFrame | tuple[DataFrame, Observation]: """ Apply data quality checks defined as metadata to the given DataFrame. @@ -94,6 +95,8 @@ def apply_checks_by_metadata( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. + variables: Optional mapping of placeholder names to replacement values. 
Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation and deserialization. Returns: A DataFrame with errors and warnings result columns and an optional Observation which tracks data quality @@ -107,6 +110,7 @@ def apply_checks_by_metadata_and_split( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, + variables: dict[str, Any] | None = None, ) -> tuple[DataFrame, DataFrame] | tuple[DataFrame, DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame and split the results into two DataFrames ("good" and "bad"). @@ -120,6 +124,8 @@ def apply_checks_by_metadata_and_split( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation and deserialization. Returns: A tuple of two DataFrames: "good" (may include rows with warnings but no result columns) and "bad" (rows @@ -133,6 +139,7 @@ def validate_checks( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, validate_custom_check_functions: bool = True, + variables: dict[str, Any] | None = None, ) -> ChecksValidationStatus: """ Validate checks defined as metadata to ensure they conform to the expected structure and types. @@ -144,6 +151,8 @@ def validate_checks( checks: List of checks to apply to the DataFrame. Each check should be a dictionary. custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). validate_custom_check_functions: If True, validate custom check functions. + variables: Optional mapping of placeholder names to replacement values. 
Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before validation and deserialization. Returns: ChecksValidationStatus indicating the validation result. From 877e74850e5fa70d6176e6c99cf2ae4ad34b4b25 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Thu, 19 Mar 2026 14:12:35 +0100 Subject: [PATCH 04/24] add change to parametrize variables from load_checks instead of apply_checks --- src/databricks/labs/dqx/base.py | 13 +-- src/databricks/labs/dqx/engine.py | 61 +++------- tests/integration/test_apply_checks.py | 21 ++-- .../test_apply_checks_and_save_in_table.py | 4 +- tests/unit/test_load_checks.py | 108 +++++++++++++++++- 5 files changed, 136 insertions(+), 71 deletions(-) diff --git a/src/databricks/labs/dqx/base.py b/src/databricks/labs/dqx/base.py index 24a85bfca..8710f75f2 100644 --- a/src/databricks/labs/dqx/base.py +++ b/src/databricks/labs/dqx/base.py @@ -81,7 +81,6 @@ def apply_checks_by_metadata( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, - variables: dict[str, Any] | None = None, ) -> DataFrame | tuple[DataFrame, Observation]: """ Apply data quality checks defined as metadata to the given DataFrame. @@ -95,8 +94,6 @@ def apply_checks_by_metadata( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation and deserialization. 
Returns: A DataFrame with errors and warnings result columns and an optional Observation which tracks data quality @@ -110,7 +107,6 @@ def apply_checks_by_metadata_and_split( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, - variables: dict[str, Any] | None = None, ) -> tuple[DataFrame, DataFrame] | tuple[DataFrame, DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame and split the results into two DataFrames ("good" and "bad"). @@ -124,8 +120,6 @@ def apply_checks_by_metadata_and_split( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation and deserialization. Returns: A tuple of two DataFrames: "good" (may include rows with warnings but no result columns) and "bad" (rows @@ -139,7 +133,6 @@ def validate_checks( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, validate_custom_check_functions: bool = True, - variables: dict[str, Any] | None = None, ) -> ChecksValidationStatus: """ Validate checks defined as metadata to ensure they conform to the expected structure and types. @@ -151,8 +144,6 @@ def validate_checks( checks: List of checks to apply to the DataFrame. Each check should be a dictionary. custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). validate_custom_check_functions: If True, validate custom check functions. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation and deserialization. 
Returns: ChecksValidationStatus indicating the validation result. @@ -184,7 +175,7 @@ def get_valid(self, df: DataFrame) -> DataFrame: @staticmethod @abc.abstractmethod - def load_checks_from_local_file(filepath: str) -> list[dict]: + def load_checks_from_local_file(filepath: str, variables: dict[str, Any] | None = None) -> list[dict]: """ Load DQ rules (checks) from a local JSON or YAML file. @@ -192,6 +183,8 @@ def load_checks_from_local_file(filepath: str) -> list[dict]: Args: filepath: Path to a file containing checks definitions. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before returning. Returns: List of DQ rules (checks). diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index b02128d2c..f35cc56da 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -220,7 +220,6 @@ def apply_checks_by_metadata( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, - variables: dict[str, Any] | None = None, ) -> DataFrame | tuple[DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame. @@ -233,14 +232,11 @@ def apply_checks_by_metadata( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation and deserialization. Returns: A DataFrame with errors and warnings result columns and an optional Observation which tracks data quality summary metrics. Summary metrics are returned by any `DQEngine` with an `observer` specified. 
""" - checks = apply_variables(checks, variables) dq_rule_checks = deserialize_checks(checks, custom_check_functions) return self.apply_checks(df, dq_rule_checks, ref_dfs) @@ -251,7 +247,6 @@ def apply_checks_by_metadata_and_split( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, - variables: dict[str, Any] | None = None, ) -> tuple[DataFrame, DataFrame] | tuple[DataFrame, DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame and split the results into two DataFrames ("good" and "bad"). @@ -265,8 +260,6 @@ def apply_checks_by_metadata_and_split( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation and deserialization. Returns: A tuple of two DataFrames: "good" (may include rows with warnings but no result columns) and "bad" (rows @@ -276,7 +269,6 @@ def apply_checks_by_metadata_and_split( Raises: InvalidCheckError: If any of the checks are invalid. """ - checks = apply_variables(checks, variables) dq_rule_checks = deserialize_checks(checks, custom_check_functions) good_df, bad_df, *observations = self.apply_checks_and_split(df, dq_rule_checks, ref_dfs) @@ -291,7 +283,6 @@ def validate_checks( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, validate_custom_check_functions: bool = True, - variables: dict[str, Any] | None = None, ) -> ChecksValidationStatus: """ Validate checks defined as metadata to ensure they conform to the expected structure and types. @@ -303,13 +294,10 @@ def validate_checks( checks: List of checks to apply to the DataFrame. Each check should be a dictionary. 
custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). validate_custom_check_functions: If True, validate custom check functions. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation. Returns: ChecksValidationStatus indicating the validation result. """ - checks = apply_variables(checks, variables) return ChecksValidator.validate_checks(checks, custom_check_functions, validate_custom_check_functions) def get_invalid(self, df: DataFrame) -> DataFrame: @@ -342,7 +330,7 @@ def get_valid(self, df: DataFrame) -> DataFrame: ) @staticmethod - def load_checks_from_local_file(filepath: str) -> list[dict]: + def load_checks_from_local_file(filepath: str, variables: dict[str, Any] | None = None) -> list[dict]: """ Load DQ rules (checks) from a local JSON or YAML file. @@ -350,11 +338,14 @@ def load_checks_from_local_file(filepath: str) -> list[dict]: Args: filepath: Path to a file containing checks definitions. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before returning. Returns: List of DQ rules. """ - return FileChecksStorageHandler().load(FileChecksStorageConfig(location=filepath)) + checks = FileChecksStorageHandler().load(FileChecksStorageConfig(location=filepath)) + return apply_variables(checks=checks, variables=variables) @staticmethod def save_checks_in_local_file(checks: list[dict], filepath: str): @@ -626,7 +617,6 @@ def apply_checks_by_metadata( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, - variables: dict[str, Any] | None = None, ) -> DataFrame | tuple[DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame. 
@@ -639,8 +629,6 @@ def apply_checks_by_metadata( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation and deserialization. Returns: A DataFrame with errors and warnings result columns and an optional Observation which tracks data quality @@ -648,7 +636,7 @@ def apply_checks_by_metadata( """ log_dataframe_telemetry(self.ws, self.spark, df) return self._engine.apply_checks_by_metadata( - df, checks, custom_check_functions, ref_dfs, variables=variables + df=df, checks=checks, custom_check_functions=custom_check_functions, ref_dfs=ref_dfs ) @telemetry_logger("engine", "apply_checks_by_metadata_and_split") @@ -658,7 +646,6 @@ def apply_checks_by_metadata_and_split( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, - variables: dict[str, Any] | None = None, ) -> tuple[DataFrame, DataFrame] | tuple[DataFrame, DataFrame, Observation]: """Apply data quality checks defined as metadata to the given DataFrame and split the results into two DataFrames ("good" and "bad"). @@ -672,8 +659,6 @@ def apply_checks_by_metadata_and_split( (rows appear in both DataFrames). custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). ref_dfs: Optional reference DataFrames to use in the checks. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation and deserialization. 
Returns: A tuple of two DataFrames: "good" (may include rows with warnings but no result columns) and "bad" (rows @@ -682,7 +667,7 @@ def apply_checks_by_metadata_and_split( """ log_dataframe_telemetry(self.ws, self.spark, df) return self._engine.apply_checks_by_metadata_and_split( - df, checks, custom_check_functions, ref_dfs, variables=variables + df=df, checks=checks, custom_check_functions=custom_check_functions, ref_dfs=ref_dfs ) @telemetry_logger("engine", "apply_checks_and_save_in_table") @@ -780,7 +765,6 @@ def apply_checks_by_metadata_and_save_in_table( custom_check_functions: dict[str, Callable] | None = None, ref_dfs: dict[str, DataFrame] | None = None, checks_location: str | None = None, - variables: dict[str, Any] | None = None, ) -> None: """ Apply metadata-defined data quality checks to input data and save results. @@ -805,8 +789,6 @@ def apply_checks_by_metadata_and_save_in_table( to callables/modules (e.g., globals()). ref_dfs: Optional reference DataFrames used by checks. checks_location: Optional location of the checks. Used for reporting in the summary metrics table only. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation and deserialization. 
""" logger.info(f"Applying checks to {input_config.location}") @@ -818,11 +800,7 @@ def apply_checks_by_metadata_and_save_in_table( if quarantine_config: check_result = self.apply_checks_by_metadata_and_split( - df, - checks=checks, - custom_check_functions=custom_check_functions, - ref_dfs=ref_dfs, - variables=variables, + df=df, checks=checks, custom_check_functions=custom_check_functions, ref_dfs=ref_dfs ) if self._engine.observer: good_df, bad_df, batch_observation = check_result @@ -833,11 +811,7 @@ def apply_checks_by_metadata_and_save_in_table( target_streaming_query = quarantine_streaming_query else: check_result = self.apply_checks_by_metadata( - df, - checks=checks, - custom_check_functions=custom_check_functions, - ref_dfs=ref_dfs, - variables=variables, + df=df, checks=checks, custom_check_functions=custom_check_functions, ref_dfs=ref_dfs ) if self._engine.observer: checked_df, batch_observation = check_result @@ -995,7 +969,6 @@ def validate_checks( checks: list[dict], custom_check_functions: dict[str, Callable] | None = None, validate_custom_check_functions: bool = True, - variables: dict[str, Any] | None = None, ) -> ChecksValidationStatus: """ Validate checks defined as metadata to ensure they conform to the expected structure and types. @@ -1007,17 +980,14 @@ def validate_checks( checks: List of checks to apply to the DataFrame. Each check should be a dictionary. custom_check_functions: Optional dictionary with custom check functions (e.g., *globals()* of the calling module). validate_custom_check_functions: If True, validate custom check functions. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` - placeholders in all string values of the check definitions before validation. Returns: ChecksValidationStatus indicating the validation result. 
""" return DQEngineCore.validate_checks( - checks, - custom_check_functions, - validate_custom_check_functions, - variables=variables, + checks=checks, + custom_check_functions=custom_check_functions, + validate_custom_check_functions=validate_custom_check_functions, ) def get_invalid(self, df: DataFrame) -> DataFrame: @@ -1147,7 +1117,7 @@ def save_results_in_table( ) @telemetry_logger("engine", "load_checks") - def load_checks(self, config: BaseChecksStorageConfig) -> list[dict]: + def load_checks(self, config: BaseChecksStorageConfig, variables: dict[str, Any] | None = None) -> list[dict]: """Load DQ rules (checks) from the storage backend described by *config*. This method delegates to a storage handler selected by the factory @@ -1164,6 +1134,8 @@ def load_checks(self, config: BaseChecksStorageConfig) -> list[dict]: Args: config: Configuration object describing the storage backend. + variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + placeholders in all string values of the check definitions before returning. Returns: List of DQ rules (checks) represented as dictionaries. @@ -1172,7 +1144,8 @@ def load_checks(self, config: BaseChecksStorageConfig) -> list[dict]: InvalidConfigError: If the configuration type is unsupported. 
""" handler = self._checks_handler_factory.create(config) - return handler.load(config) + checks = handler.load(config) + return apply_variables(checks=checks, variables=variables) @telemetry_logger("engine", "save_checks") def save_checks(self, checks: list[dict], config: BaseChecksStorageConfig) -> None: diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 022395f5e..9511ba63e 100755 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -15,6 +15,7 @@ from databricks.labs.dqx.check_funcs import sql_query from databricks.labs.dqx.config import OutputConfig, FileChecksStorageConfig, ExtraParams, RunConfig from databricks.labs.dqx.engine import DQEngine +from databricks.labs.dqx.utils import apply_variables from databricks.labs.dqx.rule import ( DQForEachColRule, register_rule, @@ -9570,9 +9571,9 @@ def test_apply_checks_by_metadata_with_variables(ws, spark): }, }, ] - variables = {"col": "b"} + checks = apply_variables(checks, {"col": "b"}) - checked = dq_engine.apply_checks_by_metadata(test_df, checks, variables=variables) + checked = dq_engine.apply_checks_by_metadata(test_df, checks) expected = spark.createDataFrame( [ @@ -9623,9 +9624,9 @@ def test_apply_checks_by_metadata_and_split_with_variables(ws, spark): }, }, ] - variables = {"col": "b", "expr_col": "a", "threshold": 1} + checks = apply_variables(checks, {"col": "b", "expr_col": "a", "threshold": 1}) - good, bad = dq_engine.apply_checks_by_metadata_and_split(test_df, checks, variables=variables) + good, bad = dq_engine.apply_checks_by_metadata_and_split(test_df, checks) # Row [1, 3, 3]: b is not null, a > 1 passes -> good only # Row [2, None, 4]: b is null (error), a > 1 passes -> bad only @@ -9649,9 +9650,9 @@ def test_apply_checks_by_metadata_with_variables_name_and_filter(ws, spark): "filter": "{{ filter_col }} IS NOT NULL", }, ] - variables = {"col": "a", "threshold": 1, "filter_col": "a"} + checks = 
apply_variables(checks, {"col": "a", "threshold": 1, "filter_col": "a"}) - checked = dq_engine.apply_checks_by_metadata(test_df, checks, variables=variables) + checked = dq_engine.apply_checks_by_metadata(test_df, checks) # Row with a=1 should have an error since a > 1 is false result_rows = checked.collect() @@ -9679,9 +9680,9 @@ def test_validate_checks_with_variables(ws): }, }, ] - variables = {"crit": "error", "col": "b"} + checks = apply_variables(checks, {"crit": "error", "col": "b"}) - status = DQEngine.validate_checks(checks, variables=variables) + status = DQEngine.validate_checks(checks) assert not status.has_errors @@ -9695,9 +9696,9 @@ def test_validate_checks_with_variables_invalid_after_substitution(ws): }, }, ] - variables = {"crit": "not_a_valid_criticality"} + checks = apply_variables(checks, {"crit": "not_a_valid_criticality"}) - status = DQEngine.validate_checks(checks, variables=variables) + status = DQEngine.validate_checks(checks) assert status.has_errors diff --git a/tests/integration/test_apply_checks_and_save_in_table.py b/tests/integration/test_apply_checks_and_save_in_table.py index ee9b7ec85..242c88538 100644 --- a/tests/integration/test_apply_checks_and_save_in_table.py +++ b/tests/integration/test_apply_checks_and_save_in_table.py @@ -14,6 +14,7 @@ ) from databricks.labs.dqx.engine import DQEngine from databricks.labs.dqx.errors import InvalidConfigError +from databricks.labs.dqx.utils import apply_variables from databricks.labs.dqx.rule import DQRowRule, DQDatasetRule from tests.integration.conftest import EXTRA_PARAMS, RUN_TIME, RUN_ID, REPORTING_COLUMNS @@ -2103,14 +2104,13 @@ def test_apply_checks_by_metadata_and_save_in_table_with_variables(ws, spark, ma "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}, }, ] - variables = {"col": "a", "crit": "error"} + checks = apply_variables(checks, {"col": "a", "crit": "error"}) engine = DQEngine(ws, spark=spark, extra_params=EXTRA_PARAMS) 
engine.apply_checks_by_metadata_and_save_in_table( checks=checks, input_config=InputConfig(location=input_table), output_config=OutputConfig(location=output_table, mode="overwrite"), - variables=variables, ) actual_df = spark.table(output_table) diff --git a/tests/unit/test_load_checks.py b/tests/unit/test_load_checks.py index c0bdb2bd3..ab4fa90a8 100644 --- a/tests/unit/test_load_checks.py +++ b/tests/unit/test_load_checks.py @@ -1,15 +1,20 @@ from unittest.mock import create_autospec import pytest +from pyspark.sql import SparkSession + +from databricks.labs.dqx.checks_storage import ( + BaseChecksStorageHandlerFactory, + ChecksStorageHandler, + VolumeFileChecksStorageHandler, +) +from databricks.labs.dqx.config import FileChecksStorageConfig, VolumeFileChecksStorageConfig +from databricks.labs.dqx.engine import DQEngine, DQEngineCore +from databricks.labs.dqx.errors import InvalidCheckError, CheckDownloadError, InvalidConfigError from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service.files import DownloadResponse -from databricks.labs.dqx.checks_storage import VolumeFileChecksStorageHandler -from databricks.labs.dqx.config import VolumeFileChecksStorageConfig -from databricks.labs.dqx.engine import DQEngineCore -from databricks.labs.dqx.errors import InvalidCheckError, CheckDownloadError, InvalidConfigError - def test_load_checks_from_local_file_json(make_local_check_file_as_json, expected_checks): file = make_local_check_file_as_json @@ -84,3 +89,96 @@ def test_file_download_contents_read_none(): with pytest.raises(NotFound, match="No contents at Unity Catalog volume path"): handler.load(VolumeFileChecksStorageConfig(location="/Volumes/catalog/schema/volume/test_path.yml")) + + +def test_load_checks_from_local_file_with_variables(tmp_path): + content = """- criticality: "{{ crit }}" + check: + function: is_not_null + arguments: + column: "{{ col }}" +""" + file_path = tmp_path / "checks.yml" + 
file_path.write_text(content, encoding="utf-8") + + checks = DQEngineCore.load_checks_from_local_file(str(file_path), variables={"crit": "error", "col": "id"}) + + assert checks == [ + {"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "id"}}}, + ] + + +def test_load_checks_from_local_file_variables_none(tmp_path): + content = """- criticality: error + check: + function: is_not_null + arguments: + column: id +""" + file_path = tmp_path / "checks.yml" + file_path.write_text(content, encoding="utf-8") + + checks = DQEngineCore.load_checks_from_local_file(str(file_path), variables=None) + + assert checks == [ + {"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "id"}}}, + ] + + +def test_load_checks_from_local_file_variables_empty(tmp_path): + content = """- criticality: error + check: + function: is_not_null + arguments: + column: id +""" + file_path = tmp_path / "checks.yml" + file_path.write_text(content, encoding="utf-8") + + checks = DQEngineCore.load_checks_from_local_file(str(file_path), variables={}) + + assert checks == [ + {"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "id"}}}, + ] + + +def test_load_checks_with_variables(): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + raw_checks = [ + {"criticality": "{{ crit }}", "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}} + ] + + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + mock_handler.load.return_value = raw_checks + + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory) + config = FileChecksStorageConfig(location="checks.yml") + + checks = engine.load_checks(config, variables={"crit": "error", "col": "id"}) + + assert checks == [ + {"criticality": "error", "check": {"function": 
"is_not_null", "arguments": {"column": "id"}}}, + ] + + +def test_load_checks_variables_none(): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + raw_checks = [{"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "id"}}}] + + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + mock_handler.load.return_value = raw_checks + + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory) + config = FileChecksStorageConfig(location="checks.yml") + + checks = engine.load_checks(config, variables=None) + + assert checks == raw_checks From a3a78de4c92d458f205ba20030f46ba8f912904e Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Fri, 20 Mar 2026 07:47:56 +0100 Subject: [PATCH 05/24] Restore test_apply_checks_by_metadata_and_save_in_table_loads_checks_from_table The test was accidentally replaced during a merge. Restore the original test that covers loading checks from a Delta table via checks_location, and keep the new variables test as a separate addition. 
Co-authored-by: Isaac --- .../test_apply_checks_and_save_in_table.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/tests/integration/test_apply_checks_and_save_in_table.py b/tests/integration/test_apply_checks_and_save_in_table.py index 952799b74..d07d738a0 100644 --- a/tests/integration/test_apply_checks_and_save_in_table.py +++ b/tests/integration/test_apply_checks_and_save_in_table.py @@ -2175,6 +2175,89 @@ def test_apply_checks_and_save_in_table_loads_checks_from_table(ws, spark, make_ assert_df_equality(actual_df, expected_df, ignore_nullable=True) +def test_apply_checks_by_metadata_and_save_in_table_loads_checks_from_table(ws, spark, make_schema, make_random): + catalog_name = TEST_CATALOG + schema = make_schema(catalog_name=catalog_name) + input_table = f"{catalog_name}.{schema.name}.{make_random(10).lower()}" + output_table = f"{catalog_name}.{schema.name}.{make_random(10).lower()}" + checks_table = f"{catalog_name}.{schema.name}.{make_random(10).lower()}" + + # Create test data and save to source table + test_schema = "a: int, b: int, c: string" + test_df = spark.createDataFrame([[1, 2, "valid"], [None, 3, "error"], [4, None, "warn"]], test_schema) + test_df.write.format("delta").mode("overwrite").saveAsTable(input_table) + + # Save checks to a delta table + checks_metadata = [ + { + "name": "a_is_null", + "criticality": "error", + "check": {"function": "is_not_null", "arguments": {"column": "a"}}, + }, + { + "name": "b_is_null", + "criticality": "warn", + "check": {"function": "is_not_null", "arguments": {"column": "b"}}, + }, + ] + engine = DQEngine(ws, spark=spark, extra_params=EXTRA_PARAMS) + engine.save_checks(checks_metadata, config=TableChecksStorageConfig(location=checks_table)) + + # Apply checks by metadata loading from table via checks_location (no checks param) + engine.apply_checks_by_metadata_and_save_in_table( + input_config=InputConfig(location=input_table), + output_config=OutputConfig(location=output_table, 
mode="overwrite"), + checks_location=checks_table, + ) + + # Verify the table was created and contains the expected data + actual_df = spark.table(output_table) + expected_schema = test_schema + REPORTING_COLUMNS + expected_df = spark.createDataFrame( + [ + [1, 2, "valid", None, None], + [ + None, + 3, + "error", + [ + { + "name": "a_is_null", + "message": "Column 'a' value is null", + "columns": ["a"], + "filter": None, + "function": "is_not_null", + "run_time": RUN_TIME, + "run_id": RUN_ID, + "user_metadata": {}, + } + ], + None, + ], + [ + 4, + None, + "warn", + None, + [ + { + "name": "b_is_null", + "message": "Column 'b' value is null", + "columns": ["b"], + "filter": None, + "function": "is_not_null", + "run_time": RUN_TIME, + "run_id": RUN_ID, + "user_metadata": {}, + } + ], + ], + ], + schema=expected_schema, + ) + assert_df_equality(actual_df, expected_df, ignore_nullable=True) + + def test_apply_checks_by_metadata_and_save_in_table_with_variables(ws, spark, make_schema, make_random): catalog_name = TEST_CATALOG schema = make_schema(catalog_name=catalog_name) From b896032bed9ea1b76f961b9fdbbe2cf747286f36 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Mon, 23 Mar 2026 17:02:50 +0100 Subject: [PATCH 06/24] add EXTRA_PARAMS compatibility, added unit and integrations tests for checking new parametrization of varaibles implementation, improved resolve_varaible algorithm, fixed variable hints and support for datetime variables resolution --- src/databricks/labs/dqx/base.py | 14 +- src/databricks/labs/dqx/config.py | 2 + src/databricks/labs/dqx/engine.py | 43 ++- src/databricks/labs/dqx/utils.py | 114 +++---- tests/integration/test_apply_checks.py | 160 --------- .../test_apply_checks_and_save_in_table.py | 24 +- .../test_apply_checks_variables.py | 303 ++++++++++++++++++ tests/unit/test_load_checks.py | 68 +++- tests/unit/test_utils.py | 186 ++++++++--- 9 files changed, 616 insertions(+), 298 deletions(-) create mode 100644 
tests/integration/test_apply_checks_variables.py diff --git a/src/databricks/labs/dqx/base.py b/src/databricks/labs/dqx/base.py index 8710f75f2..74d77f9e6 100644 --- a/src/databricks/labs/dqx/base.py +++ b/src/databricks/labs/dqx/base.py @@ -1,12 +1,15 @@ import abc from collections.abc import Callable from functools import cached_property -from typing import Any, final +from typing import final + from pyspark.sql import DataFrame, Observation + +from databricks.labs.dqx.__about__ import __version__ from databricks.labs.dqx.checks_validator import ChecksValidationStatus from databricks.labs.dqx.rule import DQRule +from databricks.labs.dqx.utils import VariableValue from databricks.sdk import WorkspaceClient -from databricks.labs.dqx.__about__ import __version__ class DQEngineBase(abc.ABC): @@ -175,15 +178,18 @@ def get_valid(self, df: DataFrame) -> DataFrame: @staticmethod @abc.abstractmethod - def load_checks_from_local_file(filepath: str, variables: dict[str, Any] | None = None) -> list[dict]: + def load_checks_from_local_file(filepath: str, variables: dict[str, VariableValue] | None = None) -> list[dict]: """ Load DQ rules (checks) from a local JSON or YAML file. The returned checks can be used as input to *apply_checks_by_metadata*. + **Security note:** variable values substituted into **sql_expression** checks are + not sanitized. Callers must ensure that variable values come from trusted sources. + Args: filepath: Path to a file containing checks definitions. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + variables: Optional mapping of placeholder names to replacement values. Replaces **{{ key }}** placeholders in all string values of the check definitions before returning. 
Returns: diff --git a/src/databricks/labs/dqx/config.py b/src/databricks/labs/dqx/config.py index d4041c29f..2b139b116 100644 --- a/src/databricks/labs/dqx/config.py +++ b/src/databricks/labs/dqx/config.py @@ -4,6 +4,7 @@ from databricks.labs.dqx.checks_serializer import SerializerFactory from databricks.labs.dqx.errors import InvalidConfigError, InvalidParameterError +from databricks.labs.dqx.utils import VariableValue __all__ = [ "WorkspaceConfig", @@ -215,6 +216,7 @@ class ExtraParams: user_metadata: dict[str, str] = field(default_factory=dict) run_time_overwrite: str | None = None run_id_overwrite: str | None = None + variables: dict[str, VariableValue] = field(default_factory=dict) @dataclass diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index ff0d4b63c..ea267da4f 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -51,7 +51,7 @@ from databricks.labs.dqx.telemetry import telemetry_logger, log_telemetry, log_dataframe_telemetry from databricks.sdk import WorkspaceClient from databricks.labs.dqx.errors import InvalidCheckError, InvalidConfigError, InvalidParameterError -from databricks.labs.dqx.utils import list_tables, safe_strip_file_from_path, apply_variables +from databricks.labs.dqx.utils import list_tables, safe_strip_file_from_path, resolve_variables, VariableValue from databricks.labs.dqx.io import is_one_time_trigger logger = logging.getLogger(__name__) @@ -337,22 +337,25 @@ def get_valid(self, df: DataFrame) -> DataFrame: ) @staticmethod - def load_checks_from_local_file(filepath: str, variables: dict[str, Any] | None = None) -> list[dict]: + def load_checks_from_local_file(filepath: str, variables: dict[str, VariableValue] | None = None) -> list[dict]: """ Load DQ rules (checks) from a local JSON or YAML file. The returned checks can be used as input to *apply_checks_by_metadata*. 
+ **Security note:** variable values substituted into **sql_expression** checks are + not sanitized. Callers must ensure that variable values come from trusted sources. + Args: filepath: Path to a file containing checks definitions. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + variables: Optional mapping of placeholder names to replacement values. Replaces **{{ key }}** placeholders in all string values of the check definitions before returning. Returns: List of DQ rules. """ checks = FileChecksStorageHandler().load(FileChecksStorageConfig(location=filepath)) - return apply_variables(checks=checks, variables=variables) + return resolve_variables(checks=checks, variables=variables) @staticmethod def save_checks_in_local_file(checks: list[dict], filepath: str): @@ -573,8 +576,9 @@ def __init__( ): super().__init__(workspace_client) + self._extra_params = extra_params or ExtraParams() self.spark = SparkSession.builder.getOrCreate() if spark is None else spark - self._engine = engine or DQEngineCore(workspace_client, spark, extra_params, observer) + self._engine = engine or DQEngineCore(workspace_client, spark, self._extra_params, observer) self._config_serializer = config_serializer or ConfigSerializer(workspace_client) self._checks_handler_factory: BaseChecksStorageHandlerFactory = ( checks_handler_factory or ChecksStorageHandlerFactory(self.ws, self.spark) @@ -1174,7 +1178,9 @@ def save_results_in_table( ) @telemetry_logger("engine", "load_checks") - def load_checks(self, config: BaseChecksStorageConfig, variables: dict[str, Any] | None = None) -> list[dict]: + def load_checks( + self, config: BaseChecksStorageConfig, variables: dict[str, VariableValue] | None = None + ) -> list[dict]: """Load DQ rules (checks) from the storage backend described by *config*. 
This method delegates to a storage handler selected by the factory @@ -1189,9 +1195,15 @@ def load_checks(self, config: BaseChecksStorageConfig, variables: dict[str, Any] - *InstallationChecksStorageConfig* (installation directory); - *VolumeFileChecksStorageConfig* (Unity Catalog volume file); + Per-call *variables* are merged with engine-level defaults from + *ExtraParams.variables* (per-call values take precedence on conflict). + + **Security note:** variable values substituted into **sql_expression** checks are + not sanitized. Callers must ensure that variable values come from trusted sources. + Args: config: Configuration object describing the storage backend. - variables: Optional mapping of placeholder names to replacement values. Replaces ``{{ key }}`` + variables: Optional mapping of placeholder names to replacement values. Replaces **{{ key }}** placeholders in all string values of the check definitions before returning. Returns: @@ -1202,7 +1214,22 @@ def load_checks(self, config: BaseChecksStorageConfig, variables: dict[str, Any] """ handler = self._checks_handler_factory.create(config) checks = handler.load(config) - return apply_variables(checks=checks, variables=variables) + merged = self._merge_variables(variables) + return resolve_variables(checks=checks, variables=merged) + + def _merge_variables(self, per_call: dict[str, VariableValue] | None) -> dict[str, VariableValue] | None: + """Merge engine-level default variables with per-call overrides. + + Per-call values take precedence over engine-level defaults. 
+ """ + defaults = self._extra_params.variables + if not defaults and not per_call: + return None + if not defaults: + return per_call + if not per_call: + return defaults + return {**defaults, **per_call} @telemetry_logger("engine", "save_checks") def save_checks(self, checks: list[dict], config: BaseChecksStorageConfig) -> None: diff --git a/src/databricks/labs/dqx/utils.py b/src/databricks/labs/dqx/utils.py index a42a2f6a7..9a0dc4241 100644 --- a/src/databricks/labs/dqx/utils.py +++ b/src/databricks/labs/dqx/utils.py @@ -6,7 +6,6 @@ from decimal import Decimal from enum import Enum from importlib.util import find_spec -from collections.abc import Callable, Generator from typing import Any from fnmatch import fnmatch from pathlib import Path @@ -33,8 +32,11 @@ COLUMN_NORMALIZE_EXPRESSION = re.compile("[^a-zA-Z0-9]+") COLUMN_PATTERN = re.compile(r"Column<'(.*?)(?: AS (\w+))?'>$", re.DOTALL) INVALID_COLUMN_NAME_PATTERN = re.compile(r"[\s,;{}\(\)\n\t=]+") -_UNRESOLVED_PLACEHOLDER_PATTERN = re.compile(r"\{\{.*?\}\}") -_SCALAR_VARIABLE_TYPES = (str, int, float, bool, Decimal) +_UNRESOLVED_PLACEHOLDER_PATTERN = re.compile(r"\{\{[^}]*\}\}") +_SCALAR_VARIABLE_TYPES = (str, int, float, bool, Decimal, datetime.date, datetime.datetime, datetime.time) + +VariableValue = str | int | float | bool | Decimal | datetime.date | datetime.datetime | datetime.time +"""Supported scalar types for variable substitution values.""" def get_column_name_or_alias( @@ -544,36 +546,41 @@ def missing_required_packages(packages: list[str]) -> bool: return not all(find_spec(spec) for spec in packages) -def _literal_replacer(val: str) -> Callable[[re.Match], str]: - """Return a ``re.sub`` replacer that always returns *val* literally.""" - - def replacer(_: re.Match) -> str: - return val - - return replacer - - def _replace_template(text: str, variables: dict[str, str]) -> str: - """Replace ``{{ key }}`` placeholders in *text* with values from *variables*. 
+ """Replace **{{ key }}** placeholders in *text* with values from *variables*. - Tolerates whitespace inside braces (e.g. ``{{ key }}``, ``{{key}}``). - Uses a lambda replacement to avoid backslash interpretation in values. + Uses a single-pass regex substitution. + Tolerates whitespace inside braces (e.g. **{{ key }}**, **{{key}}**). + Logs a warning if any unresolved **{{ ... }}** placeholders remain after substitution. Args: - text: Input string potentially containing ``{{ key }}`` placeholders. + text: Input string potentially containing **{{ key }}** placeholders. variables: Pre-stringified mapping of placeholder names to values. Returns: String with all matching placeholders replaced. """ - for key, val in variables.items(): - pattern = r"\{\{\s*" + re.escape(key) + r"\s*\}\}" - text = re.sub(pattern, _literal_replacer(val), text) - return text + if not variables: + if _UNRESOLVED_PLACEHOLDER_PATTERN.search(text): + logger.warning("Unresolved placeholder found: '%s'", text) # pylint: disable=logging-too-many-args + return text + + def _resolve(match_obj: re.Match[str]) -> str: + key = match_obj.group(0).strip("{} \t") + if key in variables: + return variables[key] + unresolved.append(key) + return match_obj.group(0) + unresolved: list[str] = [] + output = _UNRESOLVED_PLACEHOLDER_PATTERN.sub(_resolve, text) + if unresolved: + logger.warning("Unresolved placeholders found: %s", unresolved) # pylint: disable=logging-too-many-args + return output -def _substitute_variables(obj: Any, variables: dict[str, str]) -> Any: - """Recursively replace ``{{ key }}`` placeholders in all string values within *obj*. + +def _substitute_variables(obj: object, variables: dict[str, str]) -> object: + """Recursively replace **{{ key }}** placeholders in all string values within *obj*. Traverses dicts, lists, and strings. Non-string/non-collection values are returned unchanged. Dict keys are not substituted. 
@@ -594,36 +601,41 @@ def _substitute_variables(obj: Any, variables: dict[str, str]) -> Any: return obj -def _validate_variable_types(variables: dict[str, Any]) -> None: +def _validate_variable_types(variables: dict[str, VariableValue]) -> None: """Raise :class:`InvalidParameterError` if any variable value is not a supported scalar type.""" for key, val in variables.items(): if not isinstance(val, _SCALAR_VARIABLE_TYPES): raise InvalidParameterError( f"Variable '{key}' has unsupported type '{type(val).__name__}'. " - f"Only scalar types are supported: str, int, float, bool, Decimal." + f"Only scalar types are supported: str, int, float, bool, Decimal, " + f"datetime.date, datetime.datetime, datetime.time." ) -def apply_variables(checks: list[dict], variables: dict[str, Any] | None) -> list[dict]: - """Apply variable substitution to check definitions. +def resolve_variables(checks: list[dict], variables: dict[str, VariableValue] | None) -> list[dict]: + """Resolve variable substitution in check definitions. - Replaces ``{{ key }}`` placeholders in all string values of *checks* with the + Replaces **{{ key }}** placeholders in all string values of *checks* with the corresponding values from *variables*. The original *checks* list is never mutated. - Variable values must be scalar types (``str``, ``int``, ``float``, ``bool``, - ``Decimal``). Non-string scalars are converted via ``str()`` — for example, - ``{"threshold": 10}`` becomes ``"10"`` in the substituted string. Collection - types (``list``, ``dict``, ``set``, etc.) are rejected with - :class:`~databricks.labs.dqx.errors.InvalidParameterError` because their - ``str()`` representation is rarely meaningful in SQL or column expressions. + Variable values must be scalar types (**str**, **int**, **float**, **bool**, + **Decimal**, **datetime.date**, **datetime.datetime**, **datetime.time**). 
+ Non-string scalars are converted via **str()** — for example, **{"threshold": 10}** becomes **"10"** in + the substituted string. Collection types (**list**, **dict**, **set**, etc.) are + rejected with :class:`~databricks.labs.dqx.errors.InvalidParameterError` because + their **str()** representation is rarely meaningful in SQL or column expressions. - Logs a warning for any ``{{ ... }}`` placeholders that remain unresolved after + Logs a warning for any **{{ ... }}** placeholders that remain unresolved after substitution (e.g. misspelled variable names). + **Security note:** variable values substituted into **sql_expression** checks are + not sanitized and are passed directly to **F.expr()**. Callers must ensure that + variable values come from trusted sources to prevent SQL injection. + Args: checks: List of check definition dictionaries (metadata format). variables: Mapping of placeholder names to scalar replacement values. - If ``None`` or empty the checks are returned unchanged. + If **None** or empty the checks are returned unchanged. Returns: A new list of check dicts with placeholders resolved, or the original list @@ -637,37 +649,7 @@ def apply_variables(checks: list[dict], variables: dict[str, Any] | None) -> lis _validate_variable_types(variables) str_variables = {k: str(v) for k, v in variables.items()} - resolved: list[dict] = _substitute_variables(checks, str_variables) - - # Warn about any remaining unresolved placeholders - for check_def in resolved: - for value in _iter_strings(check_def): - if _UNRESOLVED_PLACEHOLDER_PATTERN.search(value): - logger.warning(f"Unresolved placeholder found after variable substitution: '{value}'") - - return resolved - - -def _iter_strings(obj: Any) -> Generator[str, None, None]: - """Yield all string values found recursively in *obj*. - - Traverses dicts (values only) and lists. Non-string leaf values are skipped. - Used to scan resolved check definitions for unresolved ``{{ ... }}`` placeholders. 
- - Args: - obj: A string, dict, list, or other value to traverse. - - Yields: - Every string value found in the nested structure. - """ - if isinstance(obj, str): - yield obj - elif isinstance(obj, dict): - for value in obj.values(): - yield from _iter_strings(value) - elif isinstance(obj, list): - for item in obj: - yield from _iter_strings(item) + return _substitute_variables(checks, str_variables) # type: ignore[return-value] def get_file_extension(file_path: str | os.PathLike) -> str: diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 2faadbb36..98715e5dd 100755 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -13,7 +13,6 @@ from databricks.labs.dqx.check_funcs import sql_query from databricks.labs.dqx.config import OutputConfig, FileChecksStorageConfig, ExtraParams, RunConfig from databricks.labs.dqx.engine import DQEngine -from databricks.labs.dqx.utils import apply_variables from databricks.labs.dqx.rule import ( DQForEachColRule, register_rule, @@ -9991,162 +9990,3 @@ def test_apply_checks_by_metadata_skip_checks_with_missing_columns(ws, spark): SCHEMA + complex_cols_schema + REPORTING_COLUMNS, ) assert_df_equality(checked, expected, ignore_nullable=True) - - -def test_apply_checks_by_metadata_with_variables(ws, spark): - dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) - test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) - - checks = [ - { - "criticality": "error", - "check": { - "function": "is_not_null_and_not_empty", - "arguments": {"column": "{{ col }}"}, - }, - }, - ] - checks = apply_variables(checks, {"col": "b"}) - - checked = dq_engine.apply_checks_by_metadata(test_df, checks) - - expected = spark.createDataFrame( - [ - [1, 3, 3, None, None], - [ - 2, - None, - 4, - [ - { - "name": "b_is_null_or_empty", - "message": "Column 'b' value is null or empty", - "columns": ["b"], - "filter": None, - "function": 
"is_not_null_and_not_empty", - "run_time": RUN_TIME, - "run_id": RUN_ID, - "user_metadata": {}, - } - ], - None, - ], - [None, 4, None, None, None], - ], - EXPECTED_SCHEMA, - ) - assert_df_equality(checked, expected, ignore_nullable=True) - - -def test_apply_checks_by_metadata_and_split_with_variables(ws, spark): - dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) - test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) - - checks = [ - { - "criticality": "error", - "name": "{{ col }}_null_check", - "check": { - "function": "is_not_null_and_not_empty", - "arguments": {"column": "{{ col }}"}, - }, - }, - { - "criticality": "warn", - "check": { - "function": "sql_expression", - "arguments": {"expression": "{{ expr_col }} > {{ threshold }}"}, - }, - }, - ] - checks = apply_variables(checks, {"col": "b", "expr_col": "a", "threshold": 1}) - - good, bad = dq_engine.apply_checks_by_metadata_and_split(test_df, checks) - - # Row [1, 3, 3]: b is not null, a > 1 passes -> good only - # Row [2, None, 4]: b is null (error), a > 1 passes -> bad only - # Row [None, 4, None]: b is not null, a is null so "a > 1" fails (warn) -> both good and bad - assert good.count() == 2 - assert bad.count() == 2 - - -def test_apply_checks_by_metadata_with_variables_name_and_filter(ws, spark): - dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) - test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) - - checks = [ - { - "criticality": "error", - "name": "{{ col }}_greater_than_{{ threshold }}", - "check": { - "function": "sql_expression", - "arguments": {"expression": "{{ col }} > {{ threshold }}"}, - }, - "filter": "{{ filter_col }} IS NOT NULL", - }, - ] - checks = apply_variables(checks, {"col": "a", "threshold": 1, "filter_col": "a"}) - - checked = dq_engine.apply_checks_by_metadata(test_df, checks) - - # Row with a=1 should have an error since a > 1 is false - result_rows = checked.collect() - 
row_a1 = [r for r in result_rows if r["a"] == 1][0] - assert row_a1["_errors"] is not None - assert len(row_a1["_errors"]) == 1 - assert row_a1["_errors"][0]["name"] == "a_greater_than_1" - - # Row with a=2 should have no errors - row_a2 = [r for r in result_rows if r["a"] == 2][0] - assert row_a2["_errors"] is None - - # Row with a=None should have no errors (filtered out) - row_null = [r for r in result_rows if r["a"] is None][0] - assert row_null["_errors"] is None - - -def test_validate_checks_with_variables(ws): - checks = [ - { - "criticality": "{{ crit }}", - "check": { - "function": "is_not_null", - "arguments": {"column": "{{ col }}"}, - }, - }, - ] - checks = apply_variables(checks, {"crit": "error", "col": "b"}) - - status = DQEngine.validate_checks(checks) - assert not status.has_errors - - -def test_validate_checks_with_variables_invalid_after_substitution(ws): - checks = [ - { - "criticality": "{{ crit }}", - "check": { - "function": "is_not_null", - "arguments": {"column": "b"}, - }, - }, - ] - checks = apply_variables(checks, {"crit": "not_a_valid_criticality"}) - - status = DQEngine.validate_checks(checks) - assert status.has_errors - - -def test_validate_checks_without_variables_fails_on_placeholders(ws): - checks = [ - { - "criticality": "{{ crit }}", - "check": { - "function": "is_not_null", - "arguments": {"column": "b"}, - }, - }, - ] - - status = DQEngine.validate_checks(checks) - assert status.has_errors diff --git a/tests/integration/test_apply_checks_and_save_in_table.py b/tests/integration/test_apply_checks_and_save_in_table.py index d07d738a0..937f6e22d 100644 --- a/tests/integration/test_apply_checks_and_save_in_table.py +++ b/tests/integration/test_apply_checks_and_save_in_table.py @@ -11,9 +11,8 @@ WorkspaceFileChecksStorageConfig, TableChecksStorageConfig, ) -from databricks.labs.dqx.engine import DQEngine +from databricks.labs.dqx.engine import DQEngine, DQEngineCore from databricks.labs.dqx.errors import InvalidConfigError -from 
databricks.labs.dqx.utils import apply_variables from databricks.labs.dqx.rule import DQRowRule, DQDatasetRule from tests.integration.conftest import ( EXTRA_PARAMS, @@ -2258,7 +2257,7 @@ def test_apply_checks_by_metadata_and_save_in_table_loads_checks_from_table(ws, assert_df_equality(actual_df, expected_df, ignore_nullable=True) -def test_apply_checks_by_metadata_and_save_in_table_with_variables(ws, spark, make_schema, make_random): +def test_apply_checks_by_metadata_and_save_in_table_with_variables(ws, spark, make_schema, make_random, tmp_path): catalog_name = TEST_CATALOG schema = make_schema(catalog_name=catalog_name) input_table = f"{catalog_name}.{schema.name}.{make_random(8).lower()}" @@ -2268,14 +2267,17 @@ def test_apply_checks_by_metadata_and_save_in_table_with_variables(ws, spark, ma test_df = spark.createDataFrame([[1, 2, "valid"], [None, 3, "error"], [4, None, "warn"]], test_schema) test_df.write.format("delta").mode("overwrite").saveAsTable(input_table) - checks = [ - { - "name": "{{ col }}_is_null", - "criticality": "{{ crit }}", - "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}, - }, - ] - checks = apply_variables(checks, {"col": "a", "crit": "error"}) + checks_yaml = """ + - name: "{{ col }}_is_null" + criticality: "{{ crit }}" + check: + function: is_not_null + arguments: + column: "{{ col }}" + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"col": "a", "crit": "error"}) engine = DQEngine(ws, spark=spark, extra_params=EXTRA_PARAMS) engine.apply_checks_by_metadata_and_save_in_table( diff --git a/tests/integration/test_apply_checks_variables.py b/tests/integration/test_apply_checks_variables.py new file mode 100644 index 000000000..5598288fb --- /dev/null +++ b/tests/integration/test_apply_checks_variables.py @@ -0,0 +1,303 @@ +import dataclasses +from databricks.labs.dqx.engine import 
DQEngine, DQEngineCore +from databricks.labs.dqx.config import FileChecksStorageConfig +from tests.integration.conftest import ( + REPORTING_COLUMNS, + RUN_TIME, + EXTRA_PARAMS, + RUN_ID, + assert_df_equality_ignore_fingerprints as assert_df_equality, +) + +SCHEMA = "a: int, b: int, c: int" +EXPECTED_SCHEMA = SCHEMA + REPORTING_COLUMNS + + +def test_apply_checks_by_metadata_with_variables(ws, spark, tmp_path): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) + + checks_yaml = """ + - criticality: error + check: + function: is_not_null_and_not_empty + arguments: + column: "{{ col }}" + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"col": "b"}) + + checked = dq_engine.apply_checks_by_metadata(test_df, checks) + + expected = spark.createDataFrame( + [ + [1, 3, 3, None, None], + [ + 2, + None, + 4, + [ + { + "name": "b_is_null_or_empty", + "message": "Column 'b' value is null or empty", + "columns": ["b"], + "filter": None, + "function": "is_not_null_and_not_empty", + "run_time": RUN_TIME, + "run_id": RUN_ID, + "user_metadata": {}, + } + ], + None, + ], + [None, 4, None, None, None], + ], + EXPECTED_SCHEMA, + ) + assert_df_equality(checked, expected, ignore_nullable=True) + + +def test_apply_checks_by_metadata_and_split_with_variables(ws, spark, tmp_path): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) + + checks_yaml = """ + - criticality: error + name: "{{ col }}_null_check" + check: + function: is_not_null_and_not_empty + arguments: + column: "{{ col }}" + - criticality: warn + check: + function: sql_expression + arguments: + expression: "{{ expr_col }} > {{ threshold }}" + """ + checks_file = tmp_path / "checks.yml" + 
checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file( + str(checks_file), variables={"col": "b", "expr_col": "a", "threshold": 1} + ) + + good, bad = dq_engine.apply_checks_by_metadata_and_split(test_df, checks) + + # Row [1, 3, 3]: b is not null, a > 1 passes -> good only + # Row [2, None, 4]: b is null (error), a > 1 passes -> bad only + # Row [None, 4, None]: b is not null, a is null so "a > 1" fails (warn) -> both good and bad + assert good.count() == 2 + assert bad.count() == 2 + + +def test_apply_checks_by_metadata_with_variables_name_and_filter(ws, spark, tmp_path): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) + + checks_yaml = """ + - criticality: error + name: "{{ col }}_greater_than_{{ threshold }}" + check: + function: sql_expression + arguments: + expression: "{{ col }} > {{ threshold }}" + filter: "{{ filter_col }} IS NOT NULL" + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file( + str(checks_file), variables={"col": "a", "threshold": 1, "filter_col": "a"} + ) + + checked = dq_engine.apply_checks_by_metadata(test_df, checks) + + # Row with a=1 should have an error since a > 1 is false + result_rows = checked.collect() + row_a1 = [r for r in result_rows if r["a"] == 1][0] + assert row_a1["_errors"] is not None + assert len(row_a1["_errors"]) == 1 + assert row_a1["_errors"][0]["name"] == "a_greater_than_1" + + # Row with a=2 should have no errors + row_a2 = [r for r in result_rows if r["a"] == 2][0] + assert row_a2["_errors"] is None + + # Row with a=None should have no errors (filtered out) + row_null = [r for r in result_rows if r["a"] is None][0] + assert row_null["_errors"] is None + + +def test_validate_checks_with_variables(ws, tmp_path): + checks_yaml = """ + - criticality: "{{ crit 
}}" + check: + function: is_not_null + arguments: + column: "{{ col }}" + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"crit": "error", "col": "b"}) + + status = DQEngine.validate_checks(checks) + assert not status.has_errors + + +def test_validate_checks_with_variables_invalid_after_substitution(ws, tmp_path): + checks_yaml = """ + - criticality: "{{ crit }}" + check: + function: is_not_null + arguments: + column: b + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"crit": "not_a_valid_criticality"}) + + status = DQEngine.validate_checks(checks) + assert status.has_errors + + +def test_validate_checks_without_variables_fails_on_placeholders(ws): + checks = [ + { + "criticality": "{{ crit }}", + "check": { + "function": "is_not_null", + "arguments": {"column": "b"}, + }, + }, + ] + + status = DQEngine.validate_checks(checks) + assert status.has_errors + + +def test_extra_params_variables_substitution_and_overrides(ws, spark, tmp_path): + # Setup data specific to this test + schema = "id int, name string" + expected_schema = schema + REPORTING_COLUMNS + df = spark.createDataFrame([(1, "John"), (None, "Doe")], schema) + + # Define Checks with placeholders in nested structure (user_metadata) + # and deep inside check arguments + checks_yaml = """ + - criticality: error + name: "id_check" + check: + function: is_not_null + arguments: + column: "{{ target_col }}" + user_metadata: + env: "{{ environment }}" + rule_id: "{{ nested_var }}" + """ + checks_file = tmp_path / "checks_extra.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + + # Setup DQEngine with ExtraParams variables (Default values) + # Default variables: target_col=id, environment=dev, nested_var=old + extra_params = 
dataclasses.replace( + EXTRA_PARAMS, + variables={ + "target_col": "id", + "environment": "dev", + "nested_var": "old", + }, + ) + dq_engine = DQEngine(ws, spark, extra_params=extra_params) + + # Load Checks with overrides + # target_col: id (from ExtraParams default) + # environment: prod (per-call override wins) + # nested_var: new (per-call override wins) + config = FileChecksStorageConfig(location=str(checks_file)) + checks = dq_engine.load_checks(config, variables={"environment": "prod", "nested_var": "new"}) + + # Verify substitution (Structural check) + assert checks[0]["check"]["arguments"]["column"] == "id" + assert checks[0]["user_metadata"]["env"] == "prod" + assert checks[0]["user_metadata"]["rule_id"] == "new" + + # Apply checks to DataFrame (Functional check) + checked_df = dq_engine.apply_checks_by_metadata(df, checks) + + expected = spark.createDataFrame( + [ + [1, "John", None, None], + [ + None, + "Doe", + [ + { + "name": "id_check", + "message": "Column 'id' value is null", + "columns": ["id"], + "filter": None, + "function": "is_not_null", + "run_time": RUN_TIME, + "run_id": RUN_ID, + "user_metadata": {"env": "prod", "rule_id": "new"}, + } + ], + None, + ], + ], + expected_schema, + ) + + assert_df_equality(checked_df, expected, ignore_nullable=True) + + +def test_extra_params_variables_conflict_resolution(ws, spark, tmp_path): + # Verify that a conflict where a variable is defined in both ExtraParams and per-call + # results in the per-call variable taking precedence. + + # 1. Setup DQEngine with ExtraParams variables + extra_params = dataclasses.replace(EXTRA_PARAMS, variables={"my_var": "default"}) + dq_engine = DQEngine(ws, spark, extra_params=extra_params) + + # 2. 
File with placeholder + checks_yaml = """ + - name: "check_{{ my_var }}" + check: + function: is_not_null + arguments: + column: id + """ + checks_file = tmp_path / "checks_conflict.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + config = FileChecksStorageConfig(location=str(checks_file)) + + # 3. Load with override + checks = dq_engine.load_checks(config, variables={"my_var": "override"}) + + # 4. Verify that "override" won + assert checks[0]["name"] == "check_override" + + +def test_extra_params_variables_fallback_to_defaults(ws, spark, tmp_path): + # Verify that if a variable is NOT provided in the call, it falls back to ExtraParams. + + # 1. Setup DQEngine with ExtraParams variables + extra_params = dataclasses.replace(EXTRA_PARAMS, variables={"my_var": "default"}) + dq_engine = DQEngine(ws, spark, extra_params=extra_params) + + # 2. File with placeholder + checks_yaml = """ + - name: "check_{{ my_var }}" + check: + function: is_not_null + arguments: + column: id + """ + checks_file = tmp_path / "checks_fallback.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + config = FileChecksStorageConfig(location=str(checks_file)) + + # 3. Load WITHOUT specific variables in the call - should use engine defaults + checks = dq_engine.load_checks(config) + + # 4. 
Verify that "default" was used + assert checks[0]["name"] == "check_default" diff --git a/tests/unit/test_load_checks.py b/tests/unit/test_load_checks.py index ab4fa90a8..8b79cd14d 100644 --- a/tests/unit/test_load_checks.py +++ b/tests/unit/test_load_checks.py @@ -1,3 +1,4 @@ +import logging from unittest.mock import create_autospec import pytest @@ -8,7 +9,7 @@ ChecksStorageHandler, VolumeFileChecksStorageHandler, ) -from databricks.labs.dqx.config import FileChecksStorageConfig, VolumeFileChecksStorageConfig +from databricks.labs.dqx.config import FileChecksStorageConfig, VolumeFileChecksStorageConfig, ExtraParams from databricks.labs.dqx.engine import DQEngine, DQEngineCore from databricks.labs.dqx.errors import InvalidCheckError, CheckDownloadError, InvalidConfigError from databricks.sdk import WorkspaceClient @@ -182,3 +183,68 @@ def test_load_checks_variables_none(): checks = engine.load_checks(config, variables=None) assert checks == raw_checks + + +def test_load_checks_from_local_file_unresolved_placeholder(tmp_path, caplog): + content = """- criticality: error + check: + function: is_not_null + arguments: + column: "{{ col }}" +""" + file_path = tmp_path / "checks.yml" + file_path.write_text(content, encoding="utf-8") + + with caplog.at_level(logging.WARNING): + checks = DQEngineCore.load_checks_from_local_file(str(file_path), variables={"other": "value"}) + + assert checks[0]["check"]["arguments"]["column"] == "{{ col }}" + assert any("Unresolved placeholder" in msg for msg in caplog.messages) + + +def test_load_checks_with_engine_default_variables(): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + raw_checks = [ + {"criticality": "{{ crit }}", "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}} + ] + + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + 
mock_handler.load.return_value = raw_checks + + extra_params = ExtraParams(variables={"crit": "error", "col": "default_col"}) + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory, extra_params=extra_params) + config = FileChecksStorageConfig(location="checks.yml") + + checks = engine.load_checks(config) + + assert checks == [ + {"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "default_col"}}}, + ] + + +def test_load_checks_per_call_overrides_engine_defaults(): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + raw_checks = [ + {"criticality": "{{ crit }}", "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}} + ] + + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + mock_handler.load.return_value = raw_checks + + extra_params = ExtraParams(variables={"crit": "warn", "col": "default_col"}) + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory, extra_params=extra_params) + config = FileChecksStorageConfig(location="checks.yml") + + checks = engine.load_checks(config, variables={"crit": "error"}) + + assert checks == [ + {"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "default_col"}}}, + ] diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 36122a6ce..b60bc1e41 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,5 +1,5 @@ import logging -from datetime import date, datetime +from datetime import date, datetime, time from decimal import Decimal from enum import Enum from typing import Any @@ -21,7 +21,7 @@ safe_strip_file_from_path, missing_required_packages, get_file_extension, - apply_variables, + resolve_variables, ) from databricks.labs.dqx.rule import normalize_bound_args from databricks.labs.dqx.errors import 
InvalidParameterError, InvalidConfigError @@ -524,7 +524,7 @@ def test_get_file_extension_with_path_object(): assert get_file_extension(file_path) == ".json" -def test_apply_variables_replaces_all_string_fields(): +def test_resolve_variables_replaces_all_string_fields(): checks = [ { "criticality": "error", @@ -537,28 +537,28 @@ def test_apply_variables_replaces_all_string_fields(): } ] variables = {"col": "email", "filter_col": "status"} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["name"] == "email_not_null" assert result[0]["check"]["arguments"]["column"] == "email" assert result[0]["filter"] == "status = 'active'" -def test_apply_variables_none_variables(): +def test_resolve_variables_none_variables(): checks = [{"name": "{{ x }}"}] - result = apply_variables(checks, None) + result = resolve_variables(checks, None) assert result is checks # same object, no copy assert result[0]["name"] == "{{ x }}" -def test_apply_variables_empty_variables(): +def test_resolve_variables_empty_variables(): checks = [{"name": "{{ x }}"}] - result = apply_variables(checks, {}) + result = resolve_variables(checks, {}) assert result is checks # same object, no copy assert result[0]["name"] == "{{ x }}" -def test_apply_variables_non_string_values_converted(): +def test_resolve_variables_non_string_values_converted(): checks = [ { "check": { @@ -568,11 +568,11 @@ def test_apply_variables_non_string_values_converted(): } ] variables = {"col": "age", "threshold": 18} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["check"]["arguments"]["expression"] == "age > 18" -def test_apply_variables_does_not_mutate_original(): +def test_resolve_variables_does_not_mutate_original(): checks = [ { "name": "{{ col }}_check", @@ -583,14 +583,14 @@ def test_apply_variables_does_not_mutate_original(): } ] variables = {"col": "name"} - apply_variables(checks, variables) + 
resolve_variables(checks, variables) # Original must be unchanged assert checks[0]["name"] == "{{ col }}_check" assert checks[0]["check"]["arguments"]["column"] == "{{ col }}" -def test_apply_variables_nested_dicts(): +def test_resolve_variables_nested_dicts(): checks = [ { "check": { @@ -603,41 +603,41 @@ def test_apply_variables_nested_dicts(): } ] variables = {"col": "id", "team": "data-eng"} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["check"]["arguments"]["expression"] == "id IS NOT NULL" assert result[0]["user_metadata"]["owner"] == "data-eng" -def test_apply_variables_partial_replacement(): +def test_resolve_variables_partial_replacement(): checks = [{"name": "{{ p1 }}_greater_than_{{ threshold }}"}] variables = {"p1": "column1", "threshold": 10} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["name"] == "column1_greater_than_10" -def test_apply_variables_unresolved_placeholder_warning(caplog): +def test_resolve_variables_unresolved_placeholder_warning(caplog): checks = [{"name": "{{ resolved }}_{{ unresolved }}"}] variables = {"resolved": "ok"} with caplog.at_level(logging.WARNING, logger="databricks.labs.dqx.utils"): - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["name"] == "ok_{{ unresolved }}" assert any("Unresolved placeholder" in msg for msg in caplog.messages) -def test_apply_variables_whitespace_tolerance(): +def test_resolve_variables_whitespace_tolerance(): checks = [ {"a": "{{x}}", "b": "{{ x }}", "c": "{{ x }}"}, ] variables = {"x": "val"} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["a"] == "val" assert result[0]["b"] == "val" assert result[0]["c"] == "val" -def test_apply_variables_non_string_dict_values_untouched(): +def test_resolve_variables_non_string_dict_values_untouched(): checks = 
[ { "criticality": "error", @@ -648,13 +648,13 @@ def test_apply_variables_non_string_dict_values_untouched(): } ] variables = {"col": "status"} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["check"]["arguments"]["column"] == "status" assert result[0]["check"]["arguments"]["allowed"] == [1, 2, 3] assert result[0]["criticality"] == "error" -def test_apply_variables_for_each_column(): +def test_resolve_variables_for_each_column(): checks = [ { "criticality": "error", @@ -665,11 +665,11 @@ def test_apply_variables_for_each_column(): } ] variables = {"col1": "first_name", "col2": "last_name"} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["check"]["for_each_column"] == ["first_name", "last_name"] -def test_apply_variables_multiple_checks(): +def test_resolve_variables_multiple_checks(): checks = [ { "name": "{{ col }}_not_null", @@ -681,78 +681,168 @@ def test_apply_variables_multiple_checks(): }, ] variables = {"col": "a", "col2": "b"} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["name"] == "a_not_null" assert result[0]["check"]["arguments"]["column"] == "a" assert result[1]["name"] == "b_not_empty" assert result[1]["check"]["arguments"]["column"] == "b" -def test_apply_variables_empty_checks_list(): - result = apply_variables([], {"col": "x"}) +def test_resolve_variables_empty_checks_list(): + result = resolve_variables([], {"col": "x"}) assert result == [] -def test_apply_variables_empty_string_value(): +def test_resolve_variables_empty_string_value(): checks = [{"name": "prefix_{{ col }}_suffix"}] - result = apply_variables(checks, {"col": ""}) + result = resolve_variables(checks, {"col": ""}) assert result[0]["name"] == "prefix__suffix" -def test_apply_variables_value_contains_braces(): +def test_resolve_variables_value_contains_braces(): """Variable value itself contains 
{{ }} — should NOT be re-expanded.""" checks = [{"expr": "{{ col }}"}] - result = apply_variables(checks, {"col": "{{ other }}"}) + result = resolve_variables(checks, {"col": "{{ other }}"}) assert result[0]["expr"] == "{{ other }}" -def test_apply_variables_key_with_regex_special_chars(): +def test_resolve_variables_key_with_regex_special_chars(): """Variable keys with regex metacharacters must be escaped properly.""" checks = [{"name": "{{ col.name }}_check", "filter": "{{ col+1 }} > 0"}] variables = {"col.name": "revenue", "col+1": "amount"} - result = apply_variables(checks, variables) + result = resolve_variables(checks, variables) assert result[0]["name"] == "revenue_check" assert result[0]["filter"] == "amount > 0" -def test_apply_variables_same_placeholder_repeated_in_string(): +def test_resolve_variables_same_placeholder_repeated_in_string(): checks = [{"expr": "{{ x }} + {{ x }}"}] - result = apply_variables(checks, {"x": "col"}) + result = resolve_variables(checks, {"x": "col"}) assert result[0]["expr"] == "col + col" -def test_apply_variables_deeply_nested(): +def test_resolve_variables_deeply_nested(): checks = [{"a": {"b": {"c": {"d": "{{ v }}"}}}}] - result = apply_variables(checks, {"v": "deep"}) + result = resolve_variables(checks, {"v": "deep"}) assert result[0]["a"]["b"]["c"]["d"] == "deep" -def test_apply_variables_value_with_backslash(): +def test_resolve_variables_value_with_backslash(): """Backslashes in values should be treated literally (no regex group refs).""" checks = [{"path": "{{ p }}"}] - result = apply_variables(checks, {"p": r"C:\Users\test"}) + result = resolve_variables(checks, {"p": r"C:\Users\test"}) assert result[0]["path"] == r"C:\Users\test" -def test_apply_variables_rejects_list_value(): +def test_resolve_variables_rejects_list_value(): checks = [{"check": {"arguments": {"column": "{{ col }}"}}}] with pytest.raises(InvalidParameterError, match="unsupported type 'list'"): - apply_variables(checks, {"col": ["a", "b"]}) + 
resolve_variables(checks, {"col": ["a", "b"]}) -def test_apply_variables_rejects_dict_value(): +def test_resolve_variables_rejects_dict_value(): checks = [{"check": {"arguments": {"column": "{{ col }}"}}}] with pytest.raises(InvalidParameterError, match="unsupported type 'dict'"): - apply_variables(checks, {"col": {"nested": "value"}}) + resolve_variables(checks, {"col": {"nested": "value"}}) -def test_apply_variables_accepts_decimal_value(): +def test_resolve_variables_accepts_decimal_value(): checks = [{"expr": "col > {{ threshold }}"}] - result = apply_variables(checks, {"threshold": Decimal("3.14")}) + result = resolve_variables(checks, {"threshold": Decimal("3.14")}) assert result[0]["expr"] == "col > 3.14" -def test_apply_variables_accepts_bool_value(): +def test_resolve_variables_accepts_bool_value(): checks = [{"expr": "{{ flag }}"}] - result = apply_variables(checks, {"flag": True}) + result = resolve_variables(checks, {"flag": True}) assert result[0]["expr"] == "True" + + +def test_resolve_variables_false_bool(): + checks = [{"expr": "{{ flag }}"}] + result = resolve_variables(checks, {"flag": False}) + assert result[0]["expr"] == "False" + + +def test_resolve_variables_rejects_none_value(): + checks = [{"col": "{{ col }}"}] + with pytest.raises(InvalidParameterError, match="unsupported type 'NoneType'"): + resolve_variables(checks, {"col": None}) + + +def test_resolve_variables_rejects_set_value(): + checks = [{"col": "{{ col }}"}] + with pytest.raises(InvalidParameterError, match="unsupported type 'set'"): + resolve_variables(checks, {"col": {1, 2}}) + + +def test_resolve_variables_rejects_tuple_value(): + checks = [{"col": "{{ col }}"}] + with pytest.raises(InvalidParameterError, match="unsupported type 'tuple'"): + resolve_variables(checks, {"col": (1, 2)}) + + +def test_resolve_variables_dict_keys_not_substituted(): + checks = [{"{{ col }}": "value", "other": "{{ col }}"}] + result = resolve_variables(checks, {"col": "replaced"}) + assert "{{ col }}" 
in result[0] + assert result[0]["{{ col }}"] == "value" + assert result[0]["other"] == "replaced" + + +def test_resolve_variables_nan(): + checks = [{"expr": "{{ val }}"}] + result = resolve_variables(checks, {"val": float("nan")}) + assert result[0]["expr"] == "nan" + + +def test_resolve_variables_inf(): + checks = [{"expr": "{{ val }}"}] + result = resolve_variables(checks, {"val": float("inf")}) + assert result[0]["expr"] == "inf" + + +def test_resolve_variables_multiple_unresolved_warns(caplog): + checks = [{"expr": "{{ a }} and {{ b }}"}] + with caplog.at_level(logging.WARNING): + result = resolve_variables(checks, {"a": "x"}) + assert result[0]["expr"] == "x and {{ b }}" + assert any("Unresolved placeholder" in msg for msg in caplog.messages) + + +def test_resolve_variables_none_vars_no_warning(caplog): + checks = [{"col": "{{ x }}"}] + with caplog.at_level(logging.WARNING): + result = resolve_variables(checks, None) + assert result[0]["col"] == "{{ x }}" + assert not any("Unresolved placeholder" in msg for msg in caplog.messages) + + +def test_resolve_variables_whitespace_in_key(): + checks = [{"col": "{{col_a}}"}] + result = resolve_variables(checks, {"col_a": "replaced"}) + assert result[0]["col"] == "replaced" + + +def test_resolve_variables_unicode_values(): + checks = [{"col": "{{ col }}"}] + result = resolve_variables(checks, {"col": "prénom"}) + assert result[0]["col"] == "prénom" + + +def test_resolve_variables_accepts_date(): + checks = [{"expr": "date > '{{ d }}'"}] + result = resolve_variables(checks, {"d": date(2024, 1, 15)}) + assert result[0]["expr"] == "date > '2024-01-15'" + + +def test_resolve_variables_accepts_datetime(): + checks = [{"expr": "ts > '{{ ts }}'"}] + result = resolve_variables(checks, {"ts": datetime(2024, 1, 15, 10, 30)}) + assert "2024-01-15" in result[0]["expr"] + + +def test_resolve_variables_accepts_time(): + checks = [{"expr": "t > '{{ t }}'"}] + result = resolve_variables(checks, {"t": time(10, 30)}) + assert 
result[0]["expr"] == "t > '10:30:00'" From c1ff1461312f447713a6b82c01f7d88c6d4310f1 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Mon, 23 Mar 2026 20:14:54 +0100 Subject: [PATCH 07/24] add test parametrization variables, checked col is missing and another variable is given --- .../test_apply_checks_variables.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/integration/test_apply_checks_variables.py b/tests/integration/test_apply_checks_variables.py index 5598288fb..2ae9d2f33 100644 --- a/tests/integration/test_apply_checks_variables.py +++ b/tests/integration/test_apply_checks_variables.py @@ -301,3 +301,34 @@ def test_extra_params_variables_fallback_to_defaults(ws, spark, tmp_path): # 4. Verify that "default" was used assert checks[0]["name"] == "check_default" + + +def test_apply_checks_with_missing_variable(ws, spark, tmp_path): + dq_engine = DQEngine(workspace_client=ws, spark=spark, extra_params=EXTRA_PARAMS) + test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) + + checks_yaml = """ + - criticality: error + check: + function: is_not_null + arguments: + column: "{{ missing_col }}" + """ + checks_file = tmp_path / "checks_missing.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + + # Load file, which will warn and leave the placeholder + checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"different_var": "val"}) + + # Assert that the placeholder was left in the metadata (unresolved variable) + assert checks[0]["check"]["arguments"]["column"] == "{{ missing_col }}" + + # Check function apply should not raise an exception, but instead skip the check and report it in the results + checked = dq_engine.apply_checks_by_metadata(test_df, checks) + + errors = checked.select("_errors").collect() + for row in errors: + assert row["_errors"] is not None + assert len(row["_errors"]) == 1 + assert "Check evaluation skipped due to invalid check columns" in 
row["_errors"][0]["message"] + assert "{{ missing_col }}" in row["_errors"][0]["message"] From 100c9cfadb06a75f82dca067470aa5ca572c8f68 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Tue, 31 Mar 2026 16:54:30 +0200 Subject: [PATCH 08/24] fix tests for variable parametrization of core on load checks, reverted accidental change on config file, removed pylint ignore --- src/databricks/labs/dqx/config.py | 1 + src/databricks/labs/dqx/utils.py | 4 +- .../test_apply_checks_and_save_in_table.py | 58 +----- .../test_apply_checks_variables.py | 171 ++++++------------ 4 files changed, 60 insertions(+), 174 deletions(-) diff --git a/src/databricks/labs/dqx/config.py b/src/databricks/labs/dqx/config.py index 2b139b116..7185bb5e1 100644 --- a/src/databricks/labs/dqx/config.py +++ b/src/databricks/labs/dqx/config.py @@ -216,6 +216,7 @@ class ExtraParams: user_metadata: dict[str, str] = field(default_factory=dict) run_time_overwrite: str | None = None run_id_overwrite: str | None = None + suppress_skipped: bool = False variables: dict[str, VariableValue] = field(default_factory=dict) diff --git a/src/databricks/labs/dqx/utils.py b/src/databricks/labs/dqx/utils.py index 9a0dc4241..141af686f 100644 --- a/src/databricks/labs/dqx/utils.py +++ b/src/databricks/labs/dqx/utils.py @@ -562,7 +562,7 @@ def _replace_template(text: str, variables: dict[str, str]) -> str: """ if not variables: if _UNRESOLVED_PLACEHOLDER_PATTERN.search(text): - logger.warning("Unresolved placeholder found: '%s'", text) # pylint: disable=logging-too-many-args + logger.warning(f"Unresolved placeholder found: '{text}'") return text def _resolve(match_obj: re.Match[str]) -> str: @@ -575,7 +575,7 @@ def _resolve(match_obj: re.Match[str]) -> str: unresolved: list[str] = [] output = _UNRESOLVED_PLACEHOLDER_PATTERN.sub(_resolve, text) if unresolved: - logger.warning("Unresolved placeholders found: %s", unresolved) # pylint: disable=logging-too-many-args + logger.warning(f"Unresolved placeholders found: 
{unresolved}") return output diff --git a/tests/integration/test_apply_checks_and_save_in_table.py b/tests/integration/test_apply_checks_and_save_in_table.py index 937f6e22d..5c71a2659 100644 --- a/tests/integration/test_apply_checks_and_save_in_table.py +++ b/tests/integration/test_apply_checks_and_save_in_table.py @@ -11,7 +11,7 @@ WorkspaceFileChecksStorageConfig, TableChecksStorageConfig, ) -from databricks.labs.dqx.engine import DQEngine, DQEngineCore +from databricks.labs.dqx.engine import DQEngine from databricks.labs.dqx.errors import InvalidConfigError from databricks.labs.dqx.rule import DQRowRule, DQDatasetRule from tests.integration.conftest import ( @@ -2257,60 +2257,4 @@ def test_apply_checks_by_metadata_and_save_in_table_loads_checks_from_table(ws, assert_df_equality(actual_df, expected_df, ignore_nullable=True) -def test_apply_checks_by_metadata_and_save_in_table_with_variables(ws, spark, make_schema, make_random, tmp_path): - catalog_name = TEST_CATALOG - schema = make_schema(catalog_name=catalog_name) - input_table = f"{catalog_name}.{schema.name}.{make_random(8).lower()}" - output_table = f"{catalog_name}.{schema.name}.{make_random(8).lower()}" - - test_schema = "a: int, b: int, c: string" - test_df = spark.createDataFrame([[1, 2, "valid"], [None, 3, "error"], [4, None, "warn"]], test_schema) - test_df.write.format("delta").mode("overwrite").saveAsTable(input_table) - - checks_yaml = """ - - name: "{{ col }}_is_null" - criticality: "{{ crit }}" - check: - function: is_not_null - arguments: - column: "{{ col }}" - """ - checks_file = tmp_path / "checks.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"col": "a", "crit": "error"}) - - engine = DQEngine(ws, spark=spark, extra_params=EXTRA_PARAMS) - engine.apply_checks_by_metadata_and_save_in_table( - checks=checks, - input_config=InputConfig(location=input_table), - 
output_config=OutputConfig(location=output_table, mode="overwrite"), - ) - actual_df = spark.table(output_table) - expected_schema = test_schema + REPORTING_COLUMNS - expected_df = spark.createDataFrame( - [ - [1, 2, "valid", None, None], - [ - None, - 3, - "error", - [ - { - "name": "a_is_null", - "message": "Column 'a' value is null", - "columns": ["a"], - "filter": None, - "function": "is_not_null", - "run_time": RUN_TIME, - "run_id": RUN_ID, - "user_metadata": {}, - } - ], - None, - ], - [4, None, "warn", None, None], - ], - schema=expected_schema, - ) - assert_df_equality(actual_df, expected_df, ignore_nullable=True) diff --git a/tests/integration/test_apply_checks_variables.py b/tests/integration/test_apply_checks_variables.py index 2ae9d2f33..96837e372 100644 --- a/tests/integration/test_apply_checks_variables.py +++ b/tests/integration/test_apply_checks_variables.py @@ -3,19 +3,14 @@ from databricks.labs.dqx.config import FileChecksStorageConfig from tests.integration.conftest import ( REPORTING_COLUMNS, - RUN_TIME, EXTRA_PARAMS, - RUN_ID, - assert_df_equality_ignore_fingerprints as assert_df_equality, ) SCHEMA = "a: int, b: int, c: int" EXPECTED_SCHEMA = SCHEMA + REPORTING_COLUMNS -def test_apply_checks_by_metadata_with_variables(ws, spark, tmp_path): - dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) - test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) +def test_load_checks_by_metadata_with_variables(tmp_path): checks_yaml = """ - criticality: error @@ -28,39 +23,18 @@ def test_apply_checks_by_metadata_with_variables(ws, spark, tmp_path): checks_file.write_text(checks_yaml, encoding="utf-8") checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"col": "b"}) - checked = dq_engine.apply_checks_by_metadata(test_df, checks) - - expected = spark.createDataFrame( - [ - [1, 3, 3, None, None], - [ - 2, - None, - 4, - [ - { - "name": "b_is_null_or_empty", - "message": "Column 'b' value 
is null or empty", - "columns": ["b"], - "filter": None, - "function": "is_not_null_and_not_empty", - "run_time": RUN_TIME, - "run_id": RUN_ID, - "user_metadata": {}, - } - ], - None, - ], - [None, 4, None, None, None], - ], - EXPECTED_SCHEMA, - ) - assert_df_equality(checked, expected, ignore_nullable=True) + assert checks == [ + { + "criticality": "error", + "check": { + "function": "is_not_null_and_not_empty", + "arguments": {"column": "b"}, + }, + } + ] -def test_apply_checks_by_metadata_and_split_with_variables(ws, spark, tmp_path): - dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) - test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) +def test_load_checks_by_metadata_and_split_with_variables(tmp_path): checks_yaml = """ - criticality: error @@ -81,18 +55,26 @@ def test_apply_checks_by_metadata_and_split_with_variables(ws, spark, tmp_path): str(checks_file), variables={"col": "b", "expr_col": "a", "threshold": 1} ) - good, bad = dq_engine.apply_checks_by_metadata_and_split(test_df, checks) - - # Row [1, 3, 3]: b is not null, a > 1 passes -> good only - # Row [2, None, 4]: b is null (error), a > 1 passes -> bad only - # Row [None, 4, None]: b is not null, a is null so "a > 1" fails (warn) -> both good and bad - assert good.count() == 2 - assert bad.count() == 2 + assert checks == [ + { + "criticality": "error", + "name": "b_null_check", + "check": { + "function": "is_not_null_and_not_empty", + "arguments": {"column": "b"}, + }, + }, + { + "criticality": "warn", + "check": { + "function": "sql_expression", + "arguments": {"expression": "a > 1"}, + }, + }, + ] -def test_apply_checks_by_metadata_with_variables_name_and_filter(ws, spark, tmp_path): - dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) - test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) +def test_load_checks_by_metadata_with_variables_name_and_filter(tmp_path): checks_yaml = """ - criticality: 
error @@ -109,25 +91,20 @@ def test_apply_checks_by_metadata_with_variables_name_and_filter(ws, spark, tmp_ str(checks_file), variables={"col": "a", "threshold": 1, "filter_col": "a"} ) - checked = dq_engine.apply_checks_by_metadata(test_df, checks) - - # Row with a=1 should have an error since a > 1 is false - result_rows = checked.collect() - row_a1 = [r for r in result_rows if r["a"] == 1][0] - assert row_a1["_errors"] is not None - assert len(row_a1["_errors"]) == 1 - assert row_a1["_errors"][0]["name"] == "a_greater_than_1" - - # Row with a=2 should have no errors - row_a2 = [r for r in result_rows if r["a"] == 2][0] - assert row_a2["_errors"] is None - - # Row with a=None should have no errors (filtered out) - row_null = [r for r in result_rows if r["a"] is None][0] - assert row_null["_errors"] is None + assert checks == [ + { + "criticality": "error", + "name": "a_greater_than_1", + "check": { + "function": "sql_expression", + "arguments": {"expression": "a > 1"}, + }, + "filter": "a IS NOT NULL", + } + ] -def test_validate_checks_with_variables(ws, tmp_path): +def test_validate_checks_with_variables(tmp_path): checks_yaml = """ - criticality: "{{ crit }}" check: @@ -143,7 +120,7 @@ def test_validate_checks_with_variables(ws, tmp_path): assert not status.has_errors -def test_validate_checks_with_variables_invalid_after_substitution(ws, tmp_path): +def test_validate_checks_with_variables_invalid_after_substitution(tmp_path): checks_yaml = """ - criticality: "{{ crit }}" check: @@ -156,10 +133,15 @@ def test_validate_checks_with_variables_invalid_after_substitution(ws, tmp_path) checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"crit": "not_a_valid_criticality"}) status = DQEngine.validate_checks(checks) - assert status.has_errors + expected_error = ( + "Invalid 'criticality' value: 'not_a_valid_criticality'. Expected 'warn' or 'error'. 
" + "Check details: {'criticality': 'not_a_valid_criticality', " + "'check': {'function': 'is_not_null', 'arguments': {'column': 'b'}}}" + ) + assert status.errors[0] == expected_error -def test_validate_checks_without_variables_fails_on_placeholders(ws): +def test_validate_checks_without_variables_fails_on_placeholders(): checks = [ { "criticality": "{{ crit }}", @@ -171,15 +153,15 @@ def test_validate_checks_without_variables_fails_on_placeholders(ws): ] status = DQEngine.validate_checks(checks) - assert status.has_errors + expected_error = ( + "Invalid 'criticality' value: '{{ crit }}'. Expected 'warn' or 'error'. " + "Check details: {'criticality': '{{ crit }}', " + "'check': {'function': 'is_not_null', 'arguments': {'column': 'b'}}}" + ) + assert status.errors[0] == expected_error def test_extra_params_variables_substitution_and_overrides(ws, spark, tmp_path): - # Setup data specific to this test - schema = "id int, name string" - expected_schema = schema + REPORTING_COLUMNS - df = spark.createDataFrame([(1, "John"), (None, "Doe")], schema) - # Define Checks with placeholders in nested structure (user_metadata) # and deep inside check arguments checks_yaml = """ @@ -220,35 +202,6 @@ def test_extra_params_variables_substitution_and_overrides(ws, spark, tmp_path): assert checks[0]["user_metadata"]["env"] == "prod" assert checks[0]["user_metadata"]["rule_id"] == "new" - # Apply checks to DataFrame (Functional check) - checked_df = dq_engine.apply_checks_by_metadata(df, checks) - - expected = spark.createDataFrame( - [ - [1, "John", None, None], - [ - None, - "Doe", - [ - { - "name": "id_check", - "message": "Column 'id' value is null", - "columns": ["id"], - "filter": None, - "function": "is_not_null", - "run_time": RUN_TIME, - "run_id": RUN_ID, - "user_metadata": {"env": "prod", "rule_id": "new"}, - } - ], - None, - ], - ], - expected_schema, - ) - - assert_df_equality(checked_df, expected, ignore_nullable=True) - def 
test_extra_params_variables_conflict_resolution(ws, spark, tmp_path): # Verify that a conflict where a variable is defined in both ExtraParams and per-call @@ -303,9 +256,7 @@ def test_extra_params_variables_fallback_to_defaults(ws, spark, tmp_path): assert checks[0]["name"] == "check_default" -def test_apply_checks_with_missing_variable(ws, spark, tmp_path): - dq_engine = DQEngine(workspace_client=ws, spark=spark, extra_params=EXTRA_PARAMS) - test_df = spark.createDataFrame([[1, 3, 3], [2, None, 4], [None, 4, None]], SCHEMA) +def test_load_checks_with_missing_variable(tmp_path): checks_yaml = """ - criticality: error @@ -322,13 +273,3 @@ def test_apply_checks_with_missing_variable(ws, spark, tmp_path): # Assert that the placeholder was left in the metadata (unresolved variable) assert checks[0]["check"]["arguments"]["column"] == "{{ missing_col }}" - - # Check function apply should not raise an exception, but instead skip the check and report it in the results - checked = dq_engine.apply_checks_by_metadata(test_df, checks) - - errors = checked.select("_errors").collect() - for row in errors: - assert row["_errors"] is not None - assert len(row["_errors"]) == 1 - assert "Check evaluation skipped due to invalid check columns" in row["_errors"][0]["message"] - assert "{{ missing_col }}" in row["_errors"][0]["message"] From e2e1f1e3ebe4edc24f6921ff72b7f0833eb9858f Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Tue, 31 Mar 2026 16:57:56 +0200 Subject: [PATCH 09/24] fix reverted extra space on apply checks in table file --- tests/integration/test_apply_checks_and_save_in_table.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/integration/test_apply_checks_and_save_in_table.py b/tests/integration/test_apply_checks_and_save_in_table.py index 5c71a2659..73fc12e47 100644 --- a/tests/integration/test_apply_checks_and_save_in_table.py +++ b/tests/integration/test_apply_checks_and_save_in_table.py @@ -2254,7 +2254,4 @@ def 
test_apply_checks_by_metadata_and_save_in_table_loads_checks_from_table(ws, ], schema=expected_schema, ) - assert_df_equality(actual_df, expected_df, ignore_nullable=True) - - - + assert_df_equality(actual_df, expected_df, ignore_nullable=True) \ No newline at end of file From 4e3a81deae5652a272cc7d59da16ecb286ea74d7 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Fri, 3 Apr 2026 17:31:21 +0200 Subject: [PATCH 10/24] add docs, fix overloading, deduplication of tests, removed integration testing not using databricks sdk --- demos/dqx_demo_library.py | 60 +++- demos/dqx_quick_start_demo_library.py | 18 ++ .../docs/guide/quality_checks_definition.mdx | 54 ++++ .../dqx/docs/guide/quality_checks_storage.mdx | 7 + docs/dqx/docs/reference/engine.mdx | 2 +- docs/dqx/docs/reference/quality_checks.mdx | 4 + src/databricks/labs/dqx/utils.py | 26 +- .../test_apply_checks_and_save_in_table.py | 3 +- .../test_apply_checks_variables.py | 275 ------------------ tests/unit/test_checks_validation.py | 59 +++- tests/unit/test_load_checks.py | 89 ++++++ 11 files changed, 315 insertions(+), 282 deletions(-) delete mode 100644 tests/integration/test_apply_checks_variables.py diff --git a/demos/dqx_demo_library.py b/demos/dqx_demo_library.py index 34b250b3b..ae924e64c 100644 --- a/demos/dqx_demo_library.py +++ b/demos/dqx_demo_library.py @@ -1481,4 +1481,62 @@ def safe_parse_json(col): # explode warnings warnings_df = valid_and_quarantine_df.select(F.explode(F.col("dq_warnings")).alias("dq")).select(F.expr("dq.*")) -display(warnings_df) \ No newline at end of file +display(warnings_df) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Advanced: Variable Substitution +# MAGIC +# MAGIC DQX supports variable substitution in declarative check definitions (YAML, JSON, or Delta tables). +# MAGIC This allows you to parameterize your rules and inject values at **load time** via the `variables` parameter in `load_checks`. +# MAGIC +# MAGIC ### Example Usage +# MAGIC +# MAGIC 1. 
Define a rule with `{{ placeholder }}` syntax. +# MAGIC 2. Pass a dictionary of variables when loading the rules. + +# COMMAND ---------- + +from databricks.labs.dqx.config import WorkspaceFileChecksStorageConfig + +# Save to a temporary file + +# Define parameterized checks +parameterized_checks_yaml = """ +- criticality: error + name: "threshold_check_{{ threshold_name }}" + check: + function: is_not_greater_than + arguments: + column: "{{ target_column }}" + limit: "{{ max_value }}" +""" + +# Save to a temporary file +# demo_file_directory is defined at the beginning of this notebook +temp_checks_path = os.path.join(demo_file_directory, "parameterized_checks.yml") +with open(temp_checks_path, "w") as f: + f.write(parameterized_checks_yaml) + +dq_engine = DQEngine(WorkspaceClient()) + +# Load checks with variable resolution +# Resolution happens during the load process +resolved_checks = dq_engine.load_checks( + config=WorkspaceFileChecksStorageConfig(location=temp_checks_path), + variables={ + "threshold_name": "critical", + "target_column": "col1", + "max_value": 100 + } +) + +# The resolved checks now have the values injected +# Note: DQEngine internally converts string numbers to their appropriate types if needed during validation or apply +print(yaml.dump(resolved_checks)) + +# Apply the resolved checks to a DataFrame +data = spark.createDataFrame([[50], [150]], "col1: int") +result_df = dq_engine.apply_checks_by_metadata(data, resolved_checks) +display(result_df) \ No newline at end of file diff --git a/demos/dqx_quick_start_demo_library.py b/demos/dqx_quick_start_demo_library.py index 6b586428e..c6901b2af 100644 --- a/demos/dqx_quick_start_demo_library.py +++ b/demos/dqx_quick_start_demo_library.py @@ -125,6 +125,24 @@ print(f"Checks from YAML: {status}") # COMMAND ---------- + +# MAGIC %md +# MAGIC ### Variable Substitution +# MAGIC +# MAGIC You can parameterize your YAML checks using `{{ variable }}` syntax and resolve them at load time. 
+# MAGIC +# MAGIC ```python +# MAGIC # Example: Load checks with a dynamic age limit +# MAGIC # +# MAGIC # from databricks.labs.dqx.config import FileChecksStorageConfig +# MAGIC # +# MAGIC # resolved_checks = dq_engine.load_checks( +# MAGIC # config=FileChecksStorageConfig(location="checks.yml"), +# MAGIC # variables={"max_age": 120} +# MAGIC # ) +# MAGIC ``` +# MAGIC +# COMMAND ---------- # MAGIC %md # MAGIC ### Setup `DQEngine` diff --git a/docs/dqx/docs/guide/quality_checks_definition.mdx b/docs/dqx/docs/guide/quality_checks_definition.mdx index d5d5fa817..c2c97b133 100644 --- a/docs/dqx/docs/guide/quality_checks_definition.mdx +++ b/docs/dqx/docs/guide/quality_checks_definition.mdx @@ -720,6 +720,60 @@ Example checks saved in a Delta or Lakebase table (compact format — `for_each_ If `run_config_name` is not provided, "default" is used. Typically, the input table or job name is used for run config name to establish a one-to-one mapping between tables or jobs and checks. +## Variable Substitution + +DQX supports variable substitution in declarative check definitions (YAML, JSON, or Delta tables). This allows you to parameterize your quality rules and inject values at **load time** via the `variables` parameter in `load_checks`. + +### Syntax and Scope + +Placeholders are defined using the `{{ variable_name }}` syntax. Variable substitution is supported in **all string values** within the check definitions, including: +- `name` +- `filter` +- `check` function arguments (`arguments`) and column names (`for_each_column`) +- any other top-level or nested string field + +### Resolution + +Variables are resolved at **load time** when the checks are loaded from the storage backend. To resolve variables, pass a dictionary to the `variables` parameter of the `load_checks` method. + + +Variable substitution is only available when defining checks declaratively (as dictionaries or in files/tables). It is not supported when using DQX classes (e.g., `DQRowRule`) directly. 
+ + + + + ```yaml + - criticality: error + check: + function: is_in_range + arguments: + column: temperature + min_limit: "{{ min_temp }}" + max_limit: "{{ max_temp }}" + filter: "region = '{{ region }}'" + ``` + + + ```python + from databricks.labs.dqx.engine import DQEngine + from databricks.labs.dqx.config import FileChecksStorageConfig + from databricks.sdk import WorkspaceClient + + dq_engine = DQEngine(WorkspaceClient()) + + # Load checks with variable resolution + resolved_checks = dq_engine.load_checks( + config=FileChecksStorageConfig(location="checks.yml"), + variables={ + "min_temp": 0, + "max_temp": 100, + "region": "EMEA" + } + ) + ``` + + + ## Validating syntax of quality checks You can validate the syntax of checks loaded from a storage system or checks defined programmatically before applying them. diff --git a/docs/dqx/docs/guide/quality_checks_storage.mdx b/docs/dqx/docs/guide/quality_checks_storage.mdx index aff515c5a..ef8fe083f 100644 --- a/docs/dqx/docs/guide/quality_checks_storage.mdx +++ b/docs/dqx/docs/guide/quality_checks_storage.mdx @@ -180,6 +180,13 @@ If you create checks as a list of DQRule objects, you can convert them using the # also works for absolute and relative workspace paths if invoked from Databricks notebook or job checks: list[dict] = dq_engine.load_checks(config=FileChecksStorageConfig(location="checks.yml")) + # load checks from a local file with variable substitution + # see more on variable substitution [here](/docs/guide/quality_checks_definition/#variable-substitution) + checks: list[dict] = dq_engine.load_checks( + FileChecksStorageConfig(location="checks.yml"), + variables={"threshold": 100, "column_name": "total_amount"} + ) + # load checks from arbitrary workspace location using absolute path checks: list[dict] = dq_engine.load_checks(config=WorkspaceFileChecksStorageConfig(location="/Shared/App1/checks.yml")) diff --git a/docs/dqx/docs/reference/engine.mdx b/docs/dqx/docs/reference/engine.mdx index
76bab212b..8866c2ce1 100644 --- a/docs/dqx/docs/reference/engine.mdx +++ b/docs/dqx/docs/reference/engine.mdx @@ -62,7 +62,7 @@ The following table outlines the available methods of the `DQEngine` and their f | `validate_checks` | Validates the provided quality checks to ensure they conform to the expected structure and types. | `checks`: List of checks to validate; `custom_check_functions`: (optional) Dictionary of custom check functions that can be used; `validate_custom_check_functions`: (optional) If True, validates custom check functions (defaults to True). | Yes | | `get_invalid` | Retrieves records from the DataFrame that violate data quality checks (records with warnings and errors). | `df`: Input DataFrame. | Yes | | `get_valid` | Retrieves records from the DataFrame that pass all data quality checks. | `df`: Input DataFrame. | Yes | -| `load_checks` | Loads quality rules (checks) from storage backend. Multiple storage backends are supported including tables, files, workspace files, or installation-managed sources inferred from run config. | `config`: Configuration for loading checks from a storage backend, e.g., `FileChecksStorageConfig` (local YAML/JSON file or workspace file), `WorkspaceFileChecksStorageConfig` (workspace file with absolute path), `VolumeFileChecksStorageConfig` (Unity Catalog Volume YAML/JSON), `TableChecksStorageConfig` (table), `InstallationChecksStorageConfig` (installation-managed backend using `checks_location` in run config). | Yes (only with `FileChecksStorageConfig`) | +| `load_checks` | Loads quality rules (checks) from storage backend. Multiple storage backends are supported including tables, files, workspace files, or installation-managed sources inferred from run config. 
| `config`: Configuration for loading checks from a storage backend, e.g., `FileChecksStorageConfig` (local YAML/JSON file or workspace file), `WorkspaceFileChecksStorageConfig` (workspace file with absolute path), `VolumeFileChecksStorageConfig` (Unity Catalog Volume YAML/JSON), `TableChecksStorageConfig` (table), `InstallationChecksStorageConfig` (installation-managed backend using `checks_location` in run config); `variables`: (optional) dictionary of variables for [variable substitution](/docs/guide/quality_checks_definition/#variable-substitution). | Yes (only with `FileChecksStorageConfig`) | | `save_checks` | Saves quality rules (checks) to a storage backend. Multiple storage backends are supported including tables, files, workspace files, or installation-managed targets inferred from run config. | `checks`: List of checks defined as dictionary; `config`: Configuration for saving checks in a storage backend, e.g., `FileChecksStorageConfig` (local YAML/JSON file or workspace file), `WorkspaceFileChecksStorageConfig` (workspace file with absolute path), `VolumeFileChecksStorageConfig` (Unity Catalog Volume YAML/JSON), `TableChecksStorageConfig` (table), `InstallationChecksStorageConfig` (installation-managed backend using `checks_location` in run config). | Yes (only with `FileChecksStorageConfig`) | | `save_results_in_table` | Saves DataFrames as tables using Unity Catalog table references or storage paths. Supports both batch and streaming writes. For streaming DataFrames, returns a StreamingQuery that can be used to monitor or wait for completion. For batch DataFrames, data is written synchronously and None is returned. 
| `output_df`: (optional) DataFrame containing the output data (batch or streaming); `quarantine_df`: (optional) DataFrame containing invalid data (batch or streaming); `observation`: (optional) Spark Observation tracking summary metrics; `output_config`: `OutputConfig` with location (table name or storage path), mode, format, options, and optional trigger (supports `partition_by` or `cluster_by`, only one applies;); `quarantine_config`: (optional) `OutputConfig` with location (table name or storage path), mode, format, options, and optional trigger (supports `partition_by` or `cluster_by`, only one applies;); `metrics_config`: (optional) `OutputConfig` with location for summary metrics; `rule_set_fingerprint`: (optional) SHA-256 fingerprint of the rule set used for this run, included in summary metrics when metrics_config is provided; `run_config_name`: Name of the run config to use; `install_folder`: (optional) Installation folder where DQX is installed (only required for custom folder); `assume_user`: (optional) If True, assume user installation, otherwise global. | No | | `save_summary_metrics` | Saves quality checking summary metrics to a Delta table. | `observed_metrics`: `dict[str, Any]` Collected summary metrics from Spark Observation; `metrics_config`: `OutputConfig` object with the table name, output mode, and options for the summary metrics data; `input_config`: (optional) `InputConfig` object with the table name for reading the input data; `output_config`: (optional) `OutputConfig` object with the table name for the output data (supports `partition_by` or `cluster_by`, only one applies); `quarantine_config`: (optional) `OutputConfig` object with the table name for the quarantine data (supports `partition_by` or `cluster_by`, only one applies); `checks_location`: (optional) Location where checks are stored; `rule_set_fingerprint`: (optional) SHA-256 fingerprint of the rule set used for this run. 
| No | diff --git a/docs/dqx/docs/reference/quality_checks.mdx b/docs/dqx/docs/reference/quality_checks.mdx index e96040994..adfc71bc1 100644 --- a/docs/dqx/docs/reference/quality_checks.mdx +++ b/docs/dqx/docs/reference/quality_checks.mdx @@ -18,6 +18,10 @@ All rule types, including row-level and dataset-level rules, can be defined and You can explore the implementation details of the check functions [here](https://github.com/databrickslabs/dqx/blob/v0.13.0/src/databricks/labs/dqx/check_funcs.py). + +All declarative check definitions (YAML, JSON, or Delta tables) support **variable substitution** for string-based fields using the `{{ variable_name }}` syntax. This allows for dynamic parameterization of column names, thresholds, and filters at load time. See the [User Guide](/docs/guide/quality_checks_definition/#variable-substitution) for more details. + + ## Row-level checks reference Row-level checks are applied to each row in a PySpark DataFrame. The quality check results are reported for individual rows in the result columns. 
diff --git a/src/databricks/labs/dqx/utils.py b/src/databricks/labs/dqx/utils.py index 141af686f..6f302d5a3 100644 --- a/src/databricks/labs/dqx/utils.py +++ b/src/databricks/labs/dqx/utils.py @@ -6,10 +6,11 @@ from decimal import Decimal from enum import Enum from importlib.util import find_spec -from typing import Any +from typing import Any, TypeVar, overload from fnmatch import fnmatch from pathlib import Path + from pyspark.sql import Column from pyspark.sql.types import StructType @@ -29,6 +30,9 @@ logger = logging.getLogger(__name__) +T = TypeVar("T") + + COLUMN_NORMALIZE_EXPRESSION = re.compile("[^a-zA-Z0-9]+") COLUMN_PATTERN = re.compile(r"Column<'(.*?)(?: AS (\w+))?'>$", re.DOTALL) INVALID_COLUMN_NAME_PATTERN = re.compile(r"[\s,;{}\(\)\n\t=]+") @@ -579,7 +583,23 @@ def _resolve(match_obj: re.Match[str]) -> str: return output -def _substitute_variables(obj: object, variables: dict[str, str]) -> object: +@overload +def _substitute_variables(obj: str, variables: dict[str, str]) -> str: ... + + +@overload +def _substitute_variables(obj: list[T], variables: dict[str, str]) -> list[T]: ... + + +@overload +def _substitute_variables(obj: dict[str, T], variables: dict[str, str]) -> dict[str, T]: ... + + +@overload +def _substitute_variables(obj: T, variables: dict[str, str]) -> T: ... + + +def _substitute_variables(obj: Any, variables: dict[str, str]) -> Any: """Recursively replace **{{ key }}** placeholders in all string values within *obj*. Traverses dicts, lists, and strings. 
Non-string/non-collection values are @@ -649,7 +669,7 @@ def resolve_variables(checks: list[dict], variables: dict[str, VariableValue] | _validate_variable_types(variables) str_variables = {k: str(v) for k, v in variables.items()} - return _substitute_variables(checks, str_variables) # type: ignore[return-value] + return _substitute_variables(checks, str_variables) def get_file_extension(file_path: str | os.PathLike) -> str: diff --git a/tests/integration/test_apply_checks_and_save_in_table.py b/tests/integration/test_apply_checks_and_save_in_table.py index 73fc12e47..0a2816784 100644 --- a/tests/integration/test_apply_checks_and_save_in_table.py +++ b/tests/integration/test_apply_checks_and_save_in_table.py @@ -2254,4 +2254,5 @@ def test_apply_checks_by_metadata_and_save_in_table_loads_checks_from_table(ws, ], schema=expected_schema, ) - assert_df_equality(actual_df, expected_df, ignore_nullable=True) \ No newline at end of file + assert_df_equality(actual_df, expected_df, ignore_nullable=True) + \ No newline at end of file diff --git a/tests/integration/test_apply_checks_variables.py b/tests/integration/test_apply_checks_variables.py deleted file mode 100644 index 96837e372..000000000 --- a/tests/integration/test_apply_checks_variables.py +++ /dev/null @@ -1,275 +0,0 @@ -import dataclasses -from databricks.labs.dqx.engine import DQEngine, DQEngineCore -from databricks.labs.dqx.config import FileChecksStorageConfig -from tests.integration.conftest import ( - REPORTING_COLUMNS, - EXTRA_PARAMS, -) - -SCHEMA = "a: int, b: int, c: int" -EXPECTED_SCHEMA = SCHEMA + REPORTING_COLUMNS - - -def test_load_checks_by_metadata_with_variables(tmp_path): - - checks_yaml = """ - - criticality: error - check: - function: is_not_null_and_not_empty - arguments: - column: "{{ col }}" - """ - checks_file = tmp_path / "checks.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"col": "b"}) - - 
assert checks == [ - { - "criticality": "error", - "check": { - "function": "is_not_null_and_not_empty", - "arguments": {"column": "b"}, - }, - } - ] - - -def test_load_checks_by_metadata_and_split_with_variables(tmp_path): - - checks_yaml = """ - - criticality: error - name: "{{ col }}_null_check" - check: - function: is_not_null_and_not_empty - arguments: - column: "{{ col }}" - - criticality: warn - check: - function: sql_expression - arguments: - expression: "{{ expr_col }} > {{ threshold }}" - """ - checks_file = tmp_path / "checks.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - checks = DQEngineCore.load_checks_from_local_file( - str(checks_file), variables={"col": "b", "expr_col": "a", "threshold": 1} - ) - - assert checks == [ - { - "criticality": "error", - "name": "b_null_check", - "check": { - "function": "is_not_null_and_not_empty", - "arguments": {"column": "b"}, - }, - }, - { - "criticality": "warn", - "check": { - "function": "sql_expression", - "arguments": {"expression": "a > 1"}, - }, - }, - ] - - -def test_load_checks_by_metadata_with_variables_name_and_filter(tmp_path): - - checks_yaml = """ - - criticality: error - name: "{{ col }}_greater_than_{{ threshold }}" - check: - function: sql_expression - arguments: - expression: "{{ col }} > {{ threshold }}" - filter: "{{ filter_col }} IS NOT NULL" - """ - checks_file = tmp_path / "checks.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - checks = DQEngineCore.load_checks_from_local_file( - str(checks_file), variables={"col": "a", "threshold": 1, "filter_col": "a"} - ) - - assert checks == [ - { - "criticality": "error", - "name": "a_greater_than_1", - "check": { - "function": "sql_expression", - "arguments": {"expression": "a > 1"}, - }, - "filter": "a IS NOT NULL", - } - ] - - -def test_validate_checks_with_variables(tmp_path): - checks_yaml = """ - - criticality: "{{ crit }}" - check: - function: is_not_null - arguments: - column: "{{ col }}" - """ - checks_file = 
tmp_path / "checks.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"crit": "error", "col": "b"}) - - status = DQEngine.validate_checks(checks) - assert not status.has_errors - - -def test_validate_checks_with_variables_invalid_after_substitution(tmp_path): - checks_yaml = """ - - criticality: "{{ crit }}" - check: - function: is_not_null - arguments: - column: b - """ - checks_file = tmp_path / "checks.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"crit": "not_a_valid_criticality"}) - - status = DQEngine.validate_checks(checks) - expected_error = ( - "Invalid 'criticality' value: 'not_a_valid_criticality'. Expected 'warn' or 'error'. " - "Check details: {'criticality': 'not_a_valid_criticality', " - "'check': {'function': 'is_not_null', 'arguments': {'column': 'b'}}}" - ) - assert status.errors[0] == expected_error - - -def test_validate_checks_without_variables_fails_on_placeholders(): - checks = [ - { - "criticality": "{{ crit }}", - "check": { - "function": "is_not_null", - "arguments": {"column": "b"}, - }, - }, - ] - - status = DQEngine.validate_checks(checks) - expected_error = ( - "Invalid 'criticality' value: '{{ crit }}'. Expected 'warn' or 'error'. 
" - "Check details: {'criticality': '{{ crit }}', " - "'check': {'function': 'is_not_null', 'arguments': {'column': 'b'}}}" - ) - assert status.errors[0] == expected_error - - -def test_extra_params_variables_substitution_and_overrides(ws, spark, tmp_path): - # Define Checks with placeholders in nested structure (user_metadata) - # and deep inside check arguments - checks_yaml = """ - - criticality: error - name: "id_check" - check: - function: is_not_null - arguments: - column: "{{ target_col }}" - user_metadata: - env: "{{ environment }}" - rule_id: "{{ nested_var }}" - """ - checks_file = tmp_path / "checks_extra.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - - # Setup DQEngine with ExtraParams variables (Default values) - # Default variables: target_col=id, environment=dev, nested_var=old - extra_params = dataclasses.replace( - EXTRA_PARAMS, - variables={ - "target_col": "id", - "environment": "dev", - "nested_var": "old", - }, - ) - dq_engine = DQEngine(ws, spark, extra_params=extra_params) - - # Load Checks with overrides - # target_col: id (from ExtraParams default) - # environment: prod (per-call override wins) - # nested_var: new (per-call override wins) - config = FileChecksStorageConfig(location=str(checks_file)) - checks = dq_engine.load_checks(config, variables={"environment": "prod", "nested_var": "new"}) - - # Verify substitution (Structural check) - assert checks[0]["check"]["arguments"]["column"] == "id" - assert checks[0]["user_metadata"]["env"] == "prod" - assert checks[0]["user_metadata"]["rule_id"] == "new" - - -def test_extra_params_variables_conflict_resolution(ws, spark, tmp_path): - # Verify that a conflict where a variable is defined in both ExtraParams and per-call - # results in the per-call variable taking precedence. - - # 1. Setup DQEngine with ExtraParams variables - extra_params = dataclasses.replace(EXTRA_PARAMS, variables={"my_var": "default"}) - dq_engine = DQEngine(ws, spark, extra_params=extra_params) - - # 2. 
File with placeholder - checks_yaml = """ - - name: "check_{{ my_var }}" - check: - function: is_not_null - arguments: - column: id - """ - checks_file = tmp_path / "checks_conflict.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - config = FileChecksStorageConfig(location=str(checks_file)) - - # 3. Load with override - checks = dq_engine.load_checks(config, variables={"my_var": "override"}) - - # 4. Verify that "override" won - assert checks[0]["name"] == "check_override" - - -def test_extra_params_variables_fallback_to_defaults(ws, spark, tmp_path): - # Verify that if a variable is NOT provided in the call, it falls back to ExtraParams. - - # 1. Setup DQEngine with ExtraParams variables - extra_params = dataclasses.replace(EXTRA_PARAMS, variables={"my_var": "default"}) - dq_engine = DQEngine(ws, spark, extra_params=extra_params) - - # 2. File with placeholder - checks_yaml = """ - - name: "check_{{ my_var }}" - check: - function: is_not_null - arguments: - column: id - """ - checks_file = tmp_path / "checks_fallback.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - config = FileChecksStorageConfig(location=str(checks_file)) - - # 3. Load WITHOUT specific variables in the call - should use engine defaults - checks = dq_engine.load_checks(config) - - # 4. 
Verify that "default" was used - assert checks[0]["name"] == "check_default" - - -def test_load_checks_with_missing_variable(tmp_path): - - checks_yaml = """ - - criticality: error - check: - function: is_not_null - arguments: - column: "{{ missing_col }}" - """ - checks_file = tmp_path / "checks_missing.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - - # Load file, which will warn and leave the placeholder - checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"different_var": "val"}) - - # Assert that the placeholder was left in the metadata (unresolved variable) - assert checks[0]["check"]["arguments"]["column"] == "{{ missing_col }}" diff --git a/tests/unit/test_checks_validation.py b/tests/unit/test_checks_validation.py index 936a2d814..2467b4a84 100644 --- a/tests/unit/test_checks_validation.py +++ b/tests/unit/test_checks_validation.py @@ -1,5 +1,5 @@ from pyspark.sql.functions import col -from databricks.labs.dqx.engine import DQEngine +from databricks.labs.dqx.engine import DQEngine, DQEngineCore def dummy_func(column): @@ -456,3 +456,60 @@ def test_is_in_range_float_arguments(): ] status = DQEngine.validate_checks(checks) assert not status.has_errors + + +def test_validate_checks_with_variables(tmp_path): + checks_yaml = """ + - criticality: "{{ crit }}" + check: + function: is_not_null + arguments: + column: "{{ col }}" + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"crit": "error", "col": "b"}) + + status = DQEngine.validate_checks(checks) + assert not status.has_errors + + +def test_validate_checks_with_variables_invalid_after_substitution(tmp_path): + checks_yaml = """ + - criticality: "{{ crit }}" + check: + function: is_not_null + arguments: + column: b + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = 
DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"crit": "not_a_valid_criticality"}) + + status = DQEngine.validate_checks(checks) + expected_error = ( + "Invalid 'criticality' value: 'not_a_valid_criticality'. Expected 'warn' or 'error'. " + "Check details: {'criticality': 'not_a_valid_criticality', " + "'check': {'function': 'is_not_null', 'arguments': {'column': 'b'}}}" + ) + assert status.errors[0] == expected_error + + +def test_validate_checks_without_variables_fails_on_placeholders(): + checks = [ + { + "criticality": "{{ crit }}", + "check": { + "function": "is_not_null", + "arguments": {"column": "b"}, + }, + }, + ] + + status = DQEngine.validate_checks(checks) + expected_error = ( + "Invalid 'criticality' value: '{{ crit }}'. Expected 'warn' or 'error'. " + "Check details: {'criticality': '{{ crit }}', " + "'check': {'function': 'is_not_null', 'arguments': {'column': 'b'}}}" + ) + assert status.errors[0] == expected_error diff --git a/tests/unit/test_load_checks.py b/tests/unit/test_load_checks.py index 8b79cd14d..ef1e15d05 100644 --- a/tests/unit/test_load_checks.py +++ b/tests/unit/test_load_checks.py @@ -248,3 +248,92 @@ def test_load_checks_per_call_overrides_engine_defaults(): assert checks == [ {"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "default_col"}}}, ] + + +def test_load_checks_by_metadata_and_split_with_variables(tmp_path): + + checks_yaml = """ + - criticality: error + name: "{{ col }}_null_check" + check: + function: is_not_null_and_not_empty + arguments: + column: "{{ col }}" + - criticality: warn + check: + function: sql_expression + arguments: + expression: "{{ expr_col }} > {{ threshold }}" + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file( + str(checks_file), variables={"col": "b", "expr_col": "a", "threshold": 1} + ) + + assert checks == [ + { + "criticality": "error", + 
"name": "b_null_check", + "check": { + "function": "is_not_null_and_not_empty", + "arguments": {"column": "b"}, + }, + }, + { + "criticality": "warn", + "check": { + "function": "sql_expression", + "arguments": {"expression": "a > 1"}, + }, + }, + ] + + +def test_load_checks_by_metadata_with_variables_name_and_filter(tmp_path): + + checks_yaml = """ + - criticality: error + name: "{{ col }}_greater_than_{{ threshold }}" + check: + function: sql_expression + arguments: + expression: "{{ col }} > {{ threshold }}" + filter: "{{ filter_col }} IS NOT NULL" + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + checks = DQEngineCore.load_checks_from_local_file( + str(checks_file), variables={"col": "a", "threshold": 1, "filter_col": "a"} + ) + + assert checks == [ + { + "criticality": "error", + "name": "a_greater_than_1", + "check": { + "function": "sql_expression", + "arguments": {"expression": "a > 1"}, + }, + "filter": "a IS NOT NULL", + } + ] + + +def test_load_checks_with_missing_variable(tmp_path): + + checks_yaml = """ + - criticality: error + check: + function: is_not_null + arguments: + column: "{{ missing_col }}" + """ + checks_file = tmp_path / "checks_missing.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + + # Load file, which will warn and leave the placeholder + checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"different_var": "val"}) + + # Assert that the placeholder was left in the metadata (unresolved variable) + assert checks[0]["check"]["arguments"]["column"] == "{{ missing_col }}" From 7fdd172fe45b779e4a274fd4efa3b3516c9979b7 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Sun, 5 Apr 2026 12:12:26 +0200 Subject: [PATCH 11/24] add docs for variable parametrization, fix dqx demo --- demos/dqx_quick_start_demo_library.py | 18 ----- .../docs/guide/additional_configuration.mdx | 72 +++++++++++++++++++ .../docs/guide/quality_checks_definition.mdx | 6 ++ 3 files 
changed, 78 insertions(+), 18 deletions(-) diff --git a/demos/dqx_quick_start_demo_library.py b/demos/dqx_quick_start_demo_library.py index c6901b2af..6b586428e 100644 --- a/demos/dqx_quick_start_demo_library.py +++ b/demos/dqx_quick_start_demo_library.py @@ -125,24 +125,6 @@ print(f"Checks from YAML: {status}") # COMMAND ---------- - -# MAGIC %md -# MAGIC ### Variable Substitution -# MAGIC -# MAGIC You can parameterize your YAML checks using `{{ variable }}` syntax and resolve them at load time. -# MAGIC -# MAGIC ```python -# MAGIC # Example: Load checks with a dynamic age limit -# MAGIC # -# MAGIC # from databricks.labs.dqx.config import FileChecksStorageConfig -# MAGIC # -# MAGIC # resolved_checks = dq_engine.load_checks( -# MAGIC # config=FileChecksStorageConfig(location="checks.yml"), -# MAGIC # variables={"max_age": 120} -# MAGIC # ) -# MAGIC ``` -# MAGIC -# COMMAND ---------- # MAGIC %md # MAGIC ### Setup `DQEngine` diff --git a/docs/dqx/docs/guide/additional_configuration.mdx b/docs/dqx/docs/guide/additional_configuration.mdx index 69701e6fb..fea9884f9 100644 --- a/docs/dqx/docs/guide/additional_configuration.mdx +++ b/docs/dqx/docs/guide/additional_configuration.mdx @@ -171,3 +171,75 @@ from pyspark.sql import functions as F skipped = checked_df.select(F.explode("_errors").alias("e")).filter(F.col("e.skipped") == True) ``` + +## Defining default variables for substitution + +DQX allows you to define engine-level defaults for variables used in declarative check definitions (YAML, JSON, or Delta tables). These defaults are automatically applied during `load_checks` unless overridden by the per-call `variables` parameter. 
+ + + ```python + from databricks.labs.dqx.engine import DQEngine + from databricks.labs.dqx.config import ExtraParams, FileChecksStorageConfig + from databricks.sdk import WorkspaceClient + + # Initialize engine with default variables + dq_engine = DQEngine( + WorkspaceClient(), + extra_params=ExtraParams( + variables={ + "min_temp": 0, + "max_temp": 50, + "region": "GLOBAL" + } + ) + ) + + # Load checks - uses 'min_temp' and 'max_temp' from defaults, + # but overrides 'region' specifically for this call. + resolved_checks = dq_engine.load_checks( + config=FileChecksStorageConfig(location="checks.yml"), + variables={"region": "EMEA"}, + ) + ``` + + + You can set the following fields in the [configuration file](/docs/installation/#configuration-file) to define default variables for substitution when using DQX workflows: + ```yaml + extra_params: + variables: + min_temp: 0 + max_temp: 50 + region: GLOBAL + ``` + + + +## Overwriting run metadata + +By default, DQX automatically generates a unique `run_id` for each engine instance and uses the current timestamp as the `run_time`. You can manually overwrite these values using `ExtraParams` if you need to align DQX results with external systems or re-run checks for a specific historical point in time.
+ + + + ```python + from databricks.labs.dqx.engine import DQEngine + from databricks.labs.dqx.config import ExtraParams + from databricks.sdk import WorkspaceClient + + extra_params = ExtraParams( + run_id_overwrite="custom-execution-id-123", + run_time_overwrite="2024-01-01T12:00:00Z" + ) + + dq_engine = DQEngine(WorkspaceClient(), extra_params=extra_params) + ``` + + + You can set the following fields in the [configuration file](/docs/installation/#configuration-file) to overwrite the run metadata when using DQX workflows: + ```yaml + extra_params: + run_id_overwrite: custom-execution-id-123 + run_time_overwrite: 2024-01-01T12:00:00Z + ``` + + + diff --git a/docs/dqx/docs/guide/quality_checks_definition.mdx b/docs/dqx/docs/guide/quality_checks_definition.mdx index c2c97b133..83e8147eb 100644 --- a/docs/dqx/docs/guide/quality_checks_definition.mdx +++ b/docs/dqx/docs/guide/quality_checks_definition.mdx @@ -774,6 +774,12 @@ Variable substitution is only available when defining checks declaratively (as d +## Default Variables + +In addition to specifying variables during the load process, you can define engine-level defaults using the `ExtraParams` class. These constants are automatically applied to all checks unless explicitly overridden. + +For technical details and configuration examples, see [Default Variables](/docs/guide/additional_configuration#defining-default-variables-for-substitution) in the Additional Configuration guide. + ## Validating syntax of quality checks You can validate the syntax of checks loaded from a storage system or checks defined programmatically before applying them. 
From a3a21a6272f6367ba31ecd3378a6d3ef86b3a35b Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Sun, 5 Apr 2026 12:27:26 +0200 Subject: [PATCH 12/24] fix tests duplication --- tests/unit/test_load_checks.py | 18 ------------------ tests/unit/test_utils.py | 13 ------------- 2 files changed, 31 deletions(-) diff --git a/tests/unit/test_load_checks.py b/tests/unit/test_load_checks.py index ef1e15d05..24a87e06f 100644 --- a/tests/unit/test_load_checks.py +++ b/tests/unit/test_load_checks.py @@ -319,21 +319,3 @@ def test_load_checks_by_metadata_with_variables_name_and_filter(tmp_path): } ] - -def test_load_checks_with_missing_variable(tmp_path): - - checks_yaml = """ - - criticality: error - check: - function: is_not_null - arguments: - column: "{{ missing_col }}" - """ - checks_file = tmp_path / "checks_missing.yml" - checks_file.write_text(checks_yaml, encoding="utf-8") - - # Load file, which will warn and leave the placeholder - checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"different_var": "val"}) - - # Assert that the placeholder was left in the metadata (unresolved variable) - assert checks[0]["check"]["arguments"]["column"] == "{{ missing_col }}" diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index b60bc1e41..119d7786c 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -544,13 +544,6 @@ def test_resolve_variables_replaces_all_string_fields(): assert result[0]["filter"] == "status = 'active'" -def test_resolve_variables_none_variables(): - checks = [{"name": "{{ x }}"}] - result = resolve_variables(checks, None) - assert result is checks # same object, no copy - assert result[0]["name"] == "{{ x }}" - - def test_resolve_variables_empty_variables(): checks = [{"name": "{{ x }}"}] result = resolve_variables(checks, {}) @@ -818,12 +811,6 @@ def test_resolve_variables_none_vars_no_warning(caplog): assert not any("Unresolved placeholder" in msg for msg in caplog.messages) -def 
test_resolve_variables_whitespace_in_key(): - checks = [{"col": "{{col_a}}"}] - result = resolve_variables(checks, {"col_a": "replaced"}) - assert result[0]["col"] == "replaced" - - def test_resolve_variables_unicode_values(): checks = [{"col": "{{ col }}"}] result = resolve_variables(checks, {"col": "prénom"}) From 908873cee60137068b90d5a717b3ca72378baf1d Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Sun, 5 Apr 2026 14:32:17 +0200 Subject: [PATCH 13/24] fix test readded test_extra_params_variables_substitution_and_overrides as unit test --- tests/unit/test_load_checks.py | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/unit/test_load_checks.py b/tests/unit/test_load_checks.py index 24a87e06f..d1d4d870a 100644 --- a/tests/unit/test_load_checks.py +++ b/tests/unit/test_load_checks.py @@ -250,6 +250,41 @@ def test_load_checks_per_call_overrides_engine_defaults(): ] +def test_extra_params_variables_substitution_and_overrides(tmp_path): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + checks_yaml = """ + - criticality: error + name: "id_check" + check: + function: is_not_null + arguments: + column: "{{ target_col }}" + user_metadata: + env: "{{ environment }}" + rule_id: "{{ nested_var }}" + """ + checks_file = tmp_path / "checks_extra.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + + raw_checks = DQEngineCore.load_checks_from_local_file(str(checks_file)) + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + mock_handler.load.return_value = raw_checks + + extra_params = ExtraParams(variables={"target_col": "id", "environment": "dev", "nested_var": "old"}) + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory, extra_params=extra_params) + config = FileChecksStorageConfig(location=str(checks_file)) + + checks = 
engine.load_checks(config, variables={"environment": "prod", "nested_var": "new"}) + + assert checks[0]["check"]["arguments"]["column"] == "id" + assert checks[0]["user_metadata"]["env"] == "prod" + assert checks[0]["user_metadata"]["rule_id"] == "new" + + def test_load_checks_by_metadata_and_split_with_variables(tmp_path): checks_yaml = """ From 4446b52975ca533fdaff1419125ce3a7e60a5222 Mon Sep 17 00:00:00 2001 From: fedeflowers Date: Tue, 7 Apr 2026 00:19:07 +0200 Subject: [PATCH 14/24] add doc warning and test with empty dictionary --- docs/dqx/docs/guide/additional_configuration.mdx | 3 +++ tests/unit/test_utils.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/docs/dqx/docs/guide/additional_configuration.mdx b/docs/dqx/docs/guide/additional_configuration.mdx index fea9884f9..8ac2c8d09 100644 --- a/docs/dqx/docs/guide/additional_configuration.mdx +++ b/docs/dqx/docs/guide/additional_configuration.mdx @@ -203,6 +203,9 @@ DQX allows you to define engine-level defaults for variables used in declarative ``` + :::warning + Variable substitution is not currently supported in DQX installable workflows. Variables defined in the configuration file will be stored but not applied during workflow execution. 
+ ::: You can set the following fields in the [configuration file](/docs/installation/#configuration-file) to define default variables for substitution when using DQX workflows: ```yaml extra_params: diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 119d7786c..8b2aed714 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -810,6 +810,11 @@ def test_resolve_variables_none_vars_no_warning(caplog): assert result[0]["col"] == "{{ x }}" assert not any("Unresolved placeholder" in msg for msg in caplog.messages) + with caplog.at_level(logging.WARNING): + result = resolve_variables(checks, {}) + assert result[0]["col"] == "{{ x }}" + assert not any("Unresolved placeholder" in msg for msg in caplog.messages) + def test_resolve_variables_unicode_values(): checks = [{"col": "{{ col }}"}] From 8a9eb799062cd77b23aa23432d37eab09007a0b7 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 7 Apr 2026 13:24:46 -0400 Subject: [PATCH 15/24] Update docs and fmt --- .../docs/guide/additional_configuration.mdx | 17 +++-------- src/databricks/labs/dqx/base.py | 4 +-- src/databricks/labs/dqx/engine.py | 8 ++--- src/databricks/labs/dqx/utils.py | 29 ++++++++++--------- .../test_apply_checks_and_save_in_table.py | 1 - tests/unit/test_load_checks.py | 1 - 6 files changed, 25 insertions(+), 35 deletions(-) diff --git a/docs/dqx/docs/guide/additional_configuration.mdx b/docs/dqx/docs/guide/additional_configuration.mdx index 8ac2c8d09..f165e9498 100644 --- a/docs/dqx/docs/guide/additional_configuration.mdx +++ b/docs/dqx/docs/guide/additional_configuration.mdx @@ -202,21 +202,12 @@ DQX allows you to define engine-level defaults for variables used in declarative ) ``` - - :::warning - Variable substitution is not currently supported in DQX installable workflows. Variables defined in the configuration file will be stored but not applied during workflow execution. 
- ::: - You can set the following fields in the [configuration file](/docs/installation/#configuration-file) to define default variables for substitution when using DQX workflows: - ```yaml - extra_params: - variables: - min_temp: 0 - max_temp: 50 - region: GLOBAL - ``` - + +Variable substitution is not currently supported in DQX installable workflows. Variables cam be defined and stored as YAML in the configuration file but will not be applied during workflow execution. + + ## Overwriting run metadata By default, DQX automatically generates a unique `run_id` for each engine instance and uses the current timestamp as the `run_time`. You can manually overwrite these values using `ExtraParams` if you need to align DQX results with external systems or re-run checks for a specific historical point in time. diff --git a/src/databricks/labs/dqx/base.py b/src/databricks/labs/dqx/base.py index 74d77f9e6..7a5c6a8d1 100644 --- a/src/databricks/labs/dqx/base.py +++ b/src/databricks/labs/dqx/base.py @@ -189,8 +189,8 @@ def load_checks_from_local_file(filepath: str, variables: dict[str, VariableValu Args: filepath: Path to a file containing checks definitions. - variables: Optional mapping of placeholder names to replacement values. Replaces **{{ key }}** - placeholders in all string values of the check definitions before returning. + variables: Optional mapping of placeholder names to replacement values. Replaces placeholders + in all string values of the check definitions before returning. Returns: List of DQ rules (checks). diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index 0c0d149e3..df6fc6402 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -349,8 +349,8 @@ def load_checks_from_local_file(filepath: str, variables: dict[str, VariableValu Args: filepath: Path to a file containing checks definitions. - variables: Optional mapping of placeholder names to replacement values. 
Replaces **{{ key }}** - placeholders in all string values of the check definitions before returning. + variables: Optional mapping of placeholder names to replacement values. Replaces placeholders + in all string values of the check definitions before returning. Returns: List of DQ rules. @@ -1205,8 +1205,8 @@ def load_checks( Args: config: Configuration object describing the storage backend. - variables: Optional mapping of placeholder names to replacement values. Replaces **{{ key }}** - placeholders in all string values of the check definitions before returning. + variables: Optional mapping of placeholder names to replacement values. Replaces placeholders + in all string values of the check definitions before returning. Returns: List of DQ rules (checks) represented as dictionaries. diff --git a/src/databricks/labs/dqx/utils.py b/src/databricks/labs/dqx/utils.py index 6f302d5a3..c02f83cb2 100644 --- a/src/databricks/labs/dqx/utils.py +++ b/src/databricks/labs/dqx/utils.py @@ -635,27 +635,28 @@ def _validate_variable_types(variables: dict[str, VariableValue]) -> None: def resolve_variables(checks: list[dict], variables: dict[str, VariableValue] | None) -> list[dict]: """Resolve variable substitution in check definitions. - Replaces **{{ key }}** placeholders in all string values of *checks* with the - corresponding values from *variables*. The original *checks* list is never mutated. + Replaces placeholders in all string values of *checks* with the corresponding values + from *variables*. - Variable values must be scalar types (**str**, **int**, **float**, **bool**, - **Decimal**, **datetime.date**, **datetime.datetime**, **datetime.time**). - Non-string scalars are converted via **str()** — for example, **{"threshold": 10}** becomes **"10"** in - the substituted string. Collection types (**list**, **dict**, **set**, etc.) 
are - rejected with :class:`~databricks.labs.dqx.errors.InvalidParameterError` because - their **str()** representation is rarely meaningful in SQL or column expressions. + Variable values must be scalar types (e.g. *str*, *int*, *float*, *bool*, *Decimal*, + *datetime.date*, *datetime.datetime*, *datetime.time*). Non-string scalars are + converted to strings via *str()* in the substituted string. Collection type + variables (e.g. *list*, *dict*, *set*, etc.) are rejected with + *databricks.labs.dqx.errors.InvalidParameterError* because their string representation + is rarely meaningful in SQL or column expressions. - Logs a warning for any **{{ ... }}** placeholders that remain unresolved after - substitution (e.g. misspelled variable names). + Logs a warning for any placeholders that remain unresolved after substitution + (e.g. misspelled variable names). - **Security note:** variable values substituted into **sql_expression** checks are - not sanitized and are passed directly to **F.expr()**. Callers must ensure that - variable values come from trusted sources to prevent SQL injection. + Note: + Variable values substituted into *sql_expression* checks are not sanitized and are + passed directly to *F.expr()*. Callers must **ensure variable values come from trusted + sources** to prevent SQL injection. Args: checks: List of check definition dictionaries (metadata format). variables: Mapping of placeholder names to scalar replacement values. - If **None** or empty the checks are returned unchanged. + If *None* or empty the checks are returned unchanged. 
Returns: A new list of check dicts with placeholders resolved, or the original list diff --git a/tests/integration/test_apply_checks_and_save_in_table.py b/tests/integration/test_apply_checks_and_save_in_table.py index 0a2816784..3f0514b92 100644 --- a/tests/integration/test_apply_checks_and_save_in_table.py +++ b/tests/integration/test_apply_checks_and_save_in_table.py @@ -2255,4 +2255,3 @@ def test_apply_checks_by_metadata_and_save_in_table_loads_checks_from_table(ws, schema=expected_schema, ) assert_df_equality(actual_df, expected_df, ignore_nullable=True) - \ No newline at end of file diff --git a/tests/unit/test_load_checks.py b/tests/unit/test_load_checks.py index d1d4d870a..3e6c55cca 100644 --- a/tests/unit/test_load_checks.py +++ b/tests/unit/test_load_checks.py @@ -353,4 +353,3 @@ def test_load_checks_by_metadata_with_variables_name_and_filter(tmp_path): "filter": "a IS NOT NULL", } ] - From ffd982ab75d9e39a4fa7d9b754cd8975863e43e7 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 14 Apr 2026 12:59:11 +0200 Subject: [PATCH 16/24] Apply suggestion from @mwojtyczka --- docs/dqx/docs/guide/additional_configuration.mdx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/dqx/docs/guide/additional_configuration.mdx b/docs/dqx/docs/guide/additional_configuration.mdx index f165e9498..5d4b14645 100644 --- a/docs/dqx/docs/guide/additional_configuration.mdx +++ b/docs/dqx/docs/guide/additional_configuration.mdx @@ -205,7 +205,9 @@ DQX allows you to define engine-level defaults for variables used in declarative -Variable substitution is not currently supported in DQX installable workflows. Variables cam be defined and stored as YAML in the configuration file but will not be applied during workflow execution. +Variable substitution is not currently supported in DQX installable workflows. Variables can be defined and stored as YAML in the configuration file but will not be applied during workflow execution. 
+ +Variable substitution is only available when defining checks declaratively (as dictionaries or in files/tables). It is not supported when using DQX classes (e.g., `DQRowRule`) directly. ## Overwriting run metadata From 1eae942ff12b2a02b21aaf47a66fb3fb2190cbe0 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 14 Apr 2026 13:01:23 +0200 Subject: [PATCH 17/24] Apply suggestions from code review Code review feedback implementation Co-authored-by: Marcin Wojtyczka --- docs/dqx/docs/guide/additional_configuration.mdx | 2 +- docs/dqx/docs/guide/quality_checks_definition.mdx | 2 +- docs/dqx/docs/guide/quality_checks_storage.mdx | 3 +-- src/databricks/labs/dqx/engine.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/dqx/docs/guide/additional_configuration.mdx b/docs/dqx/docs/guide/additional_configuration.mdx index 5d4b14645..12d1e6d50 100644 --- a/docs/dqx/docs/guide/additional_configuration.mdx +++ b/docs/dqx/docs/guide/additional_configuration.mdx @@ -198,7 +198,7 @@ DQX allows you to define engine-level defaults for variables used in declarative # Load checks - uses 'min_temp' and 'max_temp' from defaults, # but overrides 'region' specifically for this call. resolved_checks = dq_engine.load_checks( - config=FileChecksStorageConfig(location="checks.yml"), + config=FileChecksStorageConfig(location="checks.yml"), variables={ "region": "EMEA" } ) ``` diff --git a/docs/dqx/docs/guide/quality_checks_definition.mdx b/docs/dqx/docs/guide/quality_checks_definition.mdx index 83e8147eb..93a4b11a4 100644 --- a/docs/dqx/docs/guide/quality_checks_definition.mdx +++ b/docs/dqx/docs/guide/quality_checks_definition.mdx @@ -722,7 +722,7 @@ If `run_config_name` is not provided, "default" is used. Typically, the input ta ## Variable Substitution -DQX supports variable substitution in declarative check definitions (YAML, JSON, or Delta tables). 
This allows you to parameterize your quality rules and inject values at **load time** via the `variables` parameter in `load_checks`. +DQX supports variable substitution in declarative check definitions (YAML, JSON, or Delta tables). This allows you to parameterize your quality rules and inject values at **load time** from engine-level defaults and/or via the `variables` parameter in `load_checks`. ### Syntax and Scope diff --git a/docs/dqx/docs/guide/quality_checks_storage.mdx b/docs/dqx/docs/guide/quality_checks_storage.mdx index ef8fe083f..a1ca120cb 100644 --- a/docs/dqx/docs/guide/quality_checks_storage.mdx +++ b/docs/dqx/docs/guide/quality_checks_storage.mdx @@ -181,7 +181,6 @@ If you create checks as a list of DQRule objects, you can convert them using the checks: list[dict] = dq_engine.load_checks(config=FileChecksStorageConfig(location="checks.yml")) # load checks from a local file with variable substitution - # see more on variable substitution [here](/docs/guide/quality_checks_definition/#variable-substitution) checks: list[dict] = dq_engine.load_checks( FileChecksStorageConfig(location="checks.yml"), variables={"threshold": 100, "column_name": "total_amount"} @@ -215,7 +214,7 @@ If you create checks as a list of DQRule objects, you can convert them using the # validate loaded checks assert not dq_engine.validate_checks(checks).has_errors - ``` + When using the quality checker or e2e workflows to apply quality checks, they load checks from the `checks_location` field defined in the [configuration file](/docs/installation/#configuration-file). 
diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index df6fc6402..326aaaace 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -1217,7 +1217,7 @@ def load_checks( handler = self._checks_handler_factory.create(config) checks = handler.load(config) merged = self._merge_variables(variables) - return resolve_variables(checks=checks, variables=merged) + return resolve_variables(checks=checks, variables=merged_variables) def _merge_variables(self, per_call: dict[str, VariableValue] | None) -> dict[str, VariableValue] | None: """Merge engine-level default variables with per-call overrides. From a30271bbdc6cabd89b8916541c819f23f2880bae Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 14 Apr 2026 13:40:19 +0200 Subject: [PATCH 18/24] added tests --- src/databricks/labs/dqx/engine.py | 2 +- src/databricks/labs/dqx/utils.py | 5 ++- tests/unit/test_load_checks.py | 66 +++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index 326aaaace..3db11b1c1 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -1216,7 +1216,7 @@ def load_checks( """ handler = self._checks_handler_factory.create(config) checks = handler.load(config) - merged = self._merge_variables(variables) + merged_variables = self._merge_variables(variables) return resolve_variables(checks=checks, variables=merged_variables) def _merge_variables(self, per_call: dict[str, VariableValue] | None) -> dict[str, VariableValue] | None: diff --git a/src/databricks/labs/dqx/utils.py b/src/databricks/labs/dqx/utils.py index c02f83cb2..0aa57aa57 100644 --- a/src/databricks/labs/dqx/utils.py +++ b/src/databricks/labs/dqx/utils.py @@ -579,7 +579,10 @@ def _resolve(match_obj: re.Match[str]) -> str: unresolved: list[str] = [] output = _UNRESOLVED_PLACEHOLDER_PATTERN.sub(_resolve, text) if unresolved: - 
logger.warning(f"Unresolved placeholders found: {unresolved}") + logger.warning( + f"Unresolved placeholders found: {unresolved}. " + f"They may be resolved at runtime for certain checks (e.g. sql_query)." + ) return output diff --git a/tests/unit/test_load_checks.py b/tests/unit/test_load_checks.py index 3e6c55cca..7c05a3bb1 100644 --- a/tests/unit/test_load_checks.py +++ b/tests/unit/test_load_checks.py @@ -325,6 +325,72 @@ def test_load_checks_by_metadata_and_split_with_variables(tmp_path): ] +def test_load_checks_sql_query_no_variables(tmp_path, caplog): + checks_yaml = """ + - criticality: error + check: + function: sql_query + arguments: + query: "SELECT id, COUNT(*) > 0 AS condition FROM {{ input_view }} GROUP BY id" + merge_columns: + - id + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + + with caplog.at_level(logging.WARNING): + checks = DQEngineCore.load_checks_from_local_file(str(checks_file)) + + assert not any("input_view" in msg for msg in caplog.messages) + + assert checks == [ + { + "criticality": "error", + "check": { + "function": "sql_query", + "arguments": { + "query": "SELECT id, COUNT(*) > 0 AS condition FROM {{ input_view }} GROUP BY id", + "merge_columns": ["id"], + }, + }, + }, + ] + + +def test_load_checks_sql_query_with_variables(tmp_path, caplog): + checks_yaml = """ + - criticality: "{{ crit }}" + name: "count_check" + check: + function: sql_query + arguments: + query: "SELECT id, COUNT(*) > 0 AS condition FROM {{ input_view }} GROUP BY id" + merge_columns: + - id + """ + checks_file = tmp_path / "checks.yml" + checks_file.write_text(checks_yaml, encoding="utf-8") + + with caplog.at_level(logging.WARNING): + checks = DQEngineCore.load_checks_from_local_file(str(checks_file), variables={"crit": "error"}) + + assert checks == [ + { + "criticality": "error", + "name": "count_check", + "check": { + "function": "sql_query", + "arguments": { + "query": "SELECT id, COUNT(*) > 0 AS 
condition FROM {{ input_view }} GROUP BY id", + "merge_columns": ["id"], + }, + }, + }, + ] + # {{ input_view }} is left unresolved — it is resolved at runtime by sql_query itself + assert any("input_view" in msg for msg in caplog.messages) + + def test_load_checks_by_metadata_with_variables_name_and_filter(tmp_path): checks_yaml = """ From 07a09682dece3195443e0bc82eaa017074d4b326 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 14 Apr 2026 16:23:37 +0200 Subject: [PATCH 19/24] added vars resolution when saving checks and discourage using vars for criticality --- .../docs/guide/additional_configuration.mdx | 15 +++- .../docs/guide/quality_checks_definition.mdx | 37 +++++--- docs/dqx/docs/reference/engine.mdx | 2 +- src/databricks/labs/dqx/engine.py | 18 +++- .../test_save_and_load_checks_from_table.py | 53 +++++++++++ tests/unit/test_load_checks.py | 89 +++++++++++++++++++ 6 files changed, 198 insertions(+), 16 deletions(-) diff --git a/docs/dqx/docs/guide/additional_configuration.mdx b/docs/dqx/docs/guide/additional_configuration.mdx index 12d1e6d50..e078f9902 100644 --- a/docs/dqx/docs/guide/additional_configuration.mdx +++ b/docs/dqx/docs/guide/additional_configuration.mdx @@ -174,13 +174,13 @@ skipped = checked_df.select(F.explode("_errors").alias("e")).filter(F.col("e.ski ## Defining default variables for substitution -DQX allows you to define engine-level defaults for variables used in declarative check definitions (YAML, JSON, or Delta tables). These defaults are automatically applied during `load_checks` unless overridden by the per-call `variables` parameter. +DQX allows you to define engine-level defaults for variables used in declarative check definitions (YAML, JSON, or Delta tables). These defaults are automatically applied during `load_checks` and `save_checks` unless overridden by the per-call `variables` parameter. 
```python from databricks.labs.dqx.engine import DQEngine - from databricks.labs.dqx.config import ExtraParams + from databricks.labs.dqx.config import ExtraParams, FileChecksStorageConfig, TableChecksStorageConfig from databricks.sdk import WorkspaceClient # Initialize engine with default variables @@ -198,7 +198,16 @@ DQX allows you to define engine-level defaults for variables used in declarative # Load checks - uses 'min_temp' and 'max_temp' from defaults, # but overrides 'region' specifically for this call. resolved_checks = dq_engine.load_checks( - config=FileChecksStorageConfig(location="checks.yml"), variables={ "region": "EMEA" } + config=FileChecksStorageConfig(location="checks.yml"), + variables={"region": "EMEA"}, + ) + + # Save checks - resolves variables before computing fingerprints and persisting. + # Uses 'min_temp' and 'max_temp' from defaults, overrides 'region' for this call. + dq_engine.save_checks( + checks=checks, + config=TableChecksStorageConfig(location="catalog.schema.checks_table"), + variables={"region": "EMEA"}, ) ``` diff --git a/docs/dqx/docs/guide/quality_checks_definition.mdx b/docs/dqx/docs/guide/quality_checks_definition.mdx index 93a4b11a4..b585beb71 100644 --- a/docs/dqx/docs/guide/quality_checks_definition.mdx +++ b/docs/dqx/docs/guide/quality_checks_definition.mdx @@ -722,7 +722,7 @@ If `run_config_name` is not provided, "default" is used. Typically, the input ta ## Variable Substitution -DQX supports variable substitution in declarative check definitions (YAML, JSON, or Delta tables). This allows you to parameterize your quality rules and inject values at **load time** from engine-level defaults and/or via the `variables` parameter in `load_checks`. +DQX supports variable substitution in declarative check definitions (YAML, JSON, or Delta tables). 
This allows you to parameterize your quality rules and inject values at **load time** or **save time** from engine-level defaults and/or via the `variables` parameter in `load_checks` or `save_checks`. ### Syntax and Scope @@ -732,9 +732,17 @@ Placeholders are defined using the `{{ variable_name }}` syntax. Variable substi - `check` function arguments (`arguments`) and column names (`for_each_column`) - any other top-level or nested string field + +The `criticality` field only accepts fixed values (`error` or `warn`). Do not use variable placeholders for `criticality` — the resolved value must be a valid criticality and substituting it defeats the purpose of having an explicit severity level in the check definition. + + ### Resolution -Variables are resolved at **load time** when the checks are loaded from the storage backend. To resolve variables, pass a dictionary to the `variables` parameter of the `load_checks` method. +Variables are resolved when checks are loaded or saved via the engine. To resolve variables, pass a dictionary to the `variables` parameter of `load_checks` or `save_checks`. You can choose whether to provide variables when loading or saving checks. + + +When using `save_checks` with variables, placeholders are resolved **before** computing rule fingerprints and persisting. This ensures that stored checks and their fingerprints reflect the actual resolved check logic. Without resolving at save time, fingerprints would be computed on unresolved `{{ }}` placeholders, causing a mismatch between the fingerprints stored in the checks table and those recorded in the summary metrics and per-row detailed results tables. + Variable substitution is only available when defining checks declaratively (as dictionaries or in files/tables). It is not supported when using DQX classes (e.g., `DQRowRule`) directly. 
@@ -755,20 +763,29 @@ Variable substitution is only available when defining checks declaratively (as d ```python - from databricks.labs.dqx.engine import DQEngine - from databricks.labs.dqx.config import FileChecksStorageConfig + from databricks.labs.dqx.engine import DQEngine, DQEngineCore + from databricks.labs.dqx.config import FileChecksStorageConfig, TableChecksStorageConfig from databricks.sdk import WorkspaceClient dq_engine = DQEngine(WorkspaceClient()) + variables = { + "min_temp": 0, + "max_temp": 100, + "region": "EMEA" + } # Load checks with variable resolution resolved_checks = dq_engine.load_checks( config=FileChecksStorageConfig(location="checks.yml"), - variables={ - "min_temp": 0, - "max_temp": 100, - "region": "EMEA" - } + variables=variables, + ) + + # Or resolve variables when saving checks (ensures fingerprints are consistent) + checks = DQEngineCore.load_checks_from_local_file("checks.yml") + dq_engine.save_checks( + checks=checks, + config=TableChecksStorageConfig(location="catalog.schema.checks_table"), + variables=variables, ) ``` @@ -776,7 +793,7 @@ Variable substitution is only available when defining checks declaratively (as d ## Default Variables -In addition to specifying variables during the load process, you can define engine-level defaults using the `ExtraParams` class. These constants are automatically applied to all checks unless explicitly overridden. +In addition to specifying variables during the load or save process, you can define engine-level defaults using the `ExtraParams` class. These constants are automatically applied to all checks unless explicitly overridden. For technical details and configuration examples, see [Default Variables](/docs/guide/additional_configuration#defining-default-variables-for-substitution) in the Additional Configuration guide. 
diff --git a/docs/dqx/docs/reference/engine.mdx b/docs/dqx/docs/reference/engine.mdx index 8866c2ce1..f4aced15c 100644 --- a/docs/dqx/docs/reference/engine.mdx +++ b/docs/dqx/docs/reference/engine.mdx @@ -63,7 +63,7 @@ The following table outlines the available methods of the `DQEngine` and their f | `get_invalid` | Retrieves records from the DataFrame that violate data quality checks (records with warnings and errors). | `df`: Input DataFrame. | Yes | | `get_valid` | Retrieves records from the DataFrame that pass all data quality checks. | `df`: Input DataFrame. | Yes | | `load_checks` | Loads quality rules (checks) from storage backend. Multiple storage backends are supported including tables, files, workspace files, or installation-managed sources inferred from run config. | `config`: Configuration for loading checks from a storage backend, e.g., `FileChecksStorageConfig` (local YAML/JSON file or workspace file), `WorkspaceFileChecksStorageConfig` (workspace file with absolute path), `VolumeFileChecksStorageConfig` (Unity Catalog Volume YAML/JSON), `TableChecksStorageConfig` (table), `InstallationChecksStorageConfig` (installation-managed backend using `checks_location` in run config); `variables`: (optional) dictionary of variables for [variable substitution](/docs/guide/quality_checks_definition/#variable-substitution). | Yes (only with `FileChecksStorageConfig`) | -| `save_checks` | Saves quality rules (checks) to a storage backend. Multiple storage backends are supported including tables, files, workspace files, or installation-managed targets inferred from run config. 
| `checks`: List of checks defined as dictionary; `config`: Configuration for saving checks in a storage backend, e.g., `FileChecksStorageConfig` (local YAML/JSON file or workspace file), `WorkspaceFileChecksStorageConfig` (workspace file with absolute path), `VolumeFileChecksStorageConfig` (Unity Catalog Volume YAML/JSON), `TableChecksStorageConfig` (table), `InstallationChecksStorageConfig` (installation-managed backend using `checks_location` in run config). | Yes (only with `FileChecksStorageConfig`) | +| `save_checks` | Saves quality rules (checks) to a storage backend. Multiple storage backends are supported including tables, files, workspace files, or installation-managed targets inferred from run config. Variables are resolved before computing fingerprints and persisting. | `checks`: List of checks defined as dictionary; `config`: Configuration for saving checks in a storage backend, e.g., `FileChecksStorageConfig` (local YAML/JSON file or workspace file), `WorkspaceFileChecksStorageConfig` (workspace file with absolute path), `VolumeFileChecksStorageConfig` (Unity Catalog Volume YAML/JSON), `TableChecksStorageConfig` (table), `InstallationChecksStorageConfig` (installation-managed backend using `checks_location` in run config); `variables`: (optional) dictionary of variables for [variable substitution](/docs/guide/quality_checks_definition/#variable-substitution). | Yes (only with `FileChecksStorageConfig`) | | `save_results_in_table` | Saves DataFrames as tables using Unity Catalog table references or storage paths. Supports both batch and streaming writes. For streaming DataFrames, returns a StreamingQuery that can be used to monitor or wait for completion. For batch DataFrames, data is written synchronously and None is returned. 
| `output_df`: (optional) DataFrame containing the output data (batch or streaming); `quarantine_df`: (optional) DataFrame containing invalid data (batch or streaming); `observation`: (optional) Spark Observation tracking summary metrics; `output_config`: `OutputConfig` with location (table name or storage path), mode, format, options, and optional trigger (supports `partition_by` or `cluster_by`, only one applies;); `quarantine_config`: (optional) `OutputConfig` with location (table name or storage path), mode, format, options, and optional trigger (supports `partition_by` or `cluster_by`, only one applies;); `metrics_config`: (optional) `OutputConfig` with location for summary metrics; `rule_set_fingerprint`: (optional) SHA-256 fingerprint of the rule set used for this run, included in summary metrics when metrics_config is provided; `run_config_name`: Name of the run config to use; `install_folder`: (optional) Installation folder where DQX is installed (only required for custom folder); `assume_user`: (optional) If True, assume user installation, otherwise global. | No | | `save_summary_metrics` | Saves quality checking summary metrics to a Delta table. | `observed_metrics`: `dict[str, Any]` Collected summary metrics from Spark Observation; `metrics_config`: `OutputConfig` object with the table name, output mode, and options for the summary metrics data; `input_config`: (optional) `InputConfig` object with the table name for reading the input data; `output_config`: (optional) `OutputConfig` object with the table name for the output data (supports `partition_by` or `cluster_by`, only one applies); `quarantine_config`: (optional) `OutputConfig` object with the table name for the quarantine data (supports `partition_by` or `cluster_by`, only one applies); `checks_location`: (optional) Location where checks are stored; `rule_set_fingerprint`: (optional) SHA-256 fingerprint of the rule set used for this run. 
| No | | `get_streaming_metrics_listener` | Gets a streaming metrics listener for writing metrics to an output table. Only required when using streaming DataFrames. | `metrics_config`: `OutputConfig` object with the table name, output mode, and options for the summary metrics data; `input_config`: (optional) `InputConfig` object with the table name for reading the input data; `output_config`: (optional) `OutputConfig` object with the table name for the output data (supports `partition_by` or `cluster_by`, only one applies); `quarantine_config`: (optional) `OutputConfig` object with the table name for the quarantine data (supports `partition_by` or `cluster_by`, only one applies); `checks_location`: (optional) checks location; `rule_set_fingerprint`: (optional) SHA-256 fingerprint of the rule set used for this run; `target_query_id`: (optional) Query ID of the specific streaming query to monitor, if provided, metrics will be collected only for this query. | No | diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index 3db11b1c1..47d3eaf07 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -1234,7 +1234,12 @@ def _merge_variables(self, per_call: dict[str, VariableValue] | None) -> dict[st return {**defaults, **per_call} @telemetry_logger("engine", "save_checks") - def save_checks(self, checks: list[dict], config: BaseChecksStorageConfig) -> None: + def save_checks( + self, + checks: list[dict], + config: BaseChecksStorageConfig, + variables: dict[str, VariableValue] | None = None, + ) -> None: """Persist DQ rules (checks) to the storage backend described by *config*. 
The appropriate storage handler is resolved from the configuration @@ -1250,9 +1255,16 @@ def save_checks(self, checks: list[dict], config: BaseChecksStorageConfig) -> No - *InstallationChecksStorageConfig* (installation directory); - *VolumeFileChecksStorageConfig* (Unity Catalog volume file); + Per-call *variables* are merged with engine-level defaults from + *ExtraParams.variables* (per-call values take precedence on conflict). + Variables are resolved before computing fingerprints and persisting, + ensuring that stored checks and their fingerprints are consistent. + Args: checks: List of DQ rules (checks) to save (as dictionaries). config: Configuration object describing the storage backend and write options. + variables: Optional mapping of placeholder names to replacement values. Replaces placeholders + in all string values of the check definitions before saving. Returns: None @@ -1260,8 +1272,10 @@ def save_checks(self, checks: list[dict], config: BaseChecksStorageConfig) -> No Raises: InvalidConfigError: If the configuration type is unsupported. 
""" + merged_variables = self._merge_variables(variables) + resolved_checks = resolve_variables(checks=checks, variables=merged_variables) handler = self._checks_handler_factory.create(config) - handler.save(checks, config) + handler.save(resolved_checks, config) @telemetry_logger("engine", "save_summary_metrics") def save_summary_metrics( diff --git a/tests/integration/test_save_and_load_checks_from_table.py b/tests/integration/test_save_and_load_checks_from_table.py index 06794226f..523f249b1 100644 --- a/tests/integration/test_save_and_load_checks_from_table.py +++ b/tests/integration/test_save_and_load_checks_from_table.py @@ -12,6 +12,7 @@ TableChecksStorageConfig, InstallationChecksStorageConfig, BaseChecksStorageConfig, + ExtraParams, ) from databricks.labs.dqx.engine import DQEngine from databricks.labs.dqx.errors import InvalidConfigError, UnsafeSqlQueryError @@ -677,3 +678,55 @@ def test_save_idempotency_overwrite_mode(ws, make_schema, make_random, spark): checks = engine.load_checks(config=TableChecksStorageConfig(location=table_name)) assert checks == EXPECTED_CHECKS_FROM_TABLE_LOAD[1:], "Idempotency guard must prevent duplicate overwrite" + + +def test_save_and_load_checks_from_table_with_variables(ws, make_schema, make_random, spark): + """Save checks with {{ }} placeholders resolved via engine-level + per-call variables, then load and apply.""" + catalog_name = TEST_CATALOG + schema_name = make_schema(catalog_name=catalog_name).name + table_name = f"{catalog_name}.{schema_name}.{make_random(10).lower()}" + + checks_with_placeholders = [ + { + "criticality": "error", + "name": "{{ col1 }}_null_check", + "check": { + "function": "is_not_null", + "arguments": {"column": "{{ col1 }}"}, + }, + }, + { + "criticality": "warn", + "name": "{{ col2 }}_not_empty_check", + "check": { + "function": "is_not_null_and_not_empty", + "arguments": {"column": "{{ col2 }}"}, + }, + "filter": "{{ filter_col }} IS NOT NULL", + }, + ] + + # Engine-level defaults; per-call 
override: crit "warn" -> "error" + extra_params = ExtraParams(variables={"crit": "warn", "col1": "a", "col2": "b", "filter_col": "a"}) + engine = DQEngine(ws, spark, extra_params=extra_params) + + config = TableChecksStorageConfig(location=table_name) + engine.save_checks(checks_with_placeholders, config=config, variables={"crit": "error"}) + + # Load — checks are already resolved, no variables needed + loaded = engine.load_checks(config=config) + + expected = [ + { + "name": "a_null_check", + "criticality": "error", + "check": {"function": "is_not_null", "arguments": {"column": "a"}}, + }, + { + "name": "b_not_empty_check", + "criticality": "warn", + "check": {"function": "is_not_null_and_not_empty", "arguments": {"column": "b"}}, + "filter": "a IS NOT NULL", + }, + ] + assert loaded == expected, "Variable substitution did not resolve correctly after table roundtrip." diff --git a/tests/unit/test_load_checks.py b/tests/unit/test_load_checks.py index 7c05a3bb1..22a5e111f 100644 --- a/tests/unit/test_load_checks.py +++ b/tests/unit/test_load_checks.py @@ -391,6 +391,95 @@ def test_load_checks_sql_query_with_variables(tmp_path, caplog): assert any("input_view" in msg for msg in caplog.messages) +def test_save_checks_with_variables(): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + raw_checks = [ + {"criticality": "{{ crit }}", "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}} + ] + + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory) + config = FileChecksStorageConfig(location="checks.yml") + + engine.save_checks(raw_checks, config, variables={"crit": "error", "col": "id"}) + + mock_handler.save.assert_called_once_with( + [{"criticality": "error", "check": {"function": "is_not_null", "arguments": 
{"column": "id"}}}], + config, + ) + + +def test_save_checks_variables_none(): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + raw_checks = [{"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "id"}}}] + + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory) + config = FileChecksStorageConfig(location="checks.yml") + + engine.save_checks(raw_checks, config, variables=None) + + mock_handler.save.assert_called_once_with(raw_checks, config) + + +def test_save_checks_with_engine_default_variables(): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + raw_checks = [ + {"criticality": "{{ crit }}", "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}} + ] + + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + + extra_params = ExtraParams(variables={"crit": "error", "col": "default_col"}) + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory, extra_params=extra_params) + config = FileChecksStorageConfig(location="checks.yml") + + engine.save_checks(raw_checks, config) + + mock_handler.save.assert_called_once_with( + [{"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "default_col"}}}], + config, + ) + + +def test_save_checks_per_call_overrides_engine_defaults(): + ws = create_autospec(WorkspaceClient) + mock_spark = create_autospec(SparkSession) + + raw_checks = [ + {"criticality": "{{ crit }}", "check": {"function": "is_not_null", "arguments": {"column": "{{ col }}"}}} + ] + + mock_factory = create_autospec(BaseChecksStorageHandlerFactory) + mock_handler = 
create_autospec(ChecksStorageHandler) + mock_factory.create.return_value = mock_handler + + extra_params = ExtraParams(variables={"crit": "warn", "col": "default_col"}) + engine = DQEngine(ws, spark=mock_spark, checks_handler_factory=mock_factory, extra_params=extra_params) + config = FileChecksStorageConfig(location="checks.yml") + + engine.save_checks(raw_checks, config, variables={"crit": "error"}) + + mock_handler.save.assert_called_once_with( + [{"criticality": "error", "check": {"function": "is_not_null", "arguments": {"column": "default_col"}}}], + config, + ) + + def test_load_checks_by_metadata_with_variables_name_and_filter(tmp_path): checks_yaml = """ From 55d4e1f7f1e99473c1425e4e14c3d00df203ebc0 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 14 Apr 2026 16:26:25 +0200 Subject: [PATCH 20/24] updated tests --- tests/integration/test_save_and_load_checks_from_table.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_save_and_load_checks_from_table.py b/tests/integration/test_save_and_load_checks_from_table.py index 523f249b1..d1b56b729 100644 --- a/tests/integration/test_save_and_load_checks_from_table.py +++ b/tests/integration/test_save_and_load_checks_from_table.py @@ -688,7 +688,7 @@ def test_save_and_load_checks_from_table_with_variables(ws, make_schema, make_ra checks_with_placeholders = [ { - "criticality": "error", + "criticality": "{{ crit }}", "name": "{{ col1 }}_null_check", "check": { "function": "is_not_null", @@ -730,3 +730,7 @@ def test_save_and_load_checks_from_table_with_variables(ws, make_schema, make_ra }, ] assert loaded == expected, "Variable substitution did not resolve correctly after table roundtrip." 
+ + # Verify the resolved checks are valid and can be applied end-to-end + assert not engine.validate_checks(loaded).has_errors + From 6b8687bb179dbe6a2414e145425d9b2483d98b26 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 14 Apr 2026 16:42:34 +0200 Subject: [PATCH 21/24] fix docs --- .../docs/guide/quality_checks_definition.mdx | 69 +++++++++---------- .../dqx/docs/guide/quality_checks_storage.mdx | 2 +- 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/docs/dqx/docs/guide/quality_checks_definition.mdx b/docs/dqx/docs/guide/quality_checks_definition.mdx index b585beb71..71861dcd7 100644 --- a/docs/dqx/docs/guide/quality_checks_definition.mdx +++ b/docs/dqx/docs/guide/quality_checks_definition.mdx @@ -748,48 +748,45 @@ When using `save_checks` with variables, placeholders are resolved **before** co Variable substitution is only available when defining checks declaratively (as dictionaries or in files/tables). It is not supported when using DQX classes (e.g., `DQRowRule`) directly. 
- - - ```yaml +```python +import yaml +from databricks.labs.dqx.engine import DQEngine +from databricks.labs.dqx.config import FileChecksStorageConfig, TableChecksStorageConfig +from databricks.sdk import WorkspaceClient + +dq_engine = DQEngine(WorkspaceClient()) + +# Define checks with variable placeholders +checks = yaml.safe_load(""" - criticality: error check: function: is_in_range arguments: column: temperature - min_limit: {{ min_temp }} - max_limit: {{ max_temp }} + min_limit: "{{ min_temp }}" + max_limit: "{{ max_temp }}" filter: "region = '{{ region }}'" - ``` - - - ```python - from databricks.labs.dqx.engine import DQEngine, DQEngineCore - from databricks.labs.dqx.config import FileChecksStorageConfig, TableChecksStorageConfig - from databricks.sdk import WorkspaceClient - - dq_engine = DQEngine(WorkspaceClient()) - variables = { - "min_temp": 0, - "max_temp": 100, - "region": "EMEA" - } - - # Load checks with variable resolution - resolved_checks = dq_engine.load_checks( - config=FileChecksStorageConfig(location="checks.yml"), - variables=variables, - ) - - # Or resolve variables when saving checks (ensures fingerprints are consistent) - checks = DQEngineCore.load_checks_from_local_file("checks.yml") - dq_engine.save_checks( - checks=checks, - config=TableChecksStorageConfig(location="catalog.schema.checks_table"), - variables=variables, - ) - ``` - - +""") + +variables = { + "min_temp": 0, + "max_temp": 100, + "region": "EMEA", +} + +# Load checks from file with variable resolution +resolved_checks = dq_engine.load_checks( + config=FileChecksStorageConfig(location="checks.yml"), + variables=variables, +) + +# Or resolve variables when saving checks (ensures fingerprints are consistent) +dq_engine.save_checks( + checks=checks, + config=TableChecksStorageConfig(location="catalog.schema.checks_table"), + variables=variables, +) +``` ## Default Variables diff --git a/docs/dqx/docs/guide/quality_checks_storage.mdx 
b/docs/dqx/docs/guide/quality_checks_storage.mdx index a1ca120cb..eea90da0e 100644 --- a/docs/dqx/docs/guide/quality_checks_storage.mdx +++ b/docs/dqx/docs/guide/quality_checks_storage.mdx @@ -214,7 +214,7 @@ If you create checks as a list of DQRule objects, you can convert them using the # validate loaded checks assert not dq_engine.validate_checks(checks).has_errors - + ``` When using the quality checker or e2e workflows to apply quality checks, they load checks from the `checks_location` field defined in the [configuration file](/docs/installation/#configuration-file). From aa4883cc3188beba943d081342c12e97bd86b0c8 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 14 Apr 2026 17:22:41 +0200 Subject: [PATCH 22/24] fixed ci --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b5e1b2c17..7dd161dfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -149,7 +149,7 @@ yq = [ [tool.uv] required-version = "~=0.11.0" exclude-newer = "7 days" -exclude-newer-package = { "databricks-sdk" = false, "databricks-connect" = false } +exclude-newer-package = { "databricks-sdk" = false, "databricks-connect" = false, "setuptools" = false } [tool.pydoc-markdown] loaders = [ From b6fdb8cb010bfa211ba7353d7c33a638142d3910 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 14 Apr 2026 17:29:53 +0200 Subject: [PATCH 23/24] fix CI --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7dd161dfd..e1557ed34 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -149,7 +149,7 @@ yq = [ [tool.uv] required-version = "~=0.11.0" exclude-newer = "7 days" -exclude-newer-package = { "databricks-sdk" = false, "databricks-connect" = false, "setuptools" = false } +exclude-newer-package = { "databricks-sdk" = false, "databricks-connect" = false, "setuptools" = false, "hatchling" = false, "hatch-fancy-pypi-readme" = false } [tool.pydoc-markdown] loaders = 
[ From 68112581ed7f7a0ea31dd7febaecc6cacb40dfb2 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Wed, 22 Apr 2026 16:38:05 +0200 Subject: [PATCH 24/24] fmt --- tests/integration/test_save_and_load_checks_from_table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_save_and_load_checks_from_table.py b/tests/integration/test_save_and_load_checks_from_table.py index 257019cae..a506902fc 100644 --- a/tests/integration/test_save_and_load_checks_from_table.py +++ b/tests/integration/test_save_and_load_checks_from_table.py @@ -733,4 +733,3 @@ def test_save_and_load_checks_from_table_with_variables(ws, make_schema, make_ra # Verify the resolved checks are valid and can be applied end-to-end assert not engine.validate_checks(loaded).has_errors -