feat: Improved summary stats report for string datatype columns (#1104)

Roshan1299 · mwojtyczka · web-flow · commit ed9a28bdd405 · 2026-04-23T19:44:06.000+02:00
## Changes

- Nulled out `min` and `max` metrics for string columns in summary stats, since lexicographic min/max values are not meaningful for text data
- Added `count_distinct` metric for all column types

### Linked issues

Resolves #670

### Tests

- [ ] manually tested
- [ ] added unit tests
- [x] added integration tests
- [ ] added end-to-end tests
- [ ] added performance tests

Both changes are in private methods (`_process_metric`, `_profile`) that are only testable through the public `profile()` API, which requires a live SparkSession. The existing integration tests in `test_profiler.py` exercise these code paths and will validate in CI. Happy to add a dedicated integration test if needed.

- [x] Formatting passes (`make fmt` — pylint 10/10, mypy clean)
- [x] Unit tests pass (`make test` — 939 passed)

---------

Signed-off-by: Roshan1299 <banisettirosh@gmail.com>
Co-authored-by: Marcin Wojtyczka <marcin.wojtyczka@databricks.com>
diff --git a/.github/scripts/fork-sync-pr.sh b/.github/scripts/fork-sync-pr.sh
@@ -88,6 +88,8 @@ if [ -z "$EXISTING" ]; then
     --base "$BASE_BRANCH" \
     --head "$SYNC_BRANCH" \
     --title "$TEST_PR_TITLE" \
+    --label "do-not-merge" \
+    --label "fork-test" \
     --body "Automated sync from fork PR for CI testing.
 
 Original PR: ${PR_URL}
diff --git a/src/databricks/labs/dqx/profiler/profiler.py b/src/databricks/labs/dqx/profiler/profiler.py
@@ -391,10 +391,15 @@ def _profile(
             if is_text and trim_strings:
                 column_df = column_df.select(F.trim(F.col(column_label)).alias(column_label))
 
-            count_non_null = column_df.count()
+            aggr_stats = column_df.agg(
+                F.count(column_label).alias("cnt"),
+                F.countDistinct(column_label).alias("cnt_distinct"),
+            ).first()
+            count_non_null = aggr_stats[0] if aggr_stats else 0
             metrics["count"] = total_count
             metrics["count_null"] = total_count - count_non_null
             metrics["count_non_null"] = count_non_null
+            metrics["count_distinct"] = aggr_stats[1] if aggr_stats else 0
             if is_text:
                 metrics["empty_count"] = column_df.filter(F.col(column_label) == "").count()
             else:
@@ -539,7 +544,9 @@ def _process_metric(self, metric_name: str, metric_value: Any, metric: str, sm_d
         """
         typ = field_types[metric_name]
         if metric_value is not None:
-            if metric in {"stddev", "mean"}:
+            if isinstance(typ, TEXT_TYPES) and metric in {"min", "max"}:
+                sm_dict[metric_name][metric] = None
+            elif metric in {"stddev", "mean"}:
                 sm_dict[metric_name][metric] = float(metric_value)
             else:
                 sm_dict[metric_name][metric] = self._do_cast(metric_value, typ)
diff --git a/tests/integration/test_app_backend.py b/tests/integration/test_app_backend.py
@@ -209,11 +209,14 @@ def test_save_settings_strips_whitespace_from_path(self, settings_manager, insta
     def test_save_settings_with_nested_folder_structure(self, ws, settings_manager, installation_folder, make_random):
         """Should handle nested folder structures correctly."""
         user_home = f"/Users/{ws.current_user.me().user_name}"
-        nested_folder = f"{user_home}/test/nested/dqx_{make_random(8)}"
+        # Randomize the whole parent chain so parallel workers / concurrent CI jobs
+        # cannot collide on a shared prefix and wipe each other's folders on cleanup.
+        test_root = f"{user_home}/dqx_nested_{make_random(8)}"
+        nested_folder = f"{test_root}/test/nested/dqx_{make_random(8)}"
 
-        # Register both the nested folder and its parent for cleanup
+        # Register both the nested folder and its randomized parent for cleanup
         installation_folder(nested_folder)
-        installation_folder(f"{user_home}/test")
+        installation_folder(test_root)
 
         # Save settings with nested path
         settings_to_save = InstallationSettings(install_folder=nested_folder)
diff --git a/tests/integration/test_profiler.py b/tests/integration/test_profiler.py
@@ -2077,3 +2077,43 @@ def test_profiler_detect_pk_from_path_with_llm(ws, spark, make_schema, make_rand
     assert result["primary_key_columns"] == ["id"]
     assert result["confidence"]
     assert result["reasoning"]
+
+
+def test_profiler_string_columns_have_null_min_max(spark, ws):  # min/max must be None for text columns only
+    schema = T.StructType(
+        [
+            T.StructField("name", T.StringType()),  # text column: min/max expected to be None
+            T.StructField("age", T.IntegerType()),  # numeric column: min/max expected to be populated
+        ]
+    )
+    input_df = spark.createDataFrame(
+        [["Alice", 30], ["Bob", 25], ["Charlie", 35]],  # no nulls, so every metric has data to work with
+        schema=schema,
+    )
+
+    profiler = DQProfiler(ws)
+    stats, _ = profiler.profile(input_df, options={"sample_fraction": None, "llm_primary_key_detection": False})
+
+    assert stats["name"]["min"] is None  # string column: value is nulled out
+    assert stats["name"]["max"] is None  # string column: value is nulled out
+    assert stats["age"]["min"] is not None  # integer column: value is still reported
+    assert stats["age"]["max"] is not None  # integer column: value is still reported
+
+
+def test_profiler_count_distinct_computed(spark, ws):  # count_distinct is reported for text and numeric columns
+    schema = T.StructType(
+        [
+            T.StructField("color", T.StringType()),
+            T.StructField("value", T.IntegerType()),
+        ]
+    )
+    input_df = spark.createDataFrame(
+        [["red", 1], ["blue", 2], ["red", 3], ["blue", 1]],  # color: 2 distinct values; value: 3 distinct values
+        schema=schema,
+    )
+
+    profiler = DQProfiler(ws)
+    stats, _ = profiler.profile(input_df, options={"sample_fraction": None, "llm_primary_key_detection": False})
+
+    assert stats["color"]["count_distinct"] == 2  # {"red", "blue"}
+    assert stats["value"]["count_distinct"] == 3  # {1, 2, 3}