Skip to content

Commit ed9a28b

Browse files
feat: Improved summary stats report for string datatype columns (#1104)
## Changes
<!-- Summary of your changes that are easy to understand. Add screenshots when necessary -->
- Nulled out `min` and `max` metrics for string columns in summary stats, since lexicographic min/max values are not meaningful for text data
- Added `count_distinct` metric for all column types

### Linked issues
<!-- DOC: Link issue with a keyword: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved. See https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword -->

Resolves #670

### Tests
<!-- How is this tested? Please see the checklist below and also describe any other relevant tests -->

- [ ] manually tested
- [ ] added unit tests
- [x] added integration tests
- [ ] added end-to-end tests
- [ ] added performance tests

Both changes are in private methods (`_process_metric`, `_profile`) that are only testable through the public `profile()` API, which requires a live SparkSession. The existing integration tests in `test_profiler.py` exercise these code paths and will validate in CI. Happy to add a dedicated integration test if needed.

- [x] Formatting passes (`make fmt` — pylint 10/10, mypy clean)
- [x] Unit tests pass (`make test` — 939 passed)

---------

Signed-off-by: Roshan1299 <banisettirosh@gmail.com>
Co-authored-by: Marcin Wojtyczka <marcin.wojtyczka@databricks.com>
1 parent 6ae5f91 commit ed9a28b

4 files changed

Lines changed: 57 additions & 5 deletions

File tree

.github/scripts/fork-sync-pr.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ if [ -z "$EXISTING" ]; then
8888
--base "$BASE_BRANCH" \
8989
--head "$SYNC_BRANCH" \
9090
--title "$TEST_PR_TITLE" \
91+
--label "do-not-merge" \
92+
--label "fork-test" \
9193
--body "Automated sync from fork PR for CI testing.
9294
9395
Original PR: ${PR_URL}

src/databricks/labs/dqx/profiler/profiler.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,10 +391,15 @@ def _profile(
391391
if is_text and trim_strings:
392392
column_df = column_df.select(F.trim(F.col(column_label)).alias(column_label))
393393

394-
count_non_null = column_df.count()
394+
aggr_stats = column_df.agg(
395+
F.count(column_label).alias("cnt"),
396+
F.countDistinct(column_label).alias("cnt_distinct"),
397+
).first()
398+
count_non_null = aggr_stats[0] if aggr_stats else 0
395399
metrics["count"] = total_count
396400
metrics["count_null"] = total_count - count_non_null
397401
metrics["count_non_null"] = count_non_null
402+
metrics["count_distinct"] = aggr_stats[1] if aggr_stats else 0
398403
if is_text:
399404
metrics["empty_count"] = column_df.filter(F.col(column_label) == "").count()
400405
else:
@@ -539,7 +544,9 @@ def _process_metric(self, metric_name: str, metric_value: Any, metric: str, sm_d
539544
"""
540545
typ = field_types[metric_name]
541546
if metric_value is not None:
542-
if metric in {"stddev", "mean"}:
547+
if isinstance(typ, TEXT_TYPES) and metric in {"min", "max"}:
548+
sm_dict[metric_name][metric] = None
549+
elif metric in {"stddev", "mean"}:
543550
sm_dict[metric_name][metric] = float(metric_value)
544551
else:
545552
sm_dict[metric_name][metric] = self._do_cast(metric_value, typ)

tests/integration/test_app_backend.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,14 @@ def test_save_settings_strips_whitespace_from_path(self, settings_manager, insta
209209
def test_save_settings_with_nested_folder_structure(self, ws, settings_manager, installation_folder, make_random):
210210
"""Should handle nested folder structures correctly."""
211211
user_home = f"/Users/{ws.current_user.me().user_name}"
212-
nested_folder = f"{user_home}/test/nested/dqx_{make_random(8)}"
212+
# Randomize the whole parent chain so parallel workers / concurrent CI jobs
213+
# cannot collide on a shared prefix and wipe each other's folders on cleanup.
214+
test_root = f"{user_home}/dqx_nested_{make_random(8)}"
215+
nested_folder = f"{test_root}/test/nested/dqx_{make_random(8)}"
213216

214-
# Register both the nested folder and its parent for cleanup
217+
# Register both the nested folder and its randomized parent for cleanup
215218
installation_folder(nested_folder)
216-
installation_folder(f"{user_home}/test")
219+
installation_folder(test_root)
217220

218221
# Save settings with nested path
219222
settings_to_save = InstallationSettings(install_folder=nested_folder)

tests/integration/test_profiler.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2077,3 +2077,43 @@ def test_profiler_detect_pk_from_path_with_llm(ws, spark, make_schema, make_rand
20772077
assert result["primary_key_columns"] == ["id"]
20782078
assert result["confidence"]
20792079
assert result["reasoning"]
2080+
2081+
2082+
def test_profiler_string_columns_have_null_min_max(spark, ws):
    """Check that string columns get ``None`` for ``min``/``max`` while integer columns do not.

    Profiles a three-row DataFrame with a string column ``name`` and an
    integer column ``age``, passing ``sample_fraction=None`` and
    ``llm_primary_key_detection=False`` as profiler options.
    """
    fields = [
        T.StructField("name", T.StringType()),
        T.StructField("age", T.IntegerType()),
    ]
    rows = [["Alice", 30], ["Bob", 25], ["Charlie", 35]]
    people_df = spark.createDataFrame(rows, schema=T.StructType(fields))

    profile_options = {"sample_fraction": None, "llm_primary_key_detection": False}
    summary_stats, _ = DQProfiler(ws).profile(people_df, options=profile_options)

    # Text column: both bounds are expected to be nulled out.
    for bound in ("min", "max"):
        assert summary_stats["name"][bound] is None

    # Integer column: both bounds must still be populated.
    for bound in ("min", "max"):
        assert summary_stats["age"][bound] is not None
2101+
2102+
2103+
def test_profiler_count_distinct_computed(spark, ws):
    """Check the ``count_distinct`` metric for a string and an integer column.

    The four input rows hold two different ``color`` values and three
    different ``value`` values; the profiler is run with
    ``sample_fraction=None`` and ``llm_primary_key_detection=False``.
    """
    fields = [
        T.StructField("color", T.StringType()),
        T.StructField("value", T.IntegerType()),
    ]
    rows = [["red", 1], ["blue", 2], ["red", 3], ["blue", 1]]
    colors_df = spark.createDataFrame(rows, schema=T.StructType(fields))

    profile_options = {"sample_fraction": None, "llm_primary_key_detection": False}
    summary_stats, _ = DQProfiler(ws).profile(colors_df, options=profile_options)

    # Expected number of distinct values per column, checked in column order.
    expected_distinct = {"color": 2, "value": 3}
    for column_name, distinct_total in expected_distinct.items():
        assert summary_stats[column_name]["count_distinct"] == distinct_total

0 commit comments

Comments
 (0)