Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/scripts/fork-sync-pr.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ if [ -z "$EXISTING" ]; then
--base "$BASE_BRANCH" \
--head "$SYNC_BRANCH" \
--title "$TEST_PR_TITLE" \
--label "do-not-merge" \
--label "fork-test" \
--body "Automated sync from fork PR for CI testing.

Original PR: ${PR_URL}
Expand Down
11 changes: 9 additions & 2 deletions src/databricks/labs/dqx/profiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,10 +391,15 @@ def _profile(
if is_text and trim_strings:
column_df = column_df.select(F.trim(F.col(column_label)).alias(column_label))

count_non_null = column_df.count()
aggr_stats = column_df.agg(
F.count(column_label).alias("cnt"),
F.countDistinct(column_label).alias("cnt_distinct"),
).first()
count_non_null = aggr_stats[0] if aggr_stats else 0
metrics["count"] = total_count
metrics["count_null"] = total_count - count_non_null
metrics["count_non_null"] = count_non_null
metrics["count_distinct"] = aggr_stats[1] if aggr_stats else 0
if is_text:
metrics["empty_count"] = column_df.filter(F.col(column_label) == "").count()
else:
Expand Down Expand Up @@ -539,7 +544,9 @@ def _process_metric(self, metric_name: str, metric_value: Any, metric: str, sm_d
"""
typ = field_types[metric_name]
if metric_value is not None:
if metric in {"stddev", "mean"}:
if isinstance(typ, TEXT_TYPES) and metric in {"min", "max"}:
sm_dict[metric_name][metric] = None
elif metric in {"stddev", "mean"}:
sm_dict[metric_name][metric] = float(metric_value)
else:
sm_dict[metric_name][metric] = self._do_cast(metric_value, typ)
Expand Down
9 changes: 6 additions & 3 deletions tests/integration/test_app_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,14 @@ def test_save_settings_strips_whitespace_from_path(self, settings_manager, insta
def test_save_settings_with_nested_folder_structure(self, ws, settings_manager, installation_folder, make_random):
"""Should handle nested folder structures correctly."""
user_home = f"/Users/{ws.current_user.me().user_name}"
nested_folder = f"{user_home}/test/nested/dqx_{make_random(8)}"
# Randomize the whole parent chain so parallel workers / concurrent CI jobs
# cannot collide on a shared prefix and wipe each other's folders on cleanup.
test_root = f"{user_home}/dqx_nested_{make_random(8)}"
nested_folder = f"{test_root}/test/nested/dqx_{make_random(8)}"

# Register both the nested folder and its parent for cleanup
# Register both the nested folder and its randomized parent for cleanup
installation_folder(nested_folder)
installation_folder(f"{user_home}/test")
installation_folder(test_root)

# Save settings with nested path
settings_to_save = InstallationSettings(install_folder=nested_folder)
Expand Down
40 changes: 40 additions & 0 deletions tests/integration/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2077,3 +2077,43 @@ def test_profiler_detect_pk_from_path_with_llm(ws, spark, make_schema, make_rand
assert result["primary_key_columns"] == ["id"]
assert result["confidence"]
assert result["reasoning"]


def test_profiler_string_columns_have_null_min_max(spark, ws):
    """String columns must report min/max as None, while numeric columns keep real values."""
    test_schema = T.StructType(
        [
            T.StructField("name", T.StringType()),
            T.StructField("age", T.IntegerType()),
        ]
    )
    df = spark.createDataFrame(
        [["Alice", 30], ["Bob", 25], ["Charlie", 35]],
        schema=test_schema,
    )

    # Full scan (no sampling) and no LLM PK detection keeps the run deterministic.
    stats, _ = DQProfiler(ws).profile(df, options={"sample_fraction": None, "llm_primary_key_detection": False})

    name_stats = stats["name"]
    age_stats = stats["age"]
    assert name_stats["min"] is None
    assert name_stats["max"] is None
    assert age_stats["min"] is not None
    assert age_stats["max"] is not None


def test_profiler_count_distinct_computed(spark, ws):
    """Profiler stats should include the number of distinct values per column."""
    rows = [["red", 1], ["blue", 2], ["red", 3], ["blue", 1]]
    df = spark.createDataFrame(
        rows,
        schema=T.StructType(
            [
                T.StructField("color", T.StringType()),
                T.StructField("value", T.IntegerType()),
            ]
        ),
    )

    # Full scan (no sampling) and no LLM PK detection keeps the run deterministic.
    stats, _ = DQProfiler(ws).profile(df, options={"sample_fraction": None, "llm_primary_key_detection": False})

    # color: {red, blue} -> 2 distinct; value: {1, 2, 3} -> 3 distinct.
    assert stats["color"]["count_distinct"] == 2
    assert stats["value"]["count_distinct"] == 3
Loading