diff --git a/.github/scripts/fork-sync-pr.sh b/.github/scripts/fork-sync-pr.sh index 305edf3b1..134b7076d 100755 --- a/.github/scripts/fork-sync-pr.sh +++ b/.github/scripts/fork-sync-pr.sh @@ -88,6 +88,8 @@ if [ -z "$EXISTING" ]; then --base "$BASE_BRANCH" \ --head "$SYNC_BRANCH" \ --title "$TEST_PR_TITLE" \ + --label "do-not-merge" \ + --label "fork-test" \ --body "Automated sync from fork PR for CI testing. Original PR: ${PR_URL} diff --git a/src/databricks/labs/dqx/profiler/profiler.py b/src/databricks/labs/dqx/profiler/profiler.py index fce207bf9..d4380fbb8 100644 --- a/src/databricks/labs/dqx/profiler/profiler.py +++ b/src/databricks/labs/dqx/profiler/profiler.py @@ -391,10 +391,15 @@ def _profile( if is_text and trim_strings: column_df = column_df.select(F.trim(F.col(column_label)).alias(column_label)) - count_non_null = column_df.count() + aggr_stats = column_df.agg( + F.count(column_label).alias("cnt"), + F.countDistinct(column_label).alias("cnt_distinct"), + ).first() + count_non_null = aggr_stats[0] if aggr_stats else 0 metrics["count"] = total_count metrics["count_null"] = total_count - count_non_null metrics["count_non_null"] = count_non_null + metrics["count_distinct"] = aggr_stats[1] if aggr_stats else 0 if is_text: metrics["empty_count"] = column_df.filter(F.col(column_label) == "").count() else: @@ -539,7 +544,9 @@ def _process_metric(self, metric_name: str, metric_value: Any, metric: str, sm_d """ typ = field_types[metric_name] if metric_value is not None: - if metric in {"stddev", "mean"}: + if isinstance(typ, TEXT_TYPES) and metric in {"min", "max"}: + sm_dict[metric_name][metric] = None + elif metric in {"stddev", "mean"}: sm_dict[metric_name][metric] = float(metric_value) else: sm_dict[metric_name][metric] = self._do_cast(metric_value, typ) diff --git a/tests/integration/test_app_backend.py b/tests/integration/test_app_backend.py index 820a83bc1..1be5354dc 100644 --- a/tests/integration/test_app_backend.py +++ 
b/tests/integration/test_app_backend.py @@ -209,11 +209,14 @@ def test_save_settings_strips_whitespace_from_path(self, settings_manager, insta def test_save_settings_with_nested_folder_structure(self, ws, settings_manager, installation_folder, make_random): """Should handle nested folder structures correctly.""" user_home = f"/Users/{ws.current_user.me().user_name}" - nested_folder = f"{user_home}/test/nested/dqx_{make_random(8)}" + # Randomize the whole parent chain so parallel workers / concurrent CI jobs + # cannot collide on a shared prefix and wipe each other's folders on cleanup. + test_root = f"{user_home}/dqx_nested_{make_random(8)}" + nested_folder = f"{test_root}/test/nested/dqx_{make_random(8)}" - # Register both the nested folder and its parent for cleanup + # Register both the nested folder and its randomized parent for cleanup installation_folder(nested_folder) - installation_folder(f"{user_home}/test") + installation_folder(test_root) # Save settings with nested path settings_to_save = InstallationSettings(install_folder=nested_folder) diff --git a/tests/integration/test_profiler.py b/tests/integration/test_profiler.py index 9a05176fa..cb0ab2a56 100644 --- a/tests/integration/test_profiler.py +++ b/tests/integration/test_profiler.py @@ -2077,3 +2077,43 @@ def test_profiler_detect_pk_from_path_with_llm(ws, spark, make_schema, make_rand assert result["primary_key_columns"] == ["id"] assert result["confidence"] assert result["reasoning"] + + +def test_profiler_string_columns_have_null_min_max(spark, ws): + schema = T.StructType( + [ + T.StructField("name", T.StringType()), + T.StructField("age", T.IntegerType()), + ] + ) + input_df = spark.createDataFrame( + [["Alice", 30], ["Bob", 25], ["Charlie", 35]], + schema=schema, + ) + + profiler = DQProfiler(ws) + stats, _ = profiler.profile(input_df, options={"sample_fraction": None, "llm_primary_key_detection": False}) + + assert stats["name"]["min"] is None + assert stats["name"]["max"] is None + assert 
stats["age"]["min"] is not None  # "age" is an IntegerType column, so min/max are expected to stay populated
+    assert stats["age"]["max"] is not None
+
+
+def test_profiler_count_distinct_computed(spark, ws):
+    schema = T.StructType(
+        [
+            T.StructField("color", T.StringType()),
+            T.StructField("value", T.IntegerType()),
+        ]
+    )
+    input_df = spark.createDataFrame(
+        [["red", 1], ["blue", 2], ["red", 3], ["blue", 1]],  # 4 rows: 2 distinct colors, 3 distinct values
+        schema=schema,
+    )
+
+    profiler = DQProfiler(ws)
+    stats, _ = profiler.profile(input_df, options={"sample_fraction": None, "llm_primary_key_detection": False})
+
+    assert stats["color"]["count_distinct"] == 2  # "red" and "blue"
+    assert stats["value"]["count_distinct"] == 3  # 1, 2 and 3 (1 occurs twice)