Skip to content

Commit ad5a4d9

Browse files
committed
Implement native profiler to replace pydeequ
1 parent 474a6b4 commit ad5a4d9

13 files changed

Lines changed: 1383 additions & 429 deletions

src/tablespec/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
load_umf_from_yaml,
3838
save_umf_to_yaml,
3939
)
40-
from tablespec.profiling import ColumnProfile, DataFrameProfile, DeequToUmfMapper
40+
from tablespec.profiling import ColumnProfile, DataFrameProfile, ProfileToGxMapper
4141
from tablespec.prompts import (
4242
generate_column_validation_prompt,
4343
generate_documentation_prompt,
@@ -117,7 +117,7 @@
117117
# -- Profiling --
118118
"ColumnProfile",
119119
"DataFrameProfile",
120-
"DeequToUmfMapper",
120+
"ProfileToGxMapper",
121121
# -- LLM Prompt Generation --
122122
"generate_column_validation_prompt",
123123
"generate_documentation_prompt",

src/tablespec/cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
from pathlib import Path
2222

2323
from pydantic import ValidationError
24+
import os
25+
2426
from rich.console import Console
2527
from rich.table import Table as RichTable
2628
import typer
@@ -48,7 +50,7 @@
4850
name="tablespec",
4951
help="Work with UMF (Universal Metadata Format) table schemas",
5052
)
51-
console = Console()
53+
console = Console(no_color=bool(os.environ.get("NO_COLOR")))
5254

5355
# Module-level validation context (process lifetime caching) - only when validator is available
5456
_validation_context = ValidationContext() if _HAS_VALIDATOR else None

src/tablespec/gx_baseline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ def generate_baseline_column_expectations(
340340
},
341341
}
342342
)
343-
# If nullable is True (bool from DeequToUmfMapper), column IS nullable — no not-null expectation needed
343+
# If nullable is True (bool), column IS nullable — no not-null expectation needed
344344

345345
# 2. Length constraints
346346
max_length = column.get("max_length") or column.get("length")
src/tablespec/profiling/__init__.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,47 @@
11
"""Schema profiling and mapping utilities for tablespec.
22
3-
This module provides tools for mapping profiling data and Spark schemas to UMF format.
3+
This module provides tools for profiling DataFrames and mapping results to UMF format.
44
5-
Spark-dependent components (SparkToUmfMapper) require installing tablespec[spark]:
5+
Components:
6+
- ``NativeSparkProfiler``: Profiles DataFrames using native SQL (serverless-compatible).
7+
- ``ProfileToGxMapper``: Generates GX expectations directly from profiling results.
8+
- ``SparkToUmfMapper``: Maps Spark DataFrame schema to UMF (requires pyspark).
9+
10+
Type mappings:
11+
- ``SPARK_TO_UMF_TYPE``: Spark DataType class name → UMF data_type string.
12+
- ``SQL_TO_UMF_TYPE``: SQL/warehouse type name → UMF data_type string (for dbt, etc.).
13+
14+
Spark-dependent components require installing tablespec[spark]:
615
pip install tablespec[spark]
716
"""
817

9-
from tablespec.profiling.deequ_mapper import DeequToUmfMapper
18+
from tablespec.profiling.gx_expectation_builder import ProfileToGxMapper
1019
from tablespec.profiling.types import ColumnProfile, DataFrameProfile
1120

1221
__all__ = [
1322
"ColumnProfile",
1423
"DataFrameProfile",
15-
"DeequToUmfMapper",
24+
"ProfileToGxMapper",
1625
]
1726

18-
# SparkToUmfMapper is available only if pyspark is installed
27+
# SparkToUmfMapper and type mapping dicts are available only if pyspark is installed
1928
try:
20-
from tablespec.profiling.spark_mapper import SparkToUmfMapper # noqa: F401
29+
from tablespec.profiling.spark_mapper import ( # noqa: F401
30+
SPARK_TO_UMF_TYPE,
31+
SQL_TO_UMF_TYPE,
32+
SparkToUmfMapper,
33+
)
2134

22-
__all__.append("SparkToUmfMapper")
35+
__all__.extend(["SparkToUmfMapper", "SPARK_TO_UMF_TYPE", "SQL_TO_UMF_TYPE"])
2336
except ImportError:
2437
# pyspark not available - SparkToUmfMapper and the type mapping dicts won't be exported
2538
pass
39+
40+
# NativeSparkProfiler requires only pyspark (works on Connect/serverless)
41+
try:
42+
from tablespec.profiling.native_profiler import NativeSparkProfiler # noqa: F401
43+
44+
__all__.append("NativeSparkProfiler")
45+
except ImportError:
46+
# pyspark not available - NativeSparkProfiler won't be exported
47+
pass

src/tablespec/profiling/deequ_mapper.py

Lines changed: 0 additions & 133 deletions
This file was deleted.

0 commit comments

Comments
 (0)