Skip to content

Commit 13f4c14

Browse files
authored
Merge pull request #409 from posit-dev/feat-dq-dims-health-scoring
feat: add data quality dimensions and health scoring
2 parents 57b2153 + 7330090 commit 13f4c14

22 files changed

Lines changed: 2565 additions & 2 deletions

File tree

great-docs.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,11 @@ reference:
271271
- Validate.all_passed
272272
- Validate.assert_passing
273273
- Validate.assert_below_threshold
274+
- Validate.assert_dimension_scores
274275
- Validate.above_threshold
276+
- Validate.get_dimension_scores
277+
- Validate.get_health_score
278+
- Validate.get_scorecard
275279
- Validate.n
276280
- Validate.n_passed
277281
- Validate.n_failed

pointblank/_constants.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,89 @@
6262
"col_sum_eq": "sum_eq",
6363
}
6464

65+
# The canonical set of data quality dimension names used for health scoring. These are the
66+
# categories that each validation step is tagged with (either inferred automatically from the
67+
# `assertion_type` or set explicitly via the `dimension=` parameter on a validation method).
# NOTE(review): the `"unknown"` fallback has entries in `DIMENSION_ABBR` and `DIMENSION_COLORS`
# but is not listed here -- presumably because it is a fallback rather than a scored dimension;
# confirm.
68+
DIMENSION_NAMES: list[str] = [
69+
"completeness",
70+
"consistency",
71+
"validity",
72+
"uniqueness",
73+
"timeliness",
74+
"volume",
75+
]
76+
77+
# Default mapping of `assertion_type` to a data quality dimension. This enables automatic
78+
# dimension inference for every validation step with no user effort. Users can remap globally
79+
# via `pb.config(dimension_map=...)` or override per step via a validation method's `dimension=`
80+
# parameter. Any assertion type not present here falls back to `"unknown"`.
# Every value used below is one of the six names in `DIMENSION_NAMES`.
# NOTE(review): `col_sum_eq` is the only aggregate-style check mapped here. If sibling aggregate
# assertion types exist elsewhere in this module (e.g., other `col_sum_*` checks), they would
# resolve to `"unknown"` -- confirm that is intended.
81+
ASSERTION_TYPE_TO_DIMENSION: dict[str, str] = {
82+
# Completeness: presence/absence of values
83+
"col_vals_null": "completeness",
84+
"col_vals_not_null": "completeness",
85+
"col_pct_null": "completeness",
86+
"col_pct_missing": "completeness",
87+
"col_missing_coded": "completeness",
88+
"col_missing_only_coded": "completeness",
89+
"rows_complete": "completeness",
90+
# Consistency: internal agreement across columns/rows/tables
91+
"col_missing_consistent": "consistency",
92+
"conjointly": "consistency",
93+
"tbl_match": "consistency",
94+
# Validity: values conform to expected rules, ranges, formats, or schema
95+
"col_vals_gt": "validity",
96+
"col_vals_lt": "validity",
97+
"col_vals_eq": "validity",
98+
"col_vals_ne": "validity",
99+
"col_vals_ge": "validity",
100+
"col_vals_le": "validity",
101+
"col_vals_between": "validity",
102+
"col_vals_outside": "validity",
103+
"col_vals_in_set": "validity",
104+
"col_vals_not_in_set": "validity",
105+
"col_vals_regex": "validity",
106+
"col_vals_within_spec": "validity",
107+
"col_vals_increasing": "validity",
108+
"col_vals_decreasing": "validity",
109+
"col_vals_expr": "validity",
110+
"col_exists": "validity",
111+
"col_schema_match": "validity",
112+
"col_sum_eq": "validity",
113+
"prompt": "validity",
114+
"specially": "validity",
115+
# Uniqueness: absence of duplicate rows
116+
"rows_distinct": "uniqueness",
117+
# Timeliness: data recency/freshness
118+
"data_freshness": "timeliness",
119+
# Volume: expected row/column counts
120+
"row_count_match": "volume",
121+
"col_count_match": "volume",
122+
}
123+
124+
# Two-letter abbreviations used for the compact, color-coded dimension badge shown in the corner of
125+
# the step-number cell in the validation report.
# Keys are the six entries of `DIMENSION_NAMES` plus the `"unknown"` fallback, whose badge text
# is `"??"`.
126+
DIMENSION_ABBR: dict[str, str] = {
127+
"completeness": "CM",
128+
"consistency": "CS",
129+
"validity": "VA",
130+
"uniqueness": "UQ",
131+
"timeliness": "TM",
132+
"volume": "VO",
133+
"unknown": "??",
134+
}
135+
136+
# Accent colors used to color-code the dimension badge in the validation report. Hues are chosen
137+
# to read as categorical (distinct from the warning/error/critical severity palette).
# Values are `#RRGGBB` hex strings. Keys are the six entries of `DIMENSION_NAMES` plus the
# `"unknown"` fallback (the same key set as `DIMENSION_ABBR`).
138+
DIMENSION_COLORS: dict[str, str] = {
139+
"completeness": "#3C6E9A", # blue
140+
"consistency": "#C57B3C", # amber
141+
"validity": "#4E9A6B", # green
142+
"uniqueness": "#8E6FB5", # purple
143+
"timeliness": "#3F9C9C", # teal
144+
"volume": "#7A7A8C", # slate
145+
"unknown": "#B0B0B0", # gray
146+
}
147+
65148
COMPARISON_OPERATORS = {
66149
"col_vals_gt": ">",
67150
"col_vals_ge": ">=",
@@ -130,6 +213,7 @@
130213
"thresholds",
131214
"label",
132215
"brief",
216+
"dimension",
133217
"active",
134218
"all_passed",
135219
"n",

0 commit comments

Comments
 (0)