|
62 | 62 | "col_sum_eq": "sum_eq", |
63 | 63 | } |
64 | 64 |
|
# Canonical data quality dimension names used for health scoring. Each validation step is tagged
# with one of these categories: either inferred automatically from its `assertion_type`, or set
# explicitly through the `dimension=` parameter of a validation method.
DIMENSION_NAMES: list[str] = [
    "completeness", "consistency", "validity",
    "uniqueness", "timeliness", "volume",
]
| 76 | + |
# Default lookup from `assertion_type` to data quality dimension, which lets every validation step
# get a dimension automatically with no user effort. Users can remap globally via
# `pb.config(dimension_map=...)` or override a single step with a validation method's `dimension=`
# parameter. An assertion type that is absent from this mapping falls back to `"unknown"`.
#
# The mapping is assembled from per-dimension groups; insertion order follows the groups below.
ASSERTION_TYPE_TO_DIMENSION: dict[str, str] = {
    assertion_type: dimension
    for dimension, assertion_types in (
        # Completeness: presence/absence of values
        (
            "completeness",
            (
                "col_vals_null",
                "col_vals_not_null",
                "col_pct_null",
                "col_pct_missing",
                "col_missing_coded",
                "col_missing_only_coded",
                "rows_complete",
            ),
        ),
        # Consistency: internal agreement across columns/rows/tables
        (
            "consistency",
            (
                "col_missing_consistent",
                "conjointly",
                "tbl_match",
            ),
        ),
        # Validity: values conform to expected rules, ranges, formats, or schema
        (
            "validity",
            (
                "col_vals_gt",
                "col_vals_lt",
                "col_vals_eq",
                "col_vals_ne",
                "col_vals_ge",
                "col_vals_le",
                "col_vals_between",
                "col_vals_outside",
                "col_vals_in_set",
                "col_vals_not_in_set",
                "col_vals_regex",
                "col_vals_within_spec",
                "col_vals_increasing",
                "col_vals_decreasing",
                "col_vals_expr",
                "col_exists",
                "col_schema_match",
                "col_sum_eq",
                "prompt",
                "specially",
            ),
        ),
        # Uniqueness: absence of duplicate rows
        ("uniqueness", ("rows_distinct",)),
        # Timeliness: data recency/freshness
        ("timeliness", ("data_freshness",)),
        # Volume: expected row/column counts
        ("volume", ("row_count_match", "col_count_match")),
    )
    for assertion_type in assertion_types
}
| 123 | + |
# Two-letter abbreviation for each dimension. These appear in the compact, color-coded dimension
# badge placed in the corner of the step-number cell of the validation report.
DIMENSION_ABBR: dict[str, str] = dict(
    completeness="CM",
    consistency="CS",
    validity="VA",
    uniqueness="UQ",
    timeliness="TM",
    volume="VO",
    unknown="??",
)
| 135 | + |
# Accent color for each dimension, used to color-code the dimension badge in the validation
# report. The hues are meant to read as categorical, i.e. distinct from the
# warning/error/critical severity palette.
DIMENSION_COLORS: dict[str, str] = dict(
    [
        ("completeness", "#3C6E9A"),  # blue
        ("consistency", "#C57B3C"),  # amber
        ("validity", "#4E9A6B"),  # green
        ("uniqueness", "#8E6FB5"),  # purple
        ("timeliness", "#3F9C9C"),  # teal
        ("volume", "#7A7A8C"),  # slate
        ("unknown", "#B0B0B0"),  # gray
    ]
)
| 147 | + |
65 | 148 | COMPARISON_OPERATORS = { |
66 | 149 | "col_vals_gt": ">", |
67 | 150 | "col_vals_ge": ">=", |
|
130 | 213 | "thresholds", |
131 | 214 | "label", |
132 | 215 | "brief", |
| 216 | + "dimension", |
133 | 217 | "active", |
134 | 218 | "all_passed", |
135 | 219 | "n", |
|
0 commit comments