Skip to content

Commit 3c6d005

Browse files
committed
fix(mcp): hygiene issues review feedback (TG-1029)
Round of review fixes applied on top of the initial implementation: * Drop the legacy `profile_run_id` from the MCP layer — model methods take a `job_execution_id` and JOIN `ProfilingRun` to filter. New `ProfilingRun.get_latest_complete_je_id_for_table_group` avoids reading the `table_groups.last_complete_profile_run_id` cache (which still points at the internal run PK; schema-level cleanup tracked separately). * Trust the run/JE backfill migration — drop `if x is None` guards and the descriptive-string fallback in `_resolve_profile_run`. * StrEnums for fixed string sets: `Disposition`, `IssueLikelihood`, `PiiRisk` (in `common/models/hygiene_issue.py`) and `QualityDimension` (new shared `common/enums.py`). Parsers return enum types; coalesce defaults reference enum members; sentinel sets dropped. * Tighten labels: heading uses "for profiling run"; `get_hygiene_issue` no longer triples "Run / Profiling Run / Run Date". * Empty states render with title + italic marker. * `dq_prevalence` removed from output and dataclasses (no precedent label, unhelpful score-engine internal).
1 parent a026cc1 commit 3c6d005

14 files changed

Lines changed: 1893 additions & 94 deletions

testgen/common/enums.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""Shared enums used across multiple models, services, and surfaces.
2+
3+
Add an enum here when its values are referenced by more than one model file or by
4+
both the model layer and an outer surface (MCP, API, UI). Single-model enums live
5+
in their model file.
6+
"""
7+
from enum import StrEnum
8+
9+
10+
class QualityDimension(StrEnum):
11+
"""Stored ``dq_dimension`` values shared by ``profile_anomaly_types`` and ``test_types``.
12+
Surfaced to users as "Quality Dimension"."""
13+
ACCURACY = "Accuracy"
14+
COMPLETENESS = "Completeness"
15+
CONSISTENCY = "Consistency"
16+
RECENCY = "Recency"
17+
TIMELINESS = "Timeliness"
18+
UNIQUENESS = "Uniqueness"
19+
VALIDITY = "Validity"
20+
21+
22+
class ImpactDimension(StrEnum):
23+
"""Stored ``impact_dimension`` values shared by ``profile_anomaly_types`` /
24+
``profile_anomaly_results`` and ``test_types``. The primary dimension breakdown
25+
used by scorecards."""
26+
RELIABILITY = "Reliability"
27+
CONFORMANCE = "Conformance"
28+
REGULARITY = "Regularity"
29+
USABILITY = "Usability"

testgen/common/models/hygiene_issue.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from collections.abc import Iterable
33
from dataclasses import dataclass
44
from datetime import datetime
5+
from enum import StrEnum
56
from typing import Self
67
from uuid import UUID, uuid4
78

@@ -21,6 +22,28 @@
2122
PII_RISK_RE = re.compile(r"Risk: (MODERATE|HIGH),")
2223

2324

25+
class Disposition(StrEnum):
26+
"""Stored disposition values for ``profile_anomaly_results.disposition`` and
27+
``test_results.disposition``. The user-facing label for ``INACTIVE`` is "Muted"."""
28+
CONFIRMED = "Confirmed"
29+
DISMISSED = "Dismissed"
30+
INACTIVE = "Inactive"
31+
32+
33+
class IssueLikelihood(StrEnum):
34+
"""Stored ``profile_anomaly_types.issue_likelihood`` values."""
35+
DEFINITE = "Definite"
36+
LIKELY = "Likely"
37+
POSSIBLE = "Possible"
38+
POTENTIAL_PII = "Potential PII"
39+
40+
41+
class PiiRisk(StrEnum):
42+
"""Risk level extracted from PII issue ``detail`` strings via ``priority`` hybrid."""
43+
HIGH = "High"
44+
MODERATE = "Moderate"
45+
46+
2447
@dataclass
2548
class IssueLikelihoodCounts:
2649
"""Counts of hygiene issues by likelihood category, with dismissed/inactive separated."""
@@ -51,6 +74,7 @@ class HygieneIssueListRow:
5174
schema_name: str
5275
table_name: str
5376
column_name: str
77+
impact_dimension: str | None
5478
dq_dimension: str | None
5579
disposition: str
5680
priority: str | None
@@ -72,6 +96,7 @@ class HygieneIssueSearchRow:
7296
schema_name: str
7397
table_name: str
7498
column_name: str
99+
impact_dimension: str | None
75100
dq_dimension: str | None
76101
disposition: str
77102
priority: str | None
@@ -92,7 +117,6 @@ class HygieneIssueDetail:
92117
schema_name: str
93118
table_name: str
94119
column_name: str
95-
db_data_type: str | None
96120
dq_dimension: str | None
97121
impact_dimension: str | None
98122
disposition: str
@@ -150,13 +174,12 @@ class HygieneIssue(Entity):
150174
schema_name: str = Column(String, nullable=False)
151175
table_name: str = Column(String, nullable=False)
152176
column_name: str = Column(String, nullable=False)
153-
db_data_type: str = Column(String)
154177

155178
detail: str = Column(String, nullable=False)
156179
disposition: str = Column(String)
157180
impact_dimension: str = Column(String)
158181

159-
# Unmapped: column_type, dq_prevalence.
182+
# Unmapped: column_type, db_data_type, dq_prevalence.
160183

161184
@hybrid_property
162185
def priority(self):
@@ -237,12 +260,12 @@ def _priority_order(cls):
237260
@classmethod
238261
def list_for_run(
239262
cls,
240-
profile_run_id: UUID,
263+
job_execution_id: UUID,
241264
*clauses,
242265
page: int = 1,
243266
limit: int = 50,
244267
) -> tuple[list[HygieneIssueListRow], int]:
245-
"""Paginated hygiene issues for a single profiling run.
268+
"""Paginated hygiene issues for a single profiling run, scoped by its job_execution_id.
246269
247270
Caller-supplied ``*clauses`` carry every WHERE filter (project scoping, disposition,
248271
likelihood / pii_risk, table / column / dq_dimension / issue_type filters).
@@ -255,14 +278,16 @@ def list_for_run(
255278
cls.schema_name.label("schema_name"),
256279
cls.table_name.label("table_name"),
257280
cls.column_name.label("column_name"),
281+
cls.impact_dimension.label("impact_dimension"),
258282
HygieneIssueType.dq_dimension.label("dq_dimension"),
259-
func.coalesce(cls.disposition, "Confirmed").label("disposition"),
283+
func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"),
260284
cls.priority.label("priority"),
261285
cls.detail.label("detail"),
262286
HygieneIssueType.detail_redactable.label("detail_redactable"),
263287
ProfileResult.pii_flag.label("pii_flag"),
264288
)
265289
.join(HygieneIssueType, HygieneIssueType.id == cls.type_id)
290+
.join(ProfilingRun, ProfilingRun.id == cls.profile_run_id)
266291
.outerjoin(
267292
ProfileResult,
268293
and_(
@@ -272,7 +297,7 @@ def list_for_run(
272297
ProfileResult.column_name == cls.column_name,
273298
),
274299
)
275-
.where(cls.profile_run_id == profile_run_id, *clauses)
300+
.where(ProfilingRun.job_execution_id == job_execution_id, *clauses)
276301
.order_by(cls._priority_order(), cls.table_name, cls.column_name, cls.id)
277302
)
278303
return cls._paginate(query, page=page, limit=limit, data_class=HygieneIssueListRow)
@@ -301,8 +326,9 @@ def search(
301326
cls.schema_name.label("schema_name"),
302327
cls.table_name.label("table_name"),
303328
cls.column_name.label("column_name"),
329+
cls.impact_dimension.label("impact_dimension"),
304330
HygieneIssueType.dq_dimension.label("dq_dimension"),
305-
func.coalesce(cls.disposition, "Confirmed").label("disposition"),
331+
func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"),
306332
cls.priority.label("priority"),
307333
cls.detail.label("detail"),
308334
HygieneIssueType.detail_redactable.label("detail_redactable"),
@@ -347,10 +373,9 @@ def get_with_context(cls, issue_id: UUID, *clauses) -> HygieneIssueDetail | None
347373
cls.schema_name.label("schema_name"),
348374
cls.table_name.label("table_name"),
349375
cls.column_name.label("column_name"),
350-
cls.db_data_type.label("db_data_type"),
351376
HygieneIssueType.dq_dimension.label("dq_dimension"),
352377
cls.impact_dimension.label("impact_dimension"),
353-
func.coalesce(cls.disposition, "Confirmed").label("disposition"),
378+
func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"),
354379
cls.priority.label("priority"),
355380
cls.detail.label("detail"),
356381
HygieneIssueType.detail_redactable.label("detail_redactable"),

testgen/common/models/profiling_run.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,23 @@ def get_latest_run(cls, project_code: str) -> LatestProfilingRun | None:
162162
return LatestProfilingRun(str(result["id"]), result["run_time"])
163163
return None
164164

165+
@classmethod
166+
def get_latest_complete_je_id_for_table_group(cls, table_groups_id: UUID) -> UUID | None:
167+
"""Return the ``job_execution_id`` of the latest completed profiling run for a table group.
168+
169+
Computed live from ``profiling_runs`` joined to ``job_executions`` — does not read the
170+
legacy ``table_groups.last_complete_profile_run_id`` cache, which points at the internal
171+
run PK rather than the JE id.
172+
"""
173+
query = (
174+
select(cls.job_execution_id)
175+
.join(JobExecution, cls.job_execution_id == JobExecution.id)
176+
.where(cls.table_groups_id == table_groups_id, JobExecution.status == JobStatus.COMPLETED)
177+
.order_by(desc(JobExecution.started_at))
178+
.limit(1)
179+
)
180+
return get_current_session().scalar(query)
181+
165182
@classmethod
166183
@st.cache_data(show_spinner=False, hash_funcs=ENTITY_HASH_FUNCS)
167184
def select_minimal_where(

testgen/mcp/prompts/workflows.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,16 +95,15 @@ def hygiene_triage(table_group_id: str | None = None) -> str:
9595
steps.append(
9696
"Call `get_data_inventory()` to see projects and table groups, with profiling status per group."
9797
)
98-
steps.append(f"Call `list_profiling_summaries(table_group_id={tg})` to see hygiene issue counts per run.")
9998
steps.append(f"Call `list_hygiene_issues(table_group_id={tg}, disposition='Confirmed')` for the issues to review.")
10099
steps.append(
101100
"For each top issue (ordered by priority), call `get_hygiene_issue(issue_id='...')` for full context — "
102101
"issue type description, suggested action, column profile."
103102
)
104103
steps.append("For unfamiliar issue types, read `testgen://hygiene-issue-types` once for the reference table.")
105104
steps.append(
106-
"For each issue: explain what was found, then ask the user whether to dismiss the issue "
107-
"(call `update_hygiene_issue(issue_id='...', disposition='Dismissed')`) or investigate further."
105+
"For each issue: explain what was found, then help the user decide a disposition — **Confirmed**, **Dismissed**, or **Muted**. "
106+
"Apply via `update_hygiene_issue(issue_id='...', disposition='...')`, or leave it open for further investigation."
108107
)
109108
steps.append("Summarize the triage results and any patterns noted across the issues.")
110109

testgen/mcp/tools/common.py

Lines changed: 65 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,24 @@
22
from uuid import UUID
33

44
from testgen.common.date_service import parse_since
5-
from testgen.common.models.hygiene_issue import HygieneIssueType
5+
from testgen.common.enums import ImpactDimension, QualityDimension
6+
from testgen.common.models.hygiene_issue import Disposition, HygieneIssueType, IssueLikelihood, PiiRisk
67
from testgen.common.models.table_group import TableGroup
78
from testgen.common.models.test_definition import TestType
89
from testgen.common.models.test_result import TestResultStatus
910
from testgen.common.models.test_suite import TestSuite
1011
from testgen.mcp.exceptions import MCPResourceNotAccessible, MCPUserError
1112
from testgen.mcp.permissions import get_project_permissions
1213

13-
VALID_DQ_DIMENSIONS = {"Accuracy", "Completeness", "Consistency", "Recency", "Timeliness", "Uniqueness", "Validity"}
14-
# DB stores "Inactive"; user-facing label is "Muted".
15-
_DISPOSITION_USER_TO_DB = {"Confirmed": "Confirmed", "Dismissed": "Dismissed", "Muted": "Inactive"}
16-
_DISPOSITION_DB_TO_USER = {v: k for k, v in _DISPOSITION_USER_TO_DB.items()}
17-
_VALID_ISSUE_LIKELIHOODS = {"Definite", "Likely", "Possible"}
18-
_VALID_PII_RISKS = {"High", "Moderate"}
14+
# User-facing label for ``Disposition.INACTIVE`` is "Muted" — accept that label on input.
15+
_DISPOSITION_USER_TO_DB: dict[str, Disposition] = {
16+
"Confirmed": Disposition.CONFIRMED,
17+
"Dismissed": Disposition.DISMISSED,
18+
"Muted": Disposition.INACTIVE,
19+
}
20+
_DISPOSITION_DB_TO_USER: dict[Disposition, str] = {v: k for k, v in _DISPOSITION_USER_TO_DB.items()}
21+
# Filter accepts only the regular likelihoods — PII rows are filtered separately via ``pii_risk``.
22+
_FILTERABLE_LIKELIHOODS = frozenset({IssueLikelihood.DEFINITE, IssueLikelihood.LIKELY, IssueLikelihood.POSSIBLE})
1923

2024

2125
def parse_uuid(value: str, label: str = "ID") -> UUID:
@@ -50,44 +54,74 @@ def parse_since_arg(value: str, label: str = "since", *, today: date | None = No
5054
raise MCPUserError(f"Invalid `{label}`: {err}") from err
5155

5256

53-
def parse_quality_dimension(value: str) -> str:
54-
if value not in VALID_DQ_DIMENSIONS:
55-
valid = ", ".join(sorted(VALID_DQ_DIMENSIONS))
56-
raise MCPUserError(f"Invalid quality_dimension `{value}`. Valid values: {valid}")
57-
return value
57+
def parse_impact_dimension(value: str) -> ImpactDimension:
58+
try:
59+
return ImpactDimension(value)
60+
except ValueError as err:
61+
valid = ", ".join(d.value for d in ImpactDimension)
62+
raise MCPUserError(f"Invalid impact_dimension `{value}`. Valid values: {valid}") from err
63+
5864

65+
def parse_quality_dimension(value: str) -> QualityDimension:
66+
try:
67+
return QualityDimension(value)
68+
except ValueError as err:
69+
valid = ", ".join(d.value for d in QualityDimension)
70+
raise MCPUserError(f"Invalid quality_dimension `{value}`. Valid values: {valid}") from err
5971

60-
def parse_disposition(value: str) -> str:
61-
"""Validate a user-facing disposition label and return the DB value.
6272

63-
Accepts ``Confirmed``, ``Dismissed``, ``Muted`` and returns ``Confirmed``,
64-
``Dismissed``, ``Inactive`` respectively (the DB encodes the legacy ``Inactive``).
73+
def parse_disposition(value: str) -> Disposition:
74+
"""Validate a user-facing disposition label and return the stored ``Disposition``.
75+
76+
Accepts ``Confirmed``, ``Dismissed``, ``Muted`` (user-facing labels). The DB encodes
77+
``INACTIVE`` for "Muted" — see ``Disposition``.
6578
"""
66-
if value not in _DISPOSITION_USER_TO_DB:
79+
db_value = _DISPOSITION_USER_TO_DB.get(value)
80+
if db_value is None:
6781
valid = ", ".join(sorted(_DISPOSITION_USER_TO_DB))
6882
raise MCPUserError(f"Invalid disposition `{value}`. Valid values: {valid}")
69-
return _DISPOSITION_USER_TO_DB[value]
70-
83+
return db_value
7184

72-
def format_disposition(value: str) -> str:
73-
"""Map a DB disposition value to its user-facing label (``Inactive`` → ``Muted``)."""
74-
return _DISPOSITION_DB_TO_USER.get(value, value)
7585

76-
77-
def parse_issue_likelihood_list(values: list[str]) -> list[str]:
78-
invalid = [v for v in values if v not in _VALID_ISSUE_LIKELIHOODS]
86+
def format_disposition(value: Disposition | str) -> str:
87+
"""Map a stored disposition to its user-facing label (``INACTIVE`` → "Muted")."""
88+
try:
89+
return _DISPOSITION_DB_TO_USER[Disposition(value)]
90+
except ValueError:
91+
return str(value)
92+
93+
94+
def parse_issue_likelihood_list(values: list[str]) -> list[IssueLikelihood]:
95+
parsed: list[IssueLikelihood] = []
96+
invalid: list[str] = []
97+
for value in values:
98+
try:
99+
likelihood = IssueLikelihood(value)
100+
except ValueError:
101+
invalid.append(value)
102+
continue
103+
if likelihood not in _FILTERABLE_LIKELIHOODS:
104+
invalid.append(value)
105+
continue
106+
parsed.append(likelihood)
79107
if invalid:
80-
valid = ", ".join(sorted(_VALID_ISSUE_LIKELIHOODS))
108+
valid = ", ".join(sorted(v.value for v in _FILTERABLE_LIKELIHOODS))
81109
raise MCPUserError(f"Invalid issue_likelihood values {invalid}. Valid values: {valid}")
82-
return values
110+
return parsed
83111

84112

85-
def parse_pii_risk_list(values: list[str]) -> list[str]:
86-
invalid = [v for v in values if v not in _VALID_PII_RISKS]
113+
def parse_pii_risk_list(values: list[str]) -> list[PiiRisk]:
114+
parsed: list[PiiRisk] = []
115+
invalid: list[str] = []
116+
for value in values:
117+
try:
118+
parsed.append(PiiRisk(value))
119+
except ValueError:
120+
invalid.append(value)
87121
if invalid:
88-
valid = ", ".join(sorted(_VALID_PII_RISKS))
122+
valid = ", ".join(r.value for r in PiiRisk)
89123
raise MCPUserError(f"Invalid pii_risk values {invalid}. Valid values: {valid}")
90-
return values
124+
return parsed
91125

92126

93127
def resolve_test_type(short_name: str) -> str:

0 commit comments

Comments
 (0)