Skip to content

Commit 01ad413

Browse files
author
ci bot
committed
Merge branch 'feat/TG-1029-mcp-hygiene-issues' into 'enterprise'
feat(mcp): hygiene issues for MCP (TG-1029) See merge request dkinternal/testgen/dataops-testgen!499
2 parents 27a43d3 + ead8b6c commit 01ad413

15 files changed

Lines changed: 2628 additions & 25 deletions

testgen/common/enums.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""Shared enums used across multiple models, services, and surfaces.
2+
3+
Add an enum here when its values are referenced by more than one model file or by
4+
both the model layer and an outer surface (MCP, API, UI). Single-model enums live
5+
in their model file.
6+
"""
7+
from enum import StrEnum
8+
9+
10+
class QualityDimension(StrEnum):
11+
"""Stored ``dq_dimension`` values shared by ``profile_anomaly_types`` and ``test_types``.
12+
Surfaced to users as "Quality Dimension"."""
13+
ACCURACY = "Accuracy"
14+
COMPLETENESS = "Completeness"
15+
CONSISTENCY = "Consistency"
16+
RECENCY = "Recency"
17+
TIMELINESS = "Timeliness"
18+
UNIQUENESS = "Uniqueness"
19+
VALIDITY = "Validity"
20+
21+
22+
class ImpactDimension(StrEnum):
23+
"""Stored ``impact_dimension`` values shared by ``profile_anomaly_types`` /
24+
``profile_anomaly_results`` and ``test_types``. The primary dimension breakdown
25+
used by scorecards."""
26+
RELIABILITY = "Reliability"
27+
CONFORMANCE = "Conformance"
28+
REGULARITY = "Regularity"
29+
USABILITY = "Usability"

testgen/common/models/hygiene_issue.py

Lines changed: 287 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,49 @@
11
import re
22
from collections.abc import Iterable
33
from dataclasses import dataclass
4+
from datetime import datetime
5+
from enum import StrEnum
46
from typing import Self
57
from uuid import UUID, uuid4
68

7-
from sqlalchemy import Column, ForeignKey, String, and_, case, null, select
9+
from sqlalchemy import Boolean, Column, ForeignKey, String, and_, case, null, select, update
810
from sqlalchemy.dialects import postgresql
911
from sqlalchemy.ext.hybrid import hybrid_property
1012
from sqlalchemy.orm import aliased, relationship
1113
from sqlalchemy.sql.functions import func
1214

1315
from testgen.common.models import Base, get_current_session
1416
from testgen.common.models.entity import Entity
17+
from testgen.common.models.job_execution import JobExecution
18+
from testgen.common.models.profile_result import ProfileResult
19+
from testgen.common.models.profiling_run import ProfilingRun
20+
from testgen.common.models.table_group import TableGroup
1521

1622
PII_RISK_RE = re.compile(r"Risk: (MODERATE|HIGH),")
1723

1824

25+
class Disposition(StrEnum):
26+
"""Stored disposition values for ``profile_anomaly_results.disposition`` and
27+
``test_results.disposition``. The user-facing label for ``INACTIVE`` is "Muted"."""
28+
CONFIRMED = "Confirmed"
29+
DISMISSED = "Dismissed"
30+
INACTIVE = "Inactive"
31+
32+
33+
class IssueLikelihood(StrEnum):
34+
"""Stored ``profile_anomaly_types.issue_likelihood`` values."""
35+
DEFINITE = "Definite"
36+
LIKELY = "Likely"
37+
POSSIBLE = "Possible"
38+
POTENTIAL_PII = "Potential PII"
39+
40+
41+
class PiiRisk(StrEnum):
42+
"""Risk level extracted from PII issue ``detail`` strings via ``priority`` hybrid."""
43+
HIGH = "High"
44+
MODERATE = "Moderate"
45+
46+
1947
@dataclass
2048
class IssueLikelihoodCounts:
2149
"""Counts of hygiene issues by likelihood category, with dismissed/inactive separated."""
@@ -36,14 +64,97 @@ def active(self):
3664
return self.total - self.inactive
3765

3866

67+
@dataclass
68+
class HygieneIssueListRow:
69+
"""Row shape for ``list_hygiene_issues``."""
70+
71+
id: UUID
72+
project_code: str
73+
issue_type_name: str
74+
schema_name: str
75+
table_name: str
76+
column_name: str
77+
impact_dimension: str | None
78+
dq_dimension: str | None
79+
disposition: str
80+
priority: str | None
81+
detail: str
82+
detail_redactable: bool | None
83+
pii_flag: str | None
84+
85+
86+
@dataclass
87+
class HygieneIssueSearchRow:
88+
"""Row shape for ``search_hygiene_issues``. Adds run + table-group context vs the list row."""
89+
90+
id: UUID
91+
project_code: str
92+
issue_type_name: str
93+
table_groups_name: str
94+
job_execution_id: UUID | None
95+
started_at: datetime | None
96+
schema_name: str
97+
table_name: str
98+
column_name: str
99+
impact_dimension: str | None
100+
dq_dimension: str | None
101+
disposition: str
102+
priority: str | None
103+
detail: str
104+
detail_redactable: bool | None
105+
pii_flag: str | None
106+
107+
108+
@dataclass
109+
class HygieneIssueDetail:
110+
"""Full row + type definition + column-profile context for ``get_hygiene_issue``."""
111+
112+
id: UUID
113+
project_code: str
114+
issue_type_name: str
115+
type_description: str | None
116+
suggested_action: str | None
117+
schema_name: str
118+
table_name: str
119+
column_name: str
120+
dq_dimension: str | None
121+
impact_dimension: str | None
122+
disposition: str
123+
priority: str | None
124+
detail: str
125+
detail_redactable: bool | None
126+
pii_flag: str | None
127+
job_execution_id: UUID | None
128+
started_at: datetime | None
129+
column_general_type: str | None
130+
column_db_data_type: str | None
131+
column_record_ct: int | None
132+
column_null_value_ct: int | None
133+
column_distinct_value_ct: int | None
134+
135+
39136
class HygieneIssueType(Base):
40137
__tablename__ = "profile_anomaly_types"
41138

42139
id: str = Column(String, primary_key=True)
43140
likelihood: str = Column("issue_likelihood", String)
44141
name: str = Column("anomaly_name", String)
142+
description: str = Column("anomaly_description", String)
143+
suggested_action: str = Column(String)
144+
dq_dimension: str = Column(String)
145+
impact_dimension: str = Column(String)
146+
data_object: str = Column(String)
147+
detail_redactable: bool = Column(Boolean)
45148

46-
# Note: not all table columns are implemented by this entity
149+
# Unmapped: anomaly_type, anomaly_criteria, detail_expression,
150+
# dq_score_prevalence_formula, dq_score_risk_factor.
151+
152+
@classmethod
153+
def select_where(cls, *clauses, order_by=None) -> list[Self]:
154+
query = select(cls).where(*clauses)
155+
if order_by is not None:
156+
query = query.order_by(*order_by)
157+
return list(get_current_session().scalars(query))
47158

48159

49160
class HygieneIssue(Entity):
@@ -58,14 +169,17 @@ class HygieneIssue(Entity):
58169
type_id: str = Column("anomaly_id", String, ForeignKey("profile_anomaly_types.id"), nullable=False)
59170
type_ = relationship(HygieneIssueType)
60171

172+
column_id: UUID = Column(postgresql.UUID(as_uuid=True))
173+
61174
schema_name: str = Column(String, nullable=False)
62175
table_name: str = Column(String, nullable=False)
63176
column_name: str = Column(String, nullable=False)
64177

65178
detail: str = Column(String, nullable=False)
66179
disposition: str = Column(String)
180+
impact_dimension: str = Column(String)
67181

68-
# Note: not all table columns are implemented by this entity
182+
# Unmapped: column_type, db_data_type, dq_prevalence.
69183

70184
@hybrid_property
71185
def priority(self):
@@ -132,6 +246,176 @@ def _count_active(likelihood_values: tuple[str, ...]):
132246
row = get_current_session().execute(query).first()
133247
return IssueLikelihoodCounts(**{k: v for k, v in row._mapping.items() if v is not None})
134248

249+
@classmethod
250+
def _priority_order(cls):
251+
return case(
252+
(cls.priority == "Definite", 1),
253+
(cls.priority == "Likely", 2),
254+
(cls.priority == "Possible", 3),
255+
(cls.priority == "High", 4),
256+
(cls.priority == "Moderate", 5),
257+
else_=6,
258+
)
259+
260+
@classmethod
261+
def list_for_run(
262+
cls,
263+
job_execution_id: UUID,
264+
*clauses,
265+
page: int = 1,
266+
limit: int = 50,
267+
) -> tuple[list[HygieneIssueListRow], int]:
268+
"""Paginated hygiene issues for a single profiling run, scoped by its job_execution_id.
269+
270+
Caller-supplied ``*clauses`` carry every WHERE filter (project scoping, disposition,
271+
likelihood / pii_risk, table / column / dq_dimension / issue_type filters).
272+
"""
273+
query = (
274+
select(
275+
cls.id.label("id"),
276+
cls.project_code.label("project_code"),
277+
HygieneIssueType.name.label("issue_type_name"),
278+
cls.schema_name.label("schema_name"),
279+
cls.table_name.label("table_name"),
280+
cls.column_name.label("column_name"),
281+
cls.impact_dimension.label("impact_dimension"),
282+
HygieneIssueType.dq_dimension.label("dq_dimension"),
283+
func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"),
284+
cls.priority.label("priority"),
285+
cls.detail.label("detail"),
286+
HygieneIssueType.detail_redactable.label("detail_redactable"),
287+
ProfileResult.pii_flag.label("pii_flag"),
288+
)
289+
.join(HygieneIssueType, HygieneIssueType.id == cls.type_id)
290+
.join(ProfilingRun, ProfilingRun.id == cls.profile_run_id)
291+
.outerjoin(
292+
ProfileResult,
293+
and_(
294+
ProfileResult.profile_run_id == cls.profile_run_id,
295+
ProfileResult.schema_name == cls.schema_name,
296+
ProfileResult.table_name == cls.table_name,
297+
ProfileResult.column_name == cls.column_name,
298+
),
299+
)
300+
.where(ProfilingRun.job_execution_id == job_execution_id, *clauses)
301+
.order_by(cls._priority_order(), cls.table_name, cls.column_name, cls.id)
302+
)
303+
return cls._paginate(query, page=page, limit=limit, data_class=HygieneIssueListRow)
304+
305+
@classmethod
306+
def search(
307+
cls,
308+
*clauses,
309+
page: int = 1,
310+
limit: int = 50,
311+
) -> tuple[list[HygieneIssueSearchRow], int]:
312+
"""Cross-run paginated search over hygiene issues.
313+
314+
Always JOINs ``ProfilingRun`` + ``JobExecution`` (for ``started_at`` + ``job_execution_id``)
315+
and ``TableGroup`` (for ``table_groups_name``). Caller-supplied ``*clauses`` carry every
316+
WHERE filter (project scoping, ``JobExecution.started_at`` window, all user filters).
317+
"""
318+
query = (
319+
select(
320+
cls.id.label("id"),
321+
cls.project_code.label("project_code"),
322+
HygieneIssueType.name.label("issue_type_name"),
323+
TableGroup.table_groups_name.label("table_groups_name"),
324+
ProfilingRun.job_execution_id.label("job_execution_id"),
325+
JobExecution.started_at.label("started_at"),
326+
cls.schema_name.label("schema_name"),
327+
cls.table_name.label("table_name"),
328+
cls.column_name.label("column_name"),
329+
cls.impact_dimension.label("impact_dimension"),
330+
HygieneIssueType.dq_dimension.label("dq_dimension"),
331+
func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"),
332+
cls.priority.label("priority"),
333+
cls.detail.label("detail"),
334+
HygieneIssueType.detail_redactable.label("detail_redactable"),
335+
ProfileResult.pii_flag.label("pii_flag"),
336+
)
337+
.join(HygieneIssueType, HygieneIssueType.id == cls.type_id)
338+
.join(ProfilingRun, ProfilingRun.id == cls.profile_run_id)
339+
.outerjoin(JobExecution, JobExecution.id == ProfilingRun.job_execution_id)
340+
.join(TableGroup, TableGroup.id == cls.table_groups_id)
341+
.outerjoin(
342+
ProfileResult,
343+
and_(
344+
ProfileResult.profile_run_id == cls.profile_run_id,
345+
ProfileResult.schema_name == cls.schema_name,
346+
ProfileResult.table_name == cls.table_name,
347+
ProfileResult.column_name == cls.column_name,
348+
),
349+
)
350+
.where(*clauses)
351+
.order_by(JobExecution.started_at.desc(), cls._priority_order(), cls.id)
352+
)
353+
return cls._paginate(query, page=page, limit=limit, data_class=HygieneIssueSearchRow)
354+
355+
@classmethod
356+
def get_with_context(cls, issue_id: UUID, *clauses) -> HygieneIssueDetail | None:
357+
"""Fetch one hygiene issue with type definition + column-profile context.
358+
359+
Returns ``None`` when no row matches the id and ``*clauses`` together — the
360+
caller decides whether that's "missing", "not accessible", or both collapsed
361+
into one error.
362+
363+
Joins ``ProfileResult`` outer-style: table-level issues may have no matching
364+
column profile row, in which case the column_* fields stay ``None``.
365+
"""
366+
query = (
367+
select(
368+
cls.id.label("id"),
369+
cls.project_code.label("project_code"),
370+
HygieneIssueType.name.label("issue_type_name"),
371+
HygieneIssueType.description.label("type_description"),
372+
HygieneIssueType.suggested_action.label("suggested_action"),
373+
cls.schema_name.label("schema_name"),
374+
cls.table_name.label("table_name"),
375+
cls.column_name.label("column_name"),
376+
HygieneIssueType.dq_dimension.label("dq_dimension"),
377+
cls.impact_dimension.label("impact_dimension"),
378+
func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"),
379+
cls.priority.label("priority"),
380+
cls.detail.label("detail"),
381+
HygieneIssueType.detail_redactable.label("detail_redactable"),
382+
ProfileResult.pii_flag.label("pii_flag"),
383+
ProfilingRun.job_execution_id.label("job_execution_id"),
384+
JobExecution.started_at.label("started_at"),
385+
ProfileResult.general_type.label("column_general_type"),
386+
ProfileResult.db_data_type.label("column_db_data_type"),
387+
ProfileResult.record_ct.label("column_record_ct"),
388+
ProfileResult.null_value_ct.label("column_null_value_ct"),
389+
ProfileResult.distinct_value_ct.label("column_distinct_value_ct"),
390+
)
391+
.join(HygieneIssueType, HygieneIssueType.id == cls.type_id)
392+
.join(ProfilingRun, ProfilingRun.id == cls.profile_run_id)
393+
.outerjoin(JobExecution, JobExecution.id == ProfilingRun.job_execution_id)
394+
.outerjoin(
395+
ProfileResult,
396+
and_(
397+
ProfileResult.profile_run_id == cls.profile_run_id,
398+
ProfileResult.schema_name == cls.schema_name,
399+
ProfileResult.table_name == cls.table_name,
400+
ProfileResult.column_name == cls.column_name,
401+
),
402+
)
403+
.where(cls.id == issue_id, *clauses)
404+
)
405+
row = get_current_session().execute(query).mappings().first()
406+
return HygieneIssueDetail(**row) if row else None
407+
408+
@classmethod
409+
def update_disposition(cls, issue_id: UUID, disposition: str, *clauses) -> bool:
410+
"""Update disposition on a single hygiene issue, scoped by caller-supplied clauses.
411+
412+
Returns ``True`` if a row was updated, ``False`` if no row matched the id and
413+
``*clauses`` together.
414+
"""
415+
stmt = update(cls).where(cls.id == issue_id, *clauses).values(disposition=disposition)
416+
result = get_current_session().execute(stmt)
417+
return result.rowcount > 0
418+
135419
@classmethod
136420
def select_with_diff(
137421
cls, profiling_run_id: UUID, other_profiling_run_id: UUID | None, *where_clauses, limit: int | None = None

testgen/common/models/profiling_run.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,23 @@ def get_latest_run(cls, project_code: str) -> LatestProfilingRun | None:
162162
return LatestProfilingRun(str(result["id"]), result["run_time"])
163163
return None
164164

165+
@classmethod
166+
def get_latest_complete_je_id_for_table_group(cls, table_groups_id: UUID) -> UUID | None:
167+
"""Return the ``job_execution_id`` of the latest completed profiling run for a table group.
168+
169+
Computed live from ``profiling_runs`` joined to ``job_executions`` — does not read the
170+
legacy ``table_groups.last_complete_profile_run_id`` cache, which points at the internal
171+
run PK rather than the JE id.
172+
"""
173+
query = (
174+
select(cls.job_execution_id)
175+
.join(JobExecution, cls.job_execution_id == JobExecution.id)
176+
.where(cls.table_groups_id == table_groups_id, JobExecution.status == JobStatus.COMPLETED)
177+
.order_by(desc(JobExecution.started_at))
178+
.limit(1)
179+
)
180+
return get_current_session().scalar(query)
181+
165182
@classmethod
166183
@st.cache_data(show_spinner=False, hash_funcs=ENTITY_HASH_FUNCS)
167184
def select_minimal_where(

0 commit comments

Comments
 (0)