11import re
22from collections .abc import Iterable
33from dataclasses import dataclass
4+ from datetime import datetime
5+ from enum import StrEnum
46from typing import Self
57from uuid import UUID , uuid4
68
7- from sqlalchemy import Column , ForeignKey , String , and_ , case , null , select
9+ from sqlalchemy import Boolean , Column , ForeignKey , String , and_ , case , null , select , update
810from sqlalchemy .dialects import postgresql
911from sqlalchemy .ext .hybrid import hybrid_property
1012from sqlalchemy .orm import aliased , relationship
1113from sqlalchemy .sql .functions import func
1214
1315from testgen .common .models import Base , get_current_session
1416from testgen .common .models .entity import Entity
17+ from testgen .common .models .job_execution import JobExecution
18+ from testgen .common .models .profile_result import ProfileResult
19+ from testgen .common .models .profiling_run import ProfilingRun
20+ from testgen .common .models .table_group import TableGroup
1521
1622PII_RISK_RE = re .compile (r"Risk: (MODERATE|HIGH)," )
1723
1824
25+ class Disposition (StrEnum ):
26+ """Stored disposition values for ``profile_anomaly_results.disposition`` and
27+ ``test_results.disposition``. The user-facing label for ``INACTIVE`` is "Muted"."""
28+ CONFIRMED = "Confirmed"
29+ DISMISSED = "Dismissed"
30+ INACTIVE = "Inactive"
31+
32+
33+ class IssueLikelihood (StrEnum ):
34+ """Stored ``profile_anomaly_types.issue_likelihood`` values."""
35+ DEFINITE = "Definite"
36+ LIKELY = "Likely"
37+ POSSIBLE = "Possible"
38+ POTENTIAL_PII = "Potential PII"
39+
40+
41+ class PiiRisk (StrEnum ):
42+ """Risk level extracted from PII issue ``detail`` strings via ``priority`` hybrid."""
43+ HIGH = "High"
44+ MODERATE = "Moderate"
45+
46+
1947@dataclass
2048class IssueLikelihoodCounts :
2149 """Counts of hygiene issues by likelihood category, with dismissed/inactive separated."""
@@ -36,14 +64,97 @@ def active(self):
3664 return self .total - self .inactive
3765
3866
67+ @dataclass
68+ class HygieneIssueListRow :
69+ """Row shape for ``list_hygiene_issues``."""
70+
71+ id : UUID
72+ project_code : str
73+ issue_type_name : str
74+ schema_name : str
75+ table_name : str
76+ column_name : str
77+ impact_dimension : str | None
78+ dq_dimension : str | None
79+ disposition : str
80+ priority : str | None
81+ detail : str
82+ detail_redactable : bool | None
83+ pii_flag : str | None
84+
85+
86+ @dataclass
87+ class HygieneIssueSearchRow :
88+ """Row shape for ``search_hygiene_issues``. Adds run + table-group context vs the list row."""
89+
90+ id : UUID
91+ project_code : str
92+ issue_type_name : str
93+ table_groups_name : str
94+ job_execution_id : UUID | None
95+ started_at : datetime | None
96+ schema_name : str
97+ table_name : str
98+ column_name : str
99+ impact_dimension : str | None
100+ dq_dimension : str | None
101+ disposition : str
102+ priority : str | None
103+ detail : str
104+ detail_redactable : bool | None
105+ pii_flag : str | None
106+
107+
108+ @dataclass
109+ class HygieneIssueDetail :
110+ """Full row + type definition + column-profile context for ``get_hygiene_issue``."""
111+
112+ id : UUID
113+ project_code : str
114+ issue_type_name : str
115+ type_description : str | None
116+ suggested_action : str | None
117+ schema_name : str
118+ table_name : str
119+ column_name : str
120+ dq_dimension : str | None
121+ impact_dimension : str | None
122+ disposition : str
123+ priority : str | None
124+ detail : str
125+ detail_redactable : bool | None
126+ pii_flag : str | None
127+ job_execution_id : UUID | None
128+ started_at : datetime | None
129+ column_general_type : str | None
130+ column_db_data_type : str | None
131+ column_record_ct : int | None
132+ column_null_value_ct : int | None
133+ column_distinct_value_ct : int | None
134+
135+
39136class HygieneIssueType (Base ):
40137 __tablename__ = "profile_anomaly_types"
41138
42139 id : str = Column (String , primary_key = True )
43140 likelihood : str = Column ("issue_likelihood" , String )
44141 name : str = Column ("anomaly_name" , String )
142+ description : str = Column ("anomaly_description" , String )
143+ suggested_action : str = Column (String )
144+ dq_dimension : str = Column (String )
145+ impact_dimension : str = Column (String )
146+ data_object : str = Column (String )
147+ detail_redactable : bool = Column (Boolean )
45148
46- # Note: not all table columns are implemented by this entity
149+ # Unmapped: anomaly_type, anomaly_criteria, detail_expression,
150+ # dq_score_prevalence_formula, dq_score_risk_factor.
151+
152+ @classmethod
153+ def select_where (cls , * clauses , order_by = None ) -> list [Self ]:
154+ query = select (cls ).where (* clauses )
155+ if order_by is not None :
156+ query = query .order_by (* order_by )
157+ return list (get_current_session ().scalars (query ))
47158
48159
49160class HygieneIssue (Entity ):
@@ -58,14 +169,17 @@ class HygieneIssue(Entity):
58169 type_id : str = Column ("anomaly_id" , String , ForeignKey ("profile_anomaly_types.id" ), nullable = False )
59170 type_ = relationship (HygieneIssueType )
60171
172+ column_id : UUID = Column (postgresql .UUID (as_uuid = True ))
173+
61174 schema_name : str = Column (String , nullable = False )
62175 table_name : str = Column (String , nullable = False )
63176 column_name : str = Column (String , nullable = False )
64177
65178 detail : str = Column (String , nullable = False )
66179 disposition : str = Column (String )
180+ impact_dimension : str = Column (String )
67181
68- # Note: not all table columns are implemented by this entity
182+ # Unmapped: column_type, db_data_type, dq_prevalence.
69183
70184 @hybrid_property
71185 def priority (self ):
@@ -132,6 +246,176 @@ def _count_active(likelihood_values: tuple[str, ...]):
132246 row = get_current_session ().execute (query ).first ()
133247 return IssueLikelihoodCounts (** {k : v for k , v in row ._mapping .items () if v is not None })
134248
249+ @classmethod
250+ def _priority_order (cls ):
251+ return case (
252+ (cls .priority == "Definite" , 1 ),
253+ (cls .priority == "Likely" , 2 ),
254+ (cls .priority == "Possible" , 3 ),
255+ (cls .priority == "High" , 4 ),
256+ (cls .priority == "Moderate" , 5 ),
257+ else_ = 6 ,
258+ )
259+
260+ @classmethod
261+ def list_for_run (
262+ cls ,
263+ job_execution_id : UUID ,
264+ * clauses ,
265+ page : int = 1 ,
266+ limit : int = 50 ,
267+ ) -> tuple [list [HygieneIssueListRow ], int ]:
268+ """Paginated hygiene issues for a single profiling run, scoped by its job_execution_id.
269+
270+ Caller-supplied ``*clauses`` carry every WHERE filter (project scoping, disposition,
271+ likelihood / pii_risk, table / column / dq_dimension / issue_type filters).
272+ """
273+ query = (
274+ select (
275+ cls .id .label ("id" ),
276+ cls .project_code .label ("project_code" ),
277+ HygieneIssueType .name .label ("issue_type_name" ),
278+ cls .schema_name .label ("schema_name" ),
279+ cls .table_name .label ("table_name" ),
280+ cls .column_name .label ("column_name" ),
281+ cls .impact_dimension .label ("impact_dimension" ),
282+ HygieneIssueType .dq_dimension .label ("dq_dimension" ),
283+ func .coalesce (cls .disposition , Disposition .CONFIRMED ).label ("disposition" ),
284+ cls .priority .label ("priority" ),
285+ cls .detail .label ("detail" ),
286+ HygieneIssueType .detail_redactable .label ("detail_redactable" ),
287+ ProfileResult .pii_flag .label ("pii_flag" ),
288+ )
289+ .join (HygieneIssueType , HygieneIssueType .id == cls .type_id )
290+ .join (ProfilingRun , ProfilingRun .id == cls .profile_run_id )
291+ .outerjoin (
292+ ProfileResult ,
293+ and_ (
294+ ProfileResult .profile_run_id == cls .profile_run_id ,
295+ ProfileResult .schema_name == cls .schema_name ,
296+ ProfileResult .table_name == cls .table_name ,
297+ ProfileResult .column_name == cls .column_name ,
298+ ),
299+ )
300+ .where (ProfilingRun .job_execution_id == job_execution_id , * clauses )
301+ .order_by (cls ._priority_order (), cls .table_name , cls .column_name , cls .id )
302+ )
303+ return cls ._paginate (query , page = page , limit = limit , data_class = HygieneIssueListRow )
304+
305+ @classmethod
306+ def search (
307+ cls ,
308+ * clauses ,
309+ page : int = 1 ,
310+ limit : int = 50 ,
311+ ) -> tuple [list [HygieneIssueSearchRow ], int ]:
312+ """Cross-run paginated search over hygiene issues.
313+
314+ Always JOINs ``ProfilingRun`` + ``JobExecution`` (for ``started_at`` + ``job_execution_id``)
315+ and ``TableGroup`` (for ``table_groups_name``). Caller-supplied ``*clauses`` carry every
316+ WHERE filter (project scoping, ``JobExecution.started_at`` window, all user filters).
317+ """
318+ query = (
319+ select (
320+ cls .id .label ("id" ),
321+ cls .project_code .label ("project_code" ),
322+ HygieneIssueType .name .label ("issue_type_name" ),
323+ TableGroup .table_groups_name .label ("table_groups_name" ),
324+ ProfilingRun .job_execution_id .label ("job_execution_id" ),
325+ JobExecution .started_at .label ("started_at" ),
326+ cls .schema_name .label ("schema_name" ),
327+ cls .table_name .label ("table_name" ),
328+ cls .column_name .label ("column_name" ),
329+ cls .impact_dimension .label ("impact_dimension" ),
330+ HygieneIssueType .dq_dimension .label ("dq_dimension" ),
331+ func .coalesce (cls .disposition , Disposition .CONFIRMED ).label ("disposition" ),
332+ cls .priority .label ("priority" ),
333+ cls .detail .label ("detail" ),
334+ HygieneIssueType .detail_redactable .label ("detail_redactable" ),
335+ ProfileResult .pii_flag .label ("pii_flag" ),
336+ )
337+ .join (HygieneIssueType , HygieneIssueType .id == cls .type_id )
338+ .join (ProfilingRun , ProfilingRun .id == cls .profile_run_id )
339+ .outerjoin (JobExecution , JobExecution .id == ProfilingRun .job_execution_id )
340+ .join (TableGroup , TableGroup .id == cls .table_groups_id )
341+ .outerjoin (
342+ ProfileResult ,
343+ and_ (
344+ ProfileResult .profile_run_id == cls .profile_run_id ,
345+ ProfileResult .schema_name == cls .schema_name ,
346+ ProfileResult .table_name == cls .table_name ,
347+ ProfileResult .column_name == cls .column_name ,
348+ ),
349+ )
350+ .where (* clauses )
351+ .order_by (JobExecution .started_at .desc (), cls ._priority_order (), cls .id )
352+ )
353+ return cls ._paginate (query , page = page , limit = limit , data_class = HygieneIssueSearchRow )
354+
355+ @classmethod
356+ def get_with_context (cls , issue_id : UUID , * clauses ) -> HygieneIssueDetail | None :
357+ """Fetch one hygiene issue with type definition + column-profile context.
358+
359+ Returns ``None`` when no row matches the id and ``*clauses`` together — the
360+ caller decides whether that's "missing", "not accessible", or both collapsed
361+ into one error.
362+
363+ Joins ``ProfileResult`` outer-style: table-level issues may have no matching
364+ column profile row, in which case the column_* fields stay ``None``.
365+ """
366+ query = (
367+ select (
368+ cls .id .label ("id" ),
369+ cls .project_code .label ("project_code" ),
370+ HygieneIssueType .name .label ("issue_type_name" ),
371+ HygieneIssueType .description .label ("type_description" ),
372+ HygieneIssueType .suggested_action .label ("suggested_action" ),
373+ cls .schema_name .label ("schema_name" ),
374+ cls .table_name .label ("table_name" ),
375+ cls .column_name .label ("column_name" ),
376+ HygieneIssueType .dq_dimension .label ("dq_dimension" ),
377+ cls .impact_dimension .label ("impact_dimension" ),
378+ func .coalesce (cls .disposition , Disposition .CONFIRMED ).label ("disposition" ),
379+ cls .priority .label ("priority" ),
380+ cls .detail .label ("detail" ),
381+ HygieneIssueType .detail_redactable .label ("detail_redactable" ),
382+ ProfileResult .pii_flag .label ("pii_flag" ),
383+ ProfilingRun .job_execution_id .label ("job_execution_id" ),
384+ JobExecution .started_at .label ("started_at" ),
385+ ProfileResult .general_type .label ("column_general_type" ),
386+ ProfileResult .db_data_type .label ("column_db_data_type" ),
387+ ProfileResult .record_ct .label ("column_record_ct" ),
388+ ProfileResult .null_value_ct .label ("column_null_value_ct" ),
389+ ProfileResult .distinct_value_ct .label ("column_distinct_value_ct" ),
390+ )
391+ .join (HygieneIssueType , HygieneIssueType .id == cls .type_id )
392+ .join (ProfilingRun , ProfilingRun .id == cls .profile_run_id )
393+ .outerjoin (JobExecution , JobExecution .id == ProfilingRun .job_execution_id )
394+ .outerjoin (
395+ ProfileResult ,
396+ and_ (
397+ ProfileResult .profile_run_id == cls .profile_run_id ,
398+ ProfileResult .schema_name == cls .schema_name ,
399+ ProfileResult .table_name == cls .table_name ,
400+ ProfileResult .column_name == cls .column_name ,
401+ ),
402+ )
403+ .where (cls .id == issue_id , * clauses )
404+ )
405+ row = get_current_session ().execute (query ).mappings ().first ()
406+ return HygieneIssueDetail (** row ) if row else None
407+
408+ @classmethod
409+ def update_disposition (cls , issue_id : UUID , disposition : str , * clauses ) -> bool :
410+ """Update disposition on a single hygiene issue, scoped by caller-supplied clauses.
411+
412+ Returns ``True`` if a row was updated, ``False`` if no row matched the id and
413+ ``*clauses`` together.
414+ """
415+ stmt = update (cls ).where (cls .id == issue_id , * clauses ).values (disposition = disposition )
416+ result = get_current_session ().execute (stmt )
417+ return result .rowcount > 0
418+
135419 @classmethod
136420 def select_with_diff (
137421 cls , profiling_run_id : UUID , other_profiling_run_id : UUID | None , * where_clauses , limit : int | None = None
0 commit comments