Skip to content

Commit 24c2e03

Browse files
committed
TutorTask696: Add quality handling and univariate analysis
Pre-commit checks: All checks passed ✅
1 parent 60ef3b3 commit 24c2e03

9 files changed

Lines changed: 2714 additions & 3 deletions

File tree

agentic_eda/jupyterlab_extension_backend/src/main.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,11 @@
1515
import src.ingest.infer_structure as sinferstruct
1616
import src.ingest.infer_type as sinfert
1717
import src.ingest.integrity as sinteg
18+
import src.quality_handling.audit_missingness as sauditmiss
19+
import src.quality_handling.handle_missingness as shandlemiss
20+
import src.quality_handling.standardize as sstandard
21+
import src.univariate_analysis.test_transforms as stransforms
22+
import src.univariate_analysis.univariate_metrics_plotting as sunivar
1823

1924
_LOG = logging.getLogger(__name__)
2025

@@ -36,6 +41,11 @@ def _parse_args() -> argparse.Namespace:
3641
"infer_structure",
3742
"compute_temporal_stats",
3843
"integrity",
44+
"audit_missingness",
45+
"handle_missingness",
46+
"standardize",
47+
"univariate_metrics_plotting",
48+
"test_transforms",
3949
],
4050
help="Pipeline stage to execute.",
4151
)
@@ -68,6 +78,16 @@ def _run_cli(args: argparse.Namespace) -> dict:
6878
payload = sinferstruct.run_infer_structure(args.path)
6979
elif mode == "compute_temporal_stats":
7080
payload = sctstats.run_compute_temporal_stats(args.path)
81+
elif mode == "audit_missingness":
82+
payload = sauditmiss.run_audit_missingness(args.path)
83+
elif mode == "handle_missingness":
84+
payload = shandlemiss.run_handle_missingness(args.path)
85+
elif mode == "standardize":
86+
payload = sstandard.run_standardize(args.path)
87+
elif mode == "univariate_metrics_plotting":
88+
payload = sunivar.run_univariate_metrics_plotting(args.path)
89+
elif mode == "test_transforms":
90+
payload = stransforms.run_test_transforms(args.path)
7191
else:
7292
raise ValueError(f"Unsupported mode='{mode}'")
7393
return payload
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
"""
2+
Quality-handling stages and helpers for the Jupyter backend.
3+
"""
Lines changed: 209 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,209 @@
1+
"""
2+
Import as:
3+
4+
import src.quality_handling.audit_missingness as sauditmiss
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import argparse
10+
import logging
11+
from typing import TypedDict
12+
13+
import langgraph.graph as lgraph
14+
15+
import src.ingest.compute_temporal_stats as sctstats
16+
import src.tools.input_tools as tinptool
17+
18+
_LOG = logging.getLogger(__name__)
19+
20+
21+
class MissingnessAuditState(TypedDict):
    """
    Store deterministic missingness audit output.

    The single key mirrors the `missingness_report` key declared on
    `CompositeState`.
    """

    # Report dict produced by the missingness audit.
    missingness_report: dict
27+
28+
29+
class CompositeState(TypedDict):
    """
    Store graph state for missingness auditing.

    Every key is seeded with an initial value by `run_audit_missingness`.
    Within this module only `path`, `primary_key`, `secondary_keys` and
    `winner_formatter` are read, and only `has_missing_values` and
    `missingness_report` are written by the audit node. The remaining keys
    are presumably filled by the upstream temporal-stats pipeline
    (TODO: confirm against `src.ingest.compute_temporal_stats`).
    """

    # Dataset path; the only value supplied by the caller.
    path: str
    done: list[str]
    has_header: bool
    # Written by the audit node.
    has_missing_values: bool
    error: str
    info: str
    cols: list[str]
    temporal_cols: list[str]
    numeric_val_cols: list[str]
    categorical_val_cols: list[str]
    bad_rows: list[dict]
    metadata: dict
    time_col: str
    candidates: list[dict]
    # Forwarded unchanged to the audit tool.
    winner_formatter: dict
    entity_col: str | None
    numeric_cols: list[str]
    nonnegative_cols: list[str]
    jump_mult: float
    report: dict
    summary: str
    flag: str
    type: str
    # Forwarded to the audit tool; `primary_key` is passed as its `time_col`.
    primary_key: str
    secondary_keys: list[str]
    numeric_continuous_cols: list[str]
    numeric_count_cols: list[str]
    binary_flag_cols: list[str]
    categorical_feature_cols: list[str]
    known_exogenous_cols: list[str]
    target_cols: list[str]
    covariate_cols: list[str]
    n_nat_time: int
    min_time: str | None
    max_time: str | None
    typical_delta_mode: str | None
    typical_delta_median: str | None
    expected_frequency: str | None
    dominant_frequency_fraction: float
    is_irregular_sampling: bool
    resampling_decision: str
    coverage_summary: dict
    coverage_per_entity: list[dict]
    # Written by the audit node.
    missingness_report: dict
78+
79+
80+
def call_compute_temporal_stats(state: CompositeState) -> dict:
    """
    Run the sequential pipeline up to temporal statistics.

    :param state: graph state; only `path` is read here
    :return: composite payload from compute_temporal_stats
    """
    return sctstats.run_compute_temporal_stats(state["path"])
89+
90+
91+
def audit_missingness(state: CompositeState) -> dict:
    """
    Audit value missingness and timestamp missingness deterministically.

    :param state: graph state; reads `path`, `primary_key`,
        `secondary_keys` and `winner_formatter`
    :return: partial state update holding `missingness_report` and the
        derived `has_missing_values` flag
    """
    # The primary key is handed to the audit tool as its time column.
    missingness_report = tinptool.audit_missingness.invoke(
        {
            "path": state["path"],
            "time_col": state["primary_key"],
            "secondary_keys": state["secondary_keys"],
            "winner_formatter": state["winner_formatter"],
        }
    )
    # Hand the inputs and the raw report to the stage tracer under this
    # stage's name.
    trace_payload = {
        "primary_key": state["primary_key"],
        "secondary_keys": state["secondary_keys"],
        "missingness_report": missingness_report,
    }
    tinptool.write_stage_trace(
        state["path"], "audit_missingness", trace_payload
    )
    # Flag is true when either summary reports at least one missing item;
    # the timestamp summary is only consulted when no cells are missing.
    value_summary = missingness_report["value_missingness_summary"]
    has_missing_values = bool(
        value_summary["total_missing_cells"] > 0
        or missingness_report["timestamp_missingness_summary"][
            "total_missing_timestamps"
        ]
        > 0
    )
    payload = {
        "missingness_report": missingness_report,
        "has_missing_values": has_missing_values,
    }
    return payload
120+
121+
122+
# Linear graph:
# START -> compute_temporal_stats_pipeline -> audit_missingness -> END.
missingness_audit = lgraph.StateGraph(CompositeState)
missingness_audit.add_node(
    "compute_temporal_stats_pipeline", call_compute_temporal_stats
)
missingness_audit.add_node("audit_missingness", audit_missingness)
missingness_audit.add_edge(lgraph.START, "compute_temporal_stats_pipeline")
missingness_audit.add_edge(
    "compute_temporal_stats_pipeline", "audit_missingness"
)
missingness_audit.add_edge("audit_missingness", lgraph.END)
# Compiled once at import time and reused by `run_audit_missingness`.
graph = missingness_audit.compile()
129+
130+
131+
def run_audit_missingness(path: str) -> dict:
    """
    Execute missingness auditing end to end.

    Seeds every `CompositeState` key with an initial value, invokes the
    compiled graph and logs the resulting state at INFO level.

    :param path: dataset path
    :return: full composite graph payload
    """
    init_state: CompositeState = {
        "path": path,
        "done": [],
        "has_header": True,
        "has_missing_values": False,
        "error": "",
        "info": "",
        "cols": [],
        "temporal_cols": [],
        "numeric_val_cols": [],
        "categorical_val_cols": [],
        "bad_rows": [],
        "metadata": {},
        "time_col": "",
        "candidates": [],
        "winner_formatter": {},
        "entity_col": None,
        "numeric_cols": [],
        "nonnegative_cols": [],
        "jump_mult": 20.0,
        "report": {},
        "summary": "",
        "flag": "",
        "type": "",
        "primary_key": "",
        "secondary_keys": [],
        "numeric_continuous_cols": [],
        "numeric_count_cols": [],
        "binary_flag_cols": [],
        "categorical_feature_cols": [],
        "known_exogenous_cols": [],
        "target_cols": [],
        "covariate_cols": [],
        "n_nat_time": 0,
        "min_time": None,
        "max_time": None,
        "typical_delta_mode": None,
        "typical_delta_median": None,
        "expected_frequency": None,
        "dominant_frequency_fraction": 0.0,
        "is_irregular_sampling": False,
        "resampling_decision": "",
        "coverage_summary": {},
        "coverage_per_entity": [],
        "missingness_report": {},
    }
    payload = graph.invoke(init_state)
    _LOG.info("Missingness audit output: %s", payload)
    return payload
188+
189+
190+
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """
    Parse command-line arguments.

    :param argv: argument list to parse; when None, argparse falls back to
        the process command line
    :return: parsed arguments, with the required `path` attribute
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--path",
        required=True,
        help="Path to dataset file.",
    )
    return parser.parse_args(argv)
204+
205+
206+
if __name__ == "__main__":
    # Script entry point: run the audit for the dataset given by --path.
    logging.basicConfig(level=logging.INFO)
    args = _parse_args()
    run_audit_missingness(args.path)

0 commit comments

Comments
 (0)