Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 96 additions & 15 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,31 @@
omdb_key = st.sidebar.text_input("OMDb API Key (OMDB_API_KEY)", type="password", value=os.getenv("OMDB_API_KEY", ""))
tmdb_key = st.sidebar.text_input("TMDb API Key (TMDB_API_KEY)", type="password", value=os.getenv("TMDB_API_KEY", ""))

st.sidebar.divider()
st.sidebar.header("🧩 ER / MVI Options")
st.sidebar.caption("ER=Entity Resolution (LLM-driven); MVI=Missing Value Imputation (API-driven)")
enable_er = st.sidebar.checkbox("Enable ER (Entity Resolution - deduplicate entities)", value=True)
enable_mvi = st.sidebar.checkbox("Enable MVI (Missing Value Imputation - Wikipedia/TMDb/OMDb/Kaggle)", value=True)
er_sample_rows = st.sidebar.slider("ER/MVI Sample Rows (to limit external requests)", min_value=50, max_value=1000,
value=200,
step=50)
st.sidebar.caption("👉 Without API keys, only Wikipedia (no key required) will be used.")
# ---------------------- Sidebar: Cleaning & Validation ----------------------
st.sidebar.header("🧽 Cleaning & Validation")
perform_cleaning = st.sidebar.checkbox("Enable data cleaning & external validation", value=False)
cleaning_mode = st.sidebar.selectbox(
"Cleaning mode",
options=["comprehensive", "llm", "type", "minimal"],
index=0,
disabled=not perform_cleaning
)
validation_source = st.sidebar.selectbox(
"Validation source",
options=["wikipedia", "tmdb", "omdb", "comprehensive"],
index=0,
disabled=not perform_cleaning
)
validation_cols_text = st.sidebar.text_input(
"Columns to validate (comma separated, optional)",
value="",
disabled=not perform_cleaning
)

validation_columns = None
if perform_cleaning:
cols = [c.strip() for c in validation_cols_text.split(",") if c.strip()]
validation_columns = cols if cols else None

# ---------------------- Queries Input ----------------------
st.subheader("📝 Natural Language Queries (one per line)")
Expand Down Expand Up @@ -119,8 +135,26 @@ def run_one_query(idx: int, question: str) -> dict:
"max_concurrency": int(max_conc),
"api_base": api_base,
"api_key": api_key or os.getenv("GPTNB_API_KEY", ""),
"er_mvi_sample_rows": int(er_sample_rows),
}
if perform_cleaning:
cfg.update({
"enable_cleaning": True,
"cleaning_mode": cleaning_mode,
"enable_validation": True,
"validation_source": validation_source,
"validation_columns": validation_columns,
"validation_api_keys": {
k: v for k, v in {
"tmdb": tmdb_key,
"omdb": omdb_key,
}.items() if v
}
})
else:
cfg.update({
"enable_cleaning": False,
"enable_validation": False,
})
cfg_path.write_text(json.dumps(cfg, ensure_ascii=False, indent=2), encoding="utf-8")

# Build subprocess environment: LLM + external enhancement
Expand All @@ -132,11 +166,6 @@ def run_one_query(idx: int, question: str) -> dict:
if omdb_key: env_for_child["OMDB_API_KEY"] = omdb_key
if tmdb_key: env_for_child["TMDB_API_KEY"] = tmdb_key

# Separate ER and MVI switches
env_for_child["ER_ENABLED"] = "1" if enable_er else "0"
env_for_child["MVI_ENABLED"] = "1" if enable_mvi else "0"
env_for_child["ER_MVI_SAMPLE_ROWS"] = str(er_sample_rows)

# Run main pipeline
t0 = time.time()
proc = subprocess.run(
Expand Down Expand Up @@ -168,6 +197,10 @@ def run_one_query(idx: int, question: str) -> dict:
"validation_score": None,
"validation_stdout": "",
"validation_stderr": "",
"cleaning_report": "",
"cleaning_summary": {},
"validation_provider_report": "",
"validation_summary": {},
}

# Read entity_report from meta (if exists)
Expand All @@ -181,6 +214,24 @@ def run_one_query(idx: int, question: str) -> dict:
erp_path = out_csv.with_name(out_csv.stem + "_entity_report.csv")
if erp_path.exists():
summary["entity_report"] = str(erp_path)
clean_report = meta.get("cleaning_report", "")
if clean_report:
clean_path = Path(clean_report)
if not clean_path.is_absolute() and out_csv_exists:
clean_path = out_csv.with_name(out_csv.stem + "_cleaning_report.json")
if clean_path.exists():
summary["cleaning_report"] = str(clean_path)
validation_provider_rep = meta.get("validation_report", "")
if validation_provider_rep:
vpr_path = Path(validation_provider_rep)
if not vpr_path.is_absolute() and out_csv_exists:
vpr_path = out_csv.with_name(out_csv.stem + "_validation_provider_report.json")
if vpr_path.exists():
summary["validation_provider_report"] = str(vpr_path)
if isinstance(meta.get("cleaning"), dict):
summary["cleaning_summary"] = meta["cleaning"]
if isinstance(meta.get("validation"), dict):
summary["validation_summary"] = meta["validation"]
except Exception:
pass

Expand Down Expand Up @@ -296,6 +347,36 @@ def run_one_query(idx: int, question: str) -> dict:
mime="text/csv"
)

# Cleaning stage report (if enabled)
if res.get("cleaning_report") or res.get("cleaning_summary"):
with st.expander("🧽 Cleaning Summary", expanded=False):
if res.get("cleaning_summary"):
st.json(res["cleaning_summary"])
if res.get("cleaning_report"):
clean_path = Path(res["cleaning_report"])
if clean_path.exists():
st.download_button(
"⬇️ Download Cleaning Report (JSON)",
data=clean_path.read_bytes(),
file_name=clean_path.name,
mime="application/json"
)

# External validation provider report (post-cleaning)
if res.get("validation_provider_report") or res.get("validation_summary"):
with st.expander("✅ External Validation Summary", expanded=False):
if res.get("validation_summary"):
st.json(res["validation_summary"])
if res.get("validation_provider_report"):
vpr = Path(res["validation_provider_report"])
if vpr.exists():
st.download_button(
"⬇️ Download External Validation Report (JSON)",
data=vpr.read_bytes(),
file_name=vpr.name,
mime="application/json"
)

# ═══════════════════════════════════════════════════════════
# 🎓 VLDB Validation Report (NEW)
# ═══════════════════════════════════════════════════════════
Expand Down
2 changes: 1 addition & 1 deletion cleaning_providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,4 @@ async def clean_data(
# 执行清洗
cleaned_df, report = await cleaner.clean(df, primary_keys, rules)

return cleaned_df, report
return cleaned_df, report
Loading