From 50b34f0ac1209d1f3a38f6ea07155200180ae6b6 Mon Sep 17 00:00:00 2001 From: "marcin p. joachimiak" <4625870+realmarcin@users.noreply.github.com> Date: Mon, 25 May 2026 19:08:54 -0700 Subject: [PATCH 1/2] Convert clean_metals_inplace.py to load/dump roundtrip with write_validated_community MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously this script mutated 149 community YAMLs in place via regex / line-edit surgery on the raw text, then wrote the result with path.write_text(string). The recent schema-gap audit (PR #85) flagged it as the highest-risk writer in the repo: no validation gate, no curation_history entry on changed files, and the audit detector itself was blind to it until the previous PR broadened the heuristic to catch read_text/write_text round-trips against kb/communities/. Now: load through yaml.safe_load, run the same cleaning logic on the parsed dict, append a CLEAN_METAL_FIELDS CurationEvent, and write back via write_validated_community (with backup-restore on validation failure, matching the apply_pmc_conversions / link_growth_media pattern from PR #85). The --dry-run default + --apply opt-in flag matches the existing repo convention. Cleaning semantics preserved verbatim: the same AMBIGUOUS_METAL_KEYWORDS table is consulted (TITANIUM / GOLD / PALLADIUM), and the same word-bounded evidence search runs over the document minus its metals_present block — implemented now by yaml.safe_dump'ing a shallow copy of the doc with metals_present stripped, which keyword_in_text scans identically (its regex anchors on non-alphanumeric boundaries, so YAML structural characters do not change match outcomes). Smoke-tested against a /tmp copy of the corpus: - Baseline corpus (already cleaned by a prior buggy-extractor run): reports 0/265 files would change, as expected. - With an injected false-positive (TITANIUM in a doc with no titanium evidence): correctly removed in dry-run preview and in --apply, with a schema-valid curation_history event appended and backup unlinked on success. Audit row before: yes no yes no no Audit row after: yes yes yes yes no (writes_yaml / appends_curation_history / has_write_safeguard / validates_before_write / wired_into_just — wired_into_just stays no; this script is a one-shot cleanup, not a recurring pipeline step.) Baselines remain green: scripts/validate_strict.py reports 0 errors across 265 files; pytest tests/ 136 passed / 9 skipped; ruff clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/clean_metals_inplace.py | 180 ++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 53 deletions(-) diff --git a/scripts/clean_metals_inplace.py b/scripts/clean_metals_inplace.py index 3e6e4aaf..07c30042 100644 --- a/scripts/clean_metals_inplace.py +++ b/scripts/clean_metals_inplace.py @@ -18,10 +18,7 @@ Out of scope deliberately: - `metal_notes` and `metal_relevance` are never touched. Curator-authored - values in those fields would otherwise be silently clobbered, and - rewriting a multi-line YAML scalar from a key-line regex is unsafe - (it leaves orphaned continuation lines that get re-folded into the - new value). + values in those fields would otherwise be silently clobbered. - Adding new metal entries is also out of scope. The fixed extractor may find legitimate metals the old buggy run missed (e.g., via CHEBI tier-1 matching), but adding them here would surprise curators and @@ -31,20 +28,48 @@ short enough to false-match the substring pattern that affects metals. Usage: - uv run python scripts/clean_metals_inplace.py --dry-run - uv run python scripts/clean_metals_inplace.py + uv run python scripts/clean_metals_inplace.py # dry-run (default) + uv run python scripts/clean_metals_inplace.py --apply # write changes The script self-bootstraps `src/` onto `sys.path`, so PYTHONPATH does not need to be set when invoking it directly. + +History +------- +Originally this script mutated each YAML in place via regex/line edits +on the raw text and `path.write_text(string)`. That bypassed the +write-time validation gate and never recorded a CurationEvent — both +flagged by the writers audit (PR #85). This rewrite loads the document +through `yaml.safe_load`, mutates the in-memory dict, records a +`CLEAN_METAL_FIELDS` curation event, and writes back through +`write_validated_community` (with backup-restore on validation failure, +matching the apply_pmc_conversions / link_growth_media pattern). + +The cleaning semantics are preserved verbatim: the same ambiguous-symbol +table is consulted, and the same word-bounded evidence search runs over +the document minus its `metals_present` block. The only externally +visible difference is that the file is now re-emitted as canonical YAML +(comments and incidental formatting are not preserved across the +roundtrip — this is the same cost paid by every other validated-writer +script in the repo). """ +from __future__ import annotations + import argparse import sys from pathlib import Path +import yaml + sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +from communitymech.curate.curation_event import record_curation_event from communitymech.metal_extraction import keyword_in_text +from communitymech.validation.write_validated import ( + ValidationFailedError, + write_validated_community, +) # Metals whose keyword list contains a short symbol that the old buggy # substring matcher false-fired on. For each, the "unambiguous" tokens @@ -57,37 +82,36 @@ } -def _read_metals_block(lines: list[str]) -> tuple[int, int, list[str]] | None: - """Locate the metals_present block. Returns (start_idx, end_idx, entries).""" - for i, line in enumerate(lines): - if line.rstrip("\n") == "metals_present:" or line.startswith("metals_present: "): - inline = line.rstrip("\n").removeprefix("metals_present:").strip() - if inline: - # Inline scalar form (e.g., `metals_present: []`) — nothing to clean. - return i, i + 1, [] - entries: list[str] = [] - end = i + 1 - while end < len(lines) and lines[end].startswith("- "): - entries.append(lines[end][2:].strip()) - end += 1 - return i, end, entries - return None - - -def clean_file(path: Path, dry_run: bool) -> tuple[bool, str]: - text = path.read_text() - lines = text.splitlines(keepends=True) - located = _read_metals_block(lines) - if located is None: - return False, "" - start, end, entries = located +def _evidence_text(doc: dict) -> str: + """Serialize ``doc`` minus ``metals_present`` for word-bounded keyword search. + + The original (text-based) implementation built the evidence string by + concatenating every line of the source file *except* the + ``metals_present:`` block, so that an ambiguous entry would never + self-satisfy by matching its own listing. We mirror that exclusion + here by stripping the key from a shallow copy before dumping. + + ``keyword_in_text`` anchors on non-alphanumeric boundaries, so YAML + structural characters (``:``, ``-``, newlines, quotes) do not affect + match outcomes versus the original raw-text view. + """ + scratch = {k: v for k, v in doc.items() if k != "metals_present"} + return yaml.safe_dump(scratch, allow_unicode=True, sort_keys=False) + + +def clean_metal_fields_in_doc(doc: dict) -> tuple[list[str], list[str]]: + """Remove false-positive ambiguous metals from ``doc['metals_present']``. + + Mutates ``doc`` in place. Returns ``(removed, kept)`` so callers can + log what changed. When ``metals_present`` is missing, empty, or has + no entries to drop, returns ``([], existing_list)`` and leaves the + doc untouched. + """ + entries = doc.get("metals_present") if not entries: - return False, "" + return [], list(entries or []) - # Exclude the metals_present block itself when searching for evidence — - # otherwise the `- TITANIUM` entry we're trying to validate would match - # its own keywords and never be flagged as a false positive. - evidence_text = "".join(lines[:start] + lines[end:]) + evidence = _evidence_text(doc) kept: list[str] = [] removed: list[str] = [] @@ -96,47 +120,97 @@ def clean_file(path: Path, dry_run: bool) -> tuple[bool, str]: if unambig is None: kept.append(entry) continue - if any(keyword_in_text(kw, evidence_text) for kw in unambig): + if any(keyword_in_text(kw, evidence) for kw in unambig): kept.append(entry) else: removed.append(entry) - if not removed: - return False, "" + if removed: + doc["metals_present"] = kept + return removed, kept + - # Rebuild only the metals_present block; everything else (including - # metal_relevance, metal_notes, and any comments) is preserved. - if kept: - body = "".join(f"- {e}\n" for e in kept) - new_block = "metals_present:\n" + body - else: - new_block = "metals_present: []\n" +def clean_file(path: Path, apply: bool) -> tuple[bool, str]: + """Load, clean, and (if ``apply``) write a single community YAML. + + Returns ``(changed, summary)``. ``changed`` is True iff at least one + false-positive metal was removed. ``summary`` is a human-readable + one-liner suitable for printing alongside the file name. + """ + with path.open() as f: + doc = yaml.safe_load(f) + if not isinstance(doc, dict): + return False, "" - new_lines = lines[:start] + [new_block] + lines[end:] - new_text = "".join(new_lines) + removed, kept = clean_metal_fields_in_doc(doc) + if not removed: + return False, "" summary = f" removed: {removed}; kept: {kept}" - if not dry_run: - path.write_text(new_text) + + if not apply: + return True, summary + + record_curation_event( + doc, + curator="clean_metals_inplace", + action="CLEAN_METAL_FIELDS", + changes=( + f"Removed {len(removed)} false-positive metal(s) from metals_present: " + f"{', '.join(removed)}" + ), + ) + + # Restore-on-failure pattern (matches apply_pmc_conversions / + # link_growth_media from PR #85): rename to .bak first so a failed + # validation doesn't leave the original missing on disk. + backup = path.with_suffix(".yaml.bak_metals") + path.rename(backup) + try: + write_validated_community(doc, path) + except ValidationFailedError as exc: + backup.rename(path) + print( + f" ✗ validation failed for {path.name}: {exc.summary()} " + "(original restored from backup)", + file=sys.stderr, + ) + return False, summary + backup.unlink() # success — drop the backup return True, summary def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--dry-run", action="store_true") + parser = argparse.ArgumentParser(description=__doc__) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--dry-run", + action="store_true", + help="Preview changes without writing (default).", + ) + group.add_argument( + "--apply", + action="store_true", + help="Actually write changes back to disk.", + ) args = parser.parse_args() + apply = args.apply # dry-run is the default; --apply is the explicit opt-in + community_dir = Path("kb/communities") files = sorted(community_dir.glob("*.yaml")) changed = 0 for f in files: - did_change, summary = clean_file(f, dry_run=args.dry_run) + did_change, summary = clean_file(f, apply=apply) if did_change: changed += 1 print(f"{f.name}") print(summary) - verb = "would change" if args.dry_run else "changed" + + verb = "changed" if apply else "would change" print(f"\n{verb} {changed}/{len(files)} files") + if not apply: + print("(dry-run; re-run with --apply to write)") if __name__ == "__main__": From 4e6dcd0a1cfa72a94c2698973966e92018feba2c Mon Sep 17 00:00:00 2001 From: "marcin p. joachimiak" <4625870+realmarcin@users.noreply.github.com> Date: Mon, 25 May 2026 19:13:32 -0700 Subject: [PATCH 2/2] Address Copilot review on PR #86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two findings: - The backup/restore flow only caught ValidationFailedError. If any other exception fired after path.rename(backup) (disk error, permission denied, unexpected serializer fault), the original would be left as .yaml.bak_metals with no .yaml. Catch BaseException too: restore the backup before re-raising so the corpus is never left with a missing canonical file. - _evidence_text serializes via yaml.safe_dump, which drops comments and can reflow scalars — strictly not a perfect raw-text equivalent. Document the caveat: the keywords we search for live in YAML content (description, evidence snippet, ecological_state), never in comments, so the on-disk behavior is identical for the corpus today. If a future workflow starts encoding evidence in YAML comments, the function should switch to path.read_text() with a line-strip filter. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/clean_metals_inplace.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/scripts/clean_metals_inplace.py b/scripts/clean_metals_inplace.py index 07c30042..77963ccd 100644 --- a/scripts/clean_metals_inplace.py +++ b/scripts/clean_metals_inplace.py @@ -91,9 +91,16 @@ def _evidence_text(doc: dict) -> str: self-satisfy by matching its own listing. We mirror that exclusion here by stripping the key from a shallow copy before dumping. - ``keyword_in_text`` anchors on non-alphanumeric boundaries, so YAML - structural characters (``:``, ``-``, newlines, quotes) do not affect - match outcomes versus the original raw-text view. + Caveat — this is a near-but-not-perfect equivalent of the raw-text + view: ``yaml.safe_dump`` drops YAML comments and may reflow scalars + (e.g. line widths on long descriptions). Practically, the keywords + we search for (``zinc``, ``copper``, ``calcium``, …) live in + ``description``, ``ecological_state``, evidence ``snippet`` values, + and other YAML *content* — never in YAML comments. So the on-disk + behavior is identical for the corpus today. If a future workflow + starts encoding evidence in YAML comments, this assumption breaks + and the function should switch to ``path.read_text()`` with a + ``metals_present`` line-strip filter to preserve full fidelity. """ scratch = {k: v for k, v in doc.items() if k != "metals_present"} return yaml.safe_dump(scratch, allow_unicode=True, sort_keys=False) @@ -163,7 +170,12 @@ def clean_file(path: Path, apply: bool) -> tuple[bool, str]: # Restore-on-failure pattern (matches apply_pmc_conversions / # link_growth_media from PR #85): rename to .bak first so a failed - # validation doesn't leave the original missing on disk. + # write doesn't leave the original missing on disk. Catch BOTH + # ValidationFailedError (the expected-error case, surfaced with a + # short summary) and *any other* exception (disk error, permission + # denied, unexpected serializer fault) so the bak file always gets + # rolled back rather than leaving the user with `.yaml.bak_metals` + # and no `.yaml`. backup = path.with_suffix(".yaml.bak_metals") path.rename(backup) try: @@ -176,6 +188,12 @@ def clean_file(path: Path, apply: bool) -> tuple[bool, str]: file=sys.stderr, ) return False, summary + except BaseException: + # Restore the backup before re-raising so the corpus is never + # left with a missing canonical file. + if backup.exists() and not path.exists(): + backup.rename(path) + raise backup.unlink() # success — drop the backup return True, summary