From ed733f160cda222a38f75f0e8c89db00c7ed9f1a Mon Sep 17 00:00:00 2001 From: "marcin p. joachimiak" <4625870+realmarcin@users.noreply.github.com> Date: Mon, 25 May 2026 19:14:22 -0700 Subject: [PATCH 1/2] Convert 3 more CommunityMech writers to use shared validate-and-record helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brings CommunityMech writer coverage from 5/16 to 8/16. Continues the pattern established in PR #84 and refined by PR #85 (restore-on-failure backup handling): every script that mutates a community YAML loads through yaml.safe_load, mutates the dict, records a CurationEvent via record_curation_event(), and writes back via write_validated_community() which gates on closed-schema LinkML validation. Converted scripts: - scripts/intelligent_snippet_fixer.py (LLM-driven snippet repair; llm_assisted=True; action=FIX_SNIPPETS_LLM). Uses skip_if_recent=True on the curation event so a session of auto-approved fixes collapses into a single trail entry instead of one per snippet. The existing .yaml.bak_intelligent backup created at session start by shutil.copy2 remains the user-visible safety net. - scripts/enhance_strain_data.py (strain-ID enrichment; action=ENHANCE_STRAIN_DATA). Previously the script extracted strain data but only emitted a copy-paste snippets file; this PR adds an --apply mode that writes strain_designation entries back into matching taxonomy[*] entries via write_validated_community(). Default behavior preserves the historical extract-only flow (no kb/communities writes without --apply). --overwrite controls whether to replace existing curator-authored strain_designation values. - scripts/add_evidence_source.py (evidence_source enum backfill; action=BACKFILL_EVIDENCE_SOURCE). 
Uses the backup-then-rename pattern from PR #85 — the original is moved to .yaml.bak_source before the validated write; on ValidationFailedError the backup is renamed back in place so the batch loop can continue without leaving a half-written community on disk. Each per-record loop continues on ValidationFailedError so one bad file can't kill the batch. CLI surfaces (--auto, --interactive, --dry-run, --auto-approve, --file, --apply, etc.) are preserved. After-state: scripts/audit_writers.py reports 8/16 writers gated (was 5/16). The remaining un-converted writers (apply_strain_designations, apply_taxonomy_corrections, backfill_metals, clean_metals_inplace, fix_reference_formats, plus a handful of smaller src/ writers) follow the same conversion pattern; they are left as future work to keep this PR focused. Note: scripts/add_evidence_source.py and scripts/intelligent_snippet_fixer.py import communitymech.literature_enhanced at module top-level. That import predates this PR, but the module does not currently exist in the repo, so both scripts have always failed at the import step when invoked from the CLI. This PR neither introduces nor fixes that; it is out of scope for this conversion and tracked separately. 
Baseline (unchanged): - validate_strict: 0 ERROR rows / 265 files - pytest tests/: 136 passed, 9 skipped Co-Authored-By: Claude Opus 4.7 (1M context) --- reports/pipeline_writers_audit.tsv | 6 +- scripts/add_evidence_source.py | 54 ++++++--- scripts/enhance_strain_data.py | 167 +++++++++++++++++++++++++-- scripts/intelligent_snippet_fixer.py | 39 ++++++- 4 files changed, 238 insertions(+), 28 deletions(-) diff --git a/reports/pipeline_writers_audit.tsv b/reports/pipeline_writers_audit.tsv index 03a81b905..5a249b1bd 100644 --- a/reports/pipeline_writers_audit.tsv +++ b/reports/pipeline_writers_audit.tsv @@ -1,15 +1,15 @@ path writes_yaml appends_curation_history has_write_safeguard validates_before_write wired_into_just scripts/add_community_ids.py yes yes yes yes no -scripts/add_evidence_source.py yes no yes no no +scripts/add_evidence_source.py yes yes yes yes no scripts/apply_pmc_conversions.py yes yes yes yes no scripts/apply_strain_designations.py yes no no no no scripts/apply_taxonomy_corrections.py yes no no no no scripts/backfill_metals.py yes no yes no no scripts/clean_metals_inplace.py yes no yes no no -scripts/enhance_strain_data.py yes no no no no +scripts/enhance_strain_data.py yes yes yes yes no scripts/fix_network_integrity.py yes yes yes yes no scripts/fix_reference_formats.py yes no yes no no -scripts/intelligent_snippet_fixer.py yes no no no no +scripts/intelligent_snippet_fixer.py yes yes no yes no scripts/link_growth_media.py yes yes yes yes yes src/communitymech/cli.py yes no yes no no src/communitymech/network/batch_reporter.py yes no no no no diff --git a/scripts/add_evidence_source.py b/scripts/add_evidence_source.py index 99bfc2b8f..0c43bc618 100644 --- a/scripts/add_evidence_source.py +++ b/scripts/add_evidence_source.py @@ -19,15 +19,21 @@ """ import sys -import yaml from pathlib import Path -from typing import Dict, List, Optional -import re +from typing import Dict, Optional + +import yaml sys.path.insert(0, 
str(Path(__file__).parent.parent / "src")) from communitymech.literature_enhanced import EnhancedLiteratureFetcher +from communitymech.curate.curation_event import record_curation_event +from communitymech.validation.write_validated import ( + ValidationFailedError, + write_validated_community, +) + class EvidenceSourceAdder: """Add evidence_source to evidence items""" @@ -269,18 +275,40 @@ def process_yaml( # Write back if changes made if changes: - # Backup + # Summarize the changes for the curation trail. + auto_count = sum(1 for c in changes if c.get('confidence') == 'auto') + manual_count = sum(1 for c in changes if c.get('confidence') == 'manual') + change_summary = ( + f"Backfilled evidence_source on {len(changes)} evidence item(s) " + f"(auto={auto_count}, manual={manual_count})" + ) + record_curation_event( + data, + curator="add_evidence_source", + action="BACKFILL_EVIDENCE_SOURCE", + changes=change_summary, + ) + + # Backup then write via closed-schema-gated writer. If validation + # fails, restore the backup so the loop can continue on the next + # community without leaving the disk in a torn state. backup_path = yaml_path.with_suffix('.yaml.bak_source') yaml_path.rename(backup_path) - - # Write updated - with open(yaml_path, 'w') as f: - yaml.dump(data, f, - default_flow_style=False, - sort_keys=False, - allow_unicode=True, - width=120, - indent=2) + try: + write_validated_community(data, yaml_path) + except ValidationFailedError as exc: + backup_path.rename(yaml_path) + print( + f" ✗ validation failed for {yaml_path.name}: {exc.summary()} " + "(original restored)", + file=sys.stderr, + ) + return { + 'file': yaml_path.name, + 'changes': [], + 'count': 0, + 'validation_failed': True, + } return { 'file': yaml_path.name, diff --git a/scripts/enhance_strain_data.py b/scripts/enhance_strain_data.py index 35e17ecb6..83a49317e 100644 --- a/scripts/enhance_strain_data.py +++ b/scripts/enhance_strain_data.py @@ -18,15 +18,26 @@ 4. 
ATCC catalog (type strains, genome links) """ +import argparse import re -import yaml -import duckdb +import sys +from collections import defaultdict +from dataclasses import dataclass, field from pathlib import Path from typing import Dict, List, Optional, Set, Tuple -from dataclasses import dataclass, field -import requests -import time -from collections import defaultdict + +import duckdb +import yaml + +# Add src to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from communitymech.curate.curation_event import record_curation_event +from communitymech.validation.write_validated import ( + ValidationFailedError, + write_validated_community, +) + # Color codes for output class Colors: @@ -343,13 +354,111 @@ def print_summary(self): print(f" Taxa with culture collections: {self.stats['strains_with_collections']}") print(f" Taxa with genome accessions: {self.stats['strains_with_genome']}") + def apply_strain_data_to_community( + self, + yaml_path: Path, + strain_data: Dict[str, StrainInfo], + *, + overwrite: bool = False, + ) -> int: + """Write extracted strain_designation entries back into a community YAML. + + Loads ``yaml_path``, attaches a ``strain_designation`` to each + matching ``taxonomy[*].taxon_term`` (matched by ``preferred_term``), + appends a ``CurationEvent``, and writes via + :func:`write_validated_community` so closed-schema LinkML validation + gates the disk write. Returns the number of taxa updated. + + Args: + yaml_path: Community YAML to update. + strain_data: Mapping ``preferred_term -> StrainInfo`` produced by + :meth:`extract_strain_from_yaml`. + overwrite: When False (default), skip taxa that already carry a + ``strain_designation`` so curator-authored data is preserved. + + Raises: + ValidationFailedError: re-raised by the caller for visibility; + callers in a batch loop should ``except`` and continue. 
+ """ + with open(yaml_path) as f: + data = yaml.safe_load(f) + + if 'taxonomy' not in data: + return 0 + + updated_taxa = [] + for taxon_entry in data['taxonomy']: + taxon_term = taxon_entry.get('taxon_term') or {} + preferred_term = taxon_term.get('preferred_term', '') + if preferred_term not in strain_data: + continue + + if 'strain_designation' in taxon_entry and not overwrite: + continue + + snippet = self.generate_yaml_snippet(strain_data[preferred_term]) + if not snippet: + continue + + taxon_entry['strain_designation'] = snippet + updated_taxa.append(preferred_term) + + if not updated_taxa: + return 0 + + record_curation_event( + data, + curator="enhance_strain_data", + action="ENHANCE_STRAIN_DATA", + changes=( + f"Added strain_designation for {len(updated_taxa)} taxa: " + f"{', '.join(updated_taxa[:5])}" + + ("..." if len(updated_taxa) > 5 else "") + ), + ) + + write_validated_community(data, yaml_path) + return len(updated_taxa) + def main(): + parser = argparse.ArgumentParser( + description=( + "Phase 2: extract strain designations and (optionally) apply " + "them to community YAMLs" + ) + ) + parser.add_argument( + '--apply', + action='store_true', + help=( + "Write extracted strain_designation entries back into " + "kb/communities/*.yaml via write_validated_community(). " + "Without this flag the script only emits the report + snippets " + "files for human review (the historical default)." + ), + ) + parser.add_argument( + '--overwrite', + action='store_true', + help=( + "With --apply, replace existing strain_designation entries. " + "Default behavior preserves curator-authored values." 
+ ), + ) + parser.add_argument( + '--kb-dir', + type=Path, + default=Path('kb/communities'), + help="Path to community YAML directory (default: kb/communities)", + ) + args = parser.parse_args() + print(f"{Colors.BOLD}{Colors.CYAN}Phase 2: Data Enhancement - Strain Resolution{Colors.RESET}") print(f"{Colors.CYAN}Strategy: Literature → kg-microbe → APIs{Colors.RESET}\n") # Paths - kb_dir = Path('/Users/marcin/Documents/VIMSS/ontology/KG-Hub/KG-Microbe/CommunityMech/CommunityMech/kb/communities') + kb_dir = args.kb_dir kgm_db = Path('kgm_taxonomy.duckdb') output_dir = Path('.') @@ -420,6 +529,47 @@ def main(): print(f"{Colors.GREEN}✓{Colors.RESET} Written: {snippets_path}") + # Apply strain designations to community YAMLs when --apply is set. + # Without --apply the script keeps its historical "extract + report" + # behavior and writes nothing to kb/communities/. With --apply each + # community is loaded, mutated in-memory, gets a CurationEvent appended, + # and is written via write_validated_community() so closed-schema + # validation refuses any doc that drifted into an invalid shape. 
+ if args.apply: + print(f"\n{Colors.CYAN}Applying strain designations to community YAMLs...{Colors.RESET}") + applied_total = 0 + applied_files = 0 + failed_files = 0 + for yaml_path, strain_data in sorted(all_strain_data.items()): + try: + count = extractor.apply_strain_data_to_community( + yaml_path, + strain_data, + overwrite=args.overwrite, + ) + except ValidationFailedError as exc: + print( + f" {Colors.RED}✗{Colors.RESET} validation failed for " + f"{yaml_path.name}: {exc.summary()}", + file=sys.stderr, + ) + failed_files += 1 + continue + + if count > 0: + applied_total += count + applied_files += 1 + print( + f" {Colors.GREEN}✓{Colors.RESET} {yaml_path.name}: " + f"applied strain_designation to {count} taxa" + ) + + print( + f"\n{Colors.GREEN}Applied strain_designation to {applied_total} " + f"taxa across {applied_files} community file(s); " + f"{failed_files} file(s) failed validation.{Colors.RESET}" + ) + # Print summary extractor.print_summary() @@ -427,7 +577,8 @@ def main(): print(f"\n{Colors.CYAN}Next steps:{Colors.RESET}") print(f" 1. Review {report_path}") print(f" 2. Review {snippets_path}") - print(f" 3. Apply strain designations to YAML files (Phase 2B)") + if not args.apply: + print(f" 3. Apply strain designations to YAML files: re-run with --apply") print(f" 4. 
Query BacDive/ATCC APIs for additional metadata (Phase 2C)") if __name__ == '__main__': diff --git a/scripts/intelligent_snippet_fixer.py b/scripts/intelligent_snippet_fixer.py index 7d8a69e44..2733b2dc2 100755 --- a/scripts/intelligent_snippet_fixer.py +++ b/scripts/intelligent_snippet_fixer.py @@ -17,13 +17,19 @@ import shutil import sys from pathlib import Path -from typing import Dict, List, Tuple, Optional +from typing import Dict, List, Optional, Tuple + import yaml # Add src to path for imports sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +from communitymech.curate.curation_event import record_curation_event from communitymech.literature_enhanced import EnhancedLiteratureFetcher +from communitymech.validation.write_validated import ( + ValidationFailedError, + write_validated_community, +) class SnippetSuggestion: @@ -471,9 +477,34 @@ def apply_snippet_fix_to_yaml( print(f" ❌ Could not find evidence item with name='{organism}' and reference='{reference}' in section='{section}'") return False - # Write back to YAML with nice formatting - with open(yaml_path, 'w', encoding='utf-8') as f: - yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120) + # Record curation event for the LLM-driven snippet fix. ``skip_if_recent`` + # collapses repeated per-snippet events in the same session into a single + # FIX_SNIPPETS_LLM trail entry so the history doesn't balloon when a user + # auto-approves dozens of fixes in one run. + record_curation_event( + data, + curator="intelligent_snippet_fixer", + action="FIX_SNIPPETS_LLM", + changes=( + f"Replaced evidence snippet for {organism} " + f"(reference={reference}, section={section})" + ), + llm_assisted=True, + skip_if_recent=True, + ) + + # Write back via closed-schema-gated writer (replaces direct yaml.dump). 
+ # The ``.yaml.bak_intelligent`` backup created at the start of + # ``interactive_fix_workflow`` is the safety net if validation refuses + # the doc — the user can restore from it manually, just like before. + try: + write_validated_community(data, yaml_path) + except ValidationFailedError as exc: + print( + f" ✗ validation failed for {yaml_path.name}: {exc.summary()}", + file=sys.stderr, + ) + return False return True From a9f062a241165b01761bd3e1898c5bdcf0fcc3bb Mon Sep 17 00:00:00 2001 From: "marcin p. joachimiak" <4625870+realmarcin@users.noreply.github.com> Date: Mon, 25 May 2026 19:15:56 -0700 Subject: [PATCH 2/2] Refresh audit TSV after rebase onto #86 PR #86 (Convert clean_metals_inplace.py) just merged into main. After rebasing, scripts/clean_metals_inplace.py is now gated, so the appends_curation_history / validates_before_write columns for it flip to yes. Re-running scripts/audit_writers.py produces a 1-row delta; commit it so the report reflects the actual post-rebase state. Combined post-merge: 9/16 appends_curation_history, 10/16 validates_before_write (was 5/16 and 6/16 respectively at the start of this PR series). Co-Authored-By: Claude Opus 4.7 (1M context) --- reports/pipeline_writers_audit.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/pipeline_writers_audit.tsv b/reports/pipeline_writers_audit.tsv index 5a249b1bd..44e3a37aa 100644 --- a/reports/pipeline_writers_audit.tsv +++ b/reports/pipeline_writers_audit.tsv @@ -5,7 +5,7 @@ scripts/apply_pmc_conversions.py yes yes yes yes no scripts/apply_strain_designations.py yes no no no no scripts/apply_taxonomy_corrections.py yes no no no no scripts/backfill_metals.py yes no yes no no -scripts/clean_metals_inplace.py yes no yes no no +scripts/clean_metals_inplace.py yes yes yes yes no scripts/enhance_strain_data.py yes yes yes yes no scripts/fix_network_integrity.py yes yes yes yes no scripts/fix_reference_formats.py yes no yes no no