diff --git a/reports/pipeline_writers_audit.tsv b/reports/pipeline_writers_audit.tsv index 03a81b90..44e3a37a 100644 --- a/reports/pipeline_writers_audit.tsv +++ b/reports/pipeline_writers_audit.tsv @@ -1,15 +1,15 @@ path writes_yaml appends_curation_history has_write_safeguard validates_before_write wired_into_just scripts/add_community_ids.py yes yes yes yes no -scripts/add_evidence_source.py yes no yes no no +scripts/add_evidence_source.py yes yes yes yes no scripts/apply_pmc_conversions.py yes yes yes yes no scripts/apply_strain_designations.py yes no no no no scripts/apply_taxonomy_corrections.py yes no no no no scripts/backfill_metals.py yes no yes no no -scripts/clean_metals_inplace.py yes no yes no no -scripts/enhance_strain_data.py yes no no no no +scripts/clean_metals_inplace.py yes yes yes yes no +scripts/enhance_strain_data.py yes yes yes yes no scripts/fix_network_integrity.py yes yes yes yes no scripts/fix_reference_formats.py yes no yes no no -scripts/intelligent_snippet_fixer.py yes no no no no +scripts/intelligent_snippet_fixer.py yes yes no yes no scripts/link_growth_media.py yes yes yes yes yes src/communitymech/cli.py yes no yes no no src/communitymech/network/batch_reporter.py yes no no no no diff --git a/scripts/add_evidence_source.py b/scripts/add_evidence_source.py index 99bfc2b8..0c43bc61 100644 --- a/scripts/add_evidence_source.py +++ b/scripts/add_evidence_source.py @@ -19,15 +19,21 @@ """ import sys -import yaml from pathlib import Path -from typing import Dict, List, Optional -import re +from typing import Dict, Optional + +import yaml sys.path.insert(0, str(Path(__file__).parent.parent / "src")) from communitymech.literature_enhanced import EnhancedLiteratureFetcher +from communitymech.curate.curation_event import record_curation_event +from communitymech.validation.write_validated import ( + ValidationFailedError, + write_validated_community, +) + class EvidenceSourceAdder: """Add evidence_source to evidence items""" @@ -269,18 +275,40 @@ def process_yaml( # Write back if changes made if changes: - # Backup + # Summarize the changes for the curation trail. + auto_count = sum(1 for c in changes if c.get('confidence') == 'auto') + manual_count = sum(1 for c in changes if c.get('confidence') == 'manual') + change_summary = ( + f"Backfilled evidence_source on {len(changes)} evidence item(s) " + f"(auto={auto_count}, manual={manual_count})" + ) + record_curation_event( + data, + curator="add_evidence_source", + action="BACKFILL_EVIDENCE_SOURCE", + changes=change_summary, + ) + + # Backup then write via closed-schema-gated writer. If validation + # fails, restore the backup so the loop can continue on the next + # community without leaving the disk in a torn state. backup_path = yaml_path.with_suffix('.yaml.bak_source') yaml_path.rename(backup_path) - - # Write updated - with open(yaml_path, 'w') as f: - yaml.dump(data, f, - default_flow_style=False, - sort_keys=False, - allow_unicode=True, - width=120, - indent=2) + try: + write_validated_community(data, yaml_path) + except ValidationFailedError as exc: + backup_path.rename(yaml_path) + print( + f" ✗ validation failed for {yaml_path.name}: {exc.summary()} " + "(original restored)", + file=sys.stderr, + ) + return { + 'file': yaml_path.name, + 'changes': [], + 'count': 0, + 'validation_failed': True, + } return { 'file': yaml_path.name, diff --git a/scripts/enhance_strain_data.py b/scripts/enhance_strain_data.py index 35e17ecb..83a49317 100644 --- a/scripts/enhance_strain_data.py +++ b/scripts/enhance_strain_data.py @@ -18,15 +18,26 @@ 4. ATCC catalog (type strains, genome links) """ +import argparse import re -import yaml -import duckdb +import sys +from collections import defaultdict +from dataclasses import dataclass, field from pathlib import Path from typing import Dict, List, Optional, Set, Tuple -from dataclasses import dataclass, field -import requests -import time -from collections import defaultdict + +import duckdb +import yaml + +# Add src to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from communitymech.curate.curation_event import record_curation_event +from communitymech.validation.write_validated import ( + ValidationFailedError, + write_validated_community, +) + # Color codes for output class Colors: @@ -343,13 +354,111 @@ def print_summary(self): print(f" Taxa with culture collections: {self.stats['strains_with_collections']}") print(f" Taxa with genome accessions: {self.stats['strains_with_genome']}") + def apply_strain_data_to_community( + self, + yaml_path: Path, + strain_data: Dict[str, StrainInfo], + *, + overwrite: bool = False, + ) -> int: + """Write extracted strain_designation entries back into a community YAML. + + Loads ``yaml_path``, attaches a ``strain_designation`` to each + matching ``taxonomy[*].taxon_term`` (matched by ``preferred_term``), + appends a ``CurationEvent``, and writes via + :func:`write_validated_community` so closed-schema LinkML validation + gates the disk write. Returns the number of taxa updated. + + Args: + yaml_path: Community YAML to update. + strain_data: Mapping ``preferred_term -> StrainInfo`` produced by + :meth:`extract_strain_from_yaml`. + overwrite: When False (default), skip taxa that already carry a + ``strain_designation`` so curator-authored data is preserved. + + Raises: + ValidationFailedError: re-raised by the caller for visibility; + callers in a batch loop should ``except`` and continue. + """ + with open(yaml_path) as f: + data = yaml.safe_load(f) + + if 'taxonomy' not in data: + return 0 + + updated_taxa = [] + for taxon_entry in data['taxonomy']: + taxon_term = taxon_entry.get('taxon_term') or {} + preferred_term = taxon_term.get('preferred_term', '') + if preferred_term not in strain_data: + continue + + if 'strain_designation' in taxon_entry and not overwrite: + continue + + snippet = self.generate_yaml_snippet(strain_data[preferred_term]) + if not snippet: + continue + + taxon_entry['strain_designation'] = snippet + updated_taxa.append(preferred_term) + + if not updated_taxa: + return 0 + + record_curation_event( + data, + curator="enhance_strain_data", + action="ENHANCE_STRAIN_DATA", + changes=( + f"Added strain_designation for {len(updated_taxa)} taxa: " + f"{', '.join(updated_taxa[:5])}" + + ("..." if len(updated_taxa) > 5 else "") + ), + ) + + write_validated_community(data, yaml_path) + return len(updated_taxa) + def main(): + parser = argparse.ArgumentParser( + description=( + "Phase 2: extract strain designations and (optionally) apply " + "them to community YAMLs" + ) + ) + parser.add_argument( + '--apply', + action='store_true', + help=( + "Write extracted strain_designation entries back into " + "kb/communities/*.yaml via write_validated_community(). " + "Without this flag the script only emits the report + snippets " + "files for human review (the historical default)." + ), + ) + parser.add_argument( + '--overwrite', + action='store_true', + help=( + "With --apply, replace existing strain_designation entries. " + "Default behavior preserves curator-authored values." + ), + ) + parser.add_argument( + '--kb-dir', + type=Path, + default=Path('kb/communities'), + help="Path to community YAML directory (default: kb/communities)", + ) + args = parser.parse_args() + print(f"{Colors.BOLD}{Colors.CYAN}Phase 2: Data Enhancement - Strain Resolution{Colors.RESET}") print(f"{Colors.CYAN}Strategy: Literature → kg-microbe → APIs{Colors.RESET}\n") # Paths - kb_dir = Path('/Users/marcin/Documents/VIMSS/ontology/KG-Hub/KG-Microbe/CommunityMech/CommunityMech/kb/communities') + kb_dir = args.kb_dir kgm_db = Path('kgm_taxonomy.duckdb') output_dir = Path('.') @@ -420,6 +529,47 @@ def main(): print(f"{Colors.GREEN}✓{Colors.RESET} Written: {snippets_path}") + # Apply strain designations to community YAMLs when --apply is set. + # Without --apply the script keeps its historical "extract + report" + # behavior and writes nothing to kb/communities/. With --apply each + # community is loaded, mutated in-memory, gets a CurationEvent appended, + # and is written via write_validated_community() so closed-schema + # validation refuses any doc that drifted into an invalid shape. + if args.apply: + print(f"\n{Colors.CYAN}Applying strain designations to community YAMLs...{Colors.RESET}") + applied_total = 0 + applied_files = 0 + failed_files = 0 + for yaml_path, strain_data in sorted(all_strain_data.items()): + try: + count = extractor.apply_strain_data_to_community( + yaml_path, + strain_data, + overwrite=args.overwrite, + ) + except ValidationFailedError as exc: + print( + f" {Colors.RED}✗{Colors.RESET} validation failed for " + f"{yaml_path.name}: {exc.summary()}", + file=sys.stderr, + ) + failed_files += 1 + continue + + if count > 0: + applied_total += count + applied_files += 1 + print( + f" {Colors.GREEN}✓{Colors.RESET} {yaml_path.name}: " + f"applied strain_designation to {count} taxa" + ) + + print( + f"\n{Colors.GREEN}Applied strain_designation to {applied_total} " + f"taxa across {applied_files} community file(s); " + f"{failed_files} file(s) failed validation.{Colors.RESET}" + ) + # Print summary extractor.print_summary() @@ -427,7 +577,8 @@ def main(): print(f"\n{Colors.CYAN}Next steps:{Colors.RESET}") print(f" 1. Review {report_path}") print(f" 2. Review {snippets_path}") - print(f" 3. Apply strain designations to YAML files (Phase 2B)") + if not args.apply: + print(f" 3. Apply strain designations to YAML files: re-run with --apply") print(f" 4. Query BacDive/ATCC APIs for additional metadata (Phase 2C)") if __name__ == '__main__': diff --git a/scripts/intelligent_snippet_fixer.py b/scripts/intelligent_snippet_fixer.py index 7d8a69e4..2733b2dc 100755 --- a/scripts/intelligent_snippet_fixer.py +++ b/scripts/intelligent_snippet_fixer.py @@ -17,13 +17,19 @@ import shutil import sys from pathlib import Path -from typing import Dict, List, Tuple, Optional +from typing import Dict, List, Optional, Tuple + import yaml # Add src to path for imports sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +from communitymech.curate.curation_event import record_curation_event from communitymech.literature_enhanced import EnhancedLiteratureFetcher +from communitymech.validation.write_validated import ( + ValidationFailedError, + write_validated_community, +) class SnippetSuggestion: @@ -471,9 +477,34 @@ def apply_snippet_fix_to_yaml( print(f" ❌ Could not find evidence item with name='{organism}' and reference='{reference}' in section='{section}'") return False - # Write back to YAML with nice formatting - with open(yaml_path, 'w', encoding='utf-8') as f: - yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120) + # Record curation event for the LLM-driven snippet fix. ``skip_if_recent`` + # collapses repeated per-snippet events in the same session into a single + # FIX_SNIPPETS_LLM trail entry so the history doesn't balloon when a user + # auto-approves dozens of fixes in one run. + record_curation_event( + data, + curator="intelligent_snippet_fixer", + action="FIX_SNIPPETS_LLM", + changes=( + f"Replaced evidence snippet for {organism} " + f"(reference={reference}, section={section})" + ), + llm_assisted=True, + skip_if_recent=True, + ) + + # Write back via closed-schema-gated writer (replaces direct yaml.dump). + # The ``.yaml.bak_intelligent`` backup created at the start of + # ``interactive_fix_workflow`` is the safety net if validation refuses + # the doc — the user can restore from it manually, just like before. + try: + write_validated_community(data, yaml_path) + except ValidationFailedError as exc: + print( + f" ✗ validation failed for {yaml_path.name}: {exc.summary()}", + file=sys.stderr, + ) + return False return True