Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions reports/pipeline_writers_audit.tsv
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
path writes_yaml appends_curation_history has_write_safeguard validates_before_write wired_into_just
scripts/add_community_ids.py yes yes yes yes no
scripts/add_evidence_source.py yes no yes no no
scripts/add_evidence_source.py yes yes yes yes no
scripts/apply_pmc_conversions.py yes yes yes yes no
scripts/apply_strain_designations.py yes no no no no
scripts/apply_taxonomy_corrections.py yes no no no no
scripts/backfill_metals.py yes no yes no no
scripts/clean_metals_inplace.py yes no yes no no
scripts/enhance_strain_data.py yes no no no no
scripts/clean_metals_inplace.py yes yes yes yes no
scripts/enhance_strain_data.py yes yes yes yes no
scripts/fix_network_integrity.py yes yes yes yes no
scripts/fix_reference_formats.py yes no yes no no
scripts/intelligent_snippet_fixer.py yes no no no no
scripts/intelligent_snippet_fixer.py yes yes no yes no
scripts/link_growth_media.py yes yes yes yes yes
src/communitymech/cli.py yes no yes no no
src/communitymech/network/batch_reporter.py yes no no no no
Expand Down
54 changes: 41 additions & 13 deletions scripts/add_evidence_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,21 @@
"""

import sys
import yaml
from pathlib import Path
from typing import Dict, List, Optional
import re
from typing import Dict, Optional

import yaml

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from communitymech.literature_enhanced import EnhancedLiteratureFetcher

from communitymech.curate.curation_event import record_curation_event
from communitymech.validation.write_validated import (
ValidationFailedError,
write_validated_community,
)


class EvidenceSourceAdder:
"""Add evidence_source to evidence items"""
Expand Down Expand Up @@ -269,18 +275,40 @@ def process_yaml(

# Write back if changes made
if changes:
# Backup
# Summarize the changes for the curation trail.
auto_count = sum(1 for c in changes if c.get('confidence') == 'auto')
manual_count = sum(1 for c in changes if c.get('confidence') == 'manual')
change_summary = (
f"Backfilled evidence_source on {len(changes)} evidence item(s) "
f"(auto={auto_count}, manual={manual_count})"
)
record_curation_event(
data,
curator="add_evidence_source",
action="BACKFILL_EVIDENCE_SOURCE",
changes=change_summary,
)

# Backup then write via closed-schema-gated writer. If validation
# fails, restore the backup so the loop can continue on the next
# community without leaving the disk in a torn state.
backup_path = yaml_path.with_suffix('.yaml.bak_source')
yaml_path.rename(backup_path)

# Write updated
with open(yaml_path, 'w') as f:
yaml.dump(data, f,
default_flow_style=False,
sort_keys=False,
allow_unicode=True,
width=120,
indent=2)
try:
write_validated_community(data, yaml_path)
except ValidationFailedError as exc:
backup_path.rename(yaml_path)
print(
f" ✗ validation failed for {yaml_path.name}: {exc.summary()} "
"(original restored)",
file=sys.stderr,
)
return {
'file': yaml_path.name,
'changes': [],
'count': 0,
'validation_failed': True,
}

return {
'file': yaml_path.name,
Expand Down
167 changes: 159 additions & 8 deletions scripts/enhance_strain_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,26 @@
4. ATCC catalog (type strains, genome links)
"""

import argparse
import re
import yaml
import duckdb
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from dataclasses import dataclass, field
import requests
import time
from collections import defaultdict

import duckdb
import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from communitymech.curate.curation_event import record_curation_event
from communitymech.validation.write_validated import (
ValidationFailedError,
write_validated_community,
)


# Color codes for output
class Colors:
Expand Down Expand Up @@ -343,13 +354,111 @@ def print_summary(self):
print(f" Taxa with culture collections: {self.stats['strains_with_collections']}")
print(f" Taxa with genome accessions: {self.stats['strains_with_genome']}")

def apply_strain_data_to_community(
self,
yaml_path: Path,
strain_data: Dict[str, StrainInfo],
*,
overwrite: bool = False,
) -> int:
"""Write extracted strain_designation entries back into a community YAML.

Loads ``yaml_path``, attaches a ``strain_designation`` to each
matching ``taxonomy[*].taxon_term`` (matched by ``preferred_term``),
appends a ``CurationEvent``, and writes via
:func:`write_validated_community` so closed-schema LinkML validation
gates the disk write. Returns the number of taxa updated.

Args:
yaml_path: Community YAML to update.
strain_data: Mapping ``preferred_term -> StrainInfo`` produced by
:meth:`extract_strain_from_yaml`.
overwrite: When False (default), skip taxa that already carry a
``strain_designation`` so curator-authored data is preserved.

Raises:
ValidationFailedError: re-raised by the caller for visibility;
callers in a batch loop should ``except`` and continue.
"""
with open(yaml_path) as f:
data = yaml.safe_load(f)

if 'taxonomy' not in data:
return 0

updated_taxa = []
for taxon_entry in data['taxonomy']:
taxon_term = taxon_entry.get('taxon_term') or {}
preferred_term = taxon_term.get('preferred_term', '')
if preferred_term not in strain_data:
continue

if 'strain_designation' in taxon_entry and not overwrite:
continue

snippet = self.generate_yaml_snippet(strain_data[preferred_term])
if not snippet:
continue

taxon_entry['strain_designation'] = snippet
updated_taxa.append(preferred_term)

if not updated_taxa:
return 0

record_curation_event(
data,
curator="enhance_strain_data",
action="ENHANCE_STRAIN_DATA",
changes=(
f"Added strain_designation for {len(updated_taxa)} taxa: "
f"{', '.join(updated_taxa[:5])}"
+ ("..." if len(updated_taxa) > 5 else "")
),
)

write_validated_community(data, yaml_path)
return len(updated_taxa)


def main():
parser = argparse.ArgumentParser(
description=(
"Phase 2: extract strain designations and (optionally) apply "
"them to community YAMLs"
)
)
parser.add_argument(
'--apply',
action='store_true',
help=(
"Write extracted strain_designation entries back into "
"kb/communities/*.yaml via write_validated_community(). "
"Without this flag the script only emits the report + snippets "
"files for human review (the historical default)."
),
)
parser.add_argument(
'--overwrite',
action='store_true',
help=(
"With --apply, replace existing strain_designation entries. "
"Default behavior preserves curator-authored values."
),
)
parser.add_argument(
'--kb-dir',
type=Path,
default=Path('kb/communities'),
help="Path to community YAML directory (default: kb/communities)",
)
args = parser.parse_args()

print(f"{Colors.BOLD}{Colors.CYAN}Phase 2: Data Enhancement - Strain Resolution{Colors.RESET}")
print(f"{Colors.CYAN}Strategy: Literature → kg-microbe → APIs{Colors.RESET}\n")

# Paths
kb_dir = Path('/Users/marcin/Documents/VIMSS/ontology/KG-Hub/KG-Microbe/CommunityMech/CommunityMech/kb/communities')
kb_dir = args.kb_dir
kgm_db = Path('kgm_taxonomy.duckdb')
output_dir = Path('.')

Expand Down Expand Up @@ -420,14 +529,56 @@ def main():

print(f"{Colors.GREEN}✓{Colors.RESET} Written: {snippets_path}")

# Apply strain designations to community YAMLs when --apply is set.
# Without --apply the script keeps its historical "extract + report"
# behavior and writes nothing to kb/communities/. With --apply each
# community is loaded, mutated in-memory, gets a CurationEvent appended,
# and is written via write_validated_community() so closed-schema
# validation refuses any doc that drifted into an invalid shape.
if args.apply:
print(f"\n{Colors.CYAN}Applying strain designations to community YAMLs...{Colors.RESET}")
applied_total = 0
applied_files = 0
failed_files = 0
for yaml_path, strain_data in sorted(all_strain_data.items()):
try:
count = extractor.apply_strain_data_to_community(
yaml_path,
strain_data,
overwrite=args.overwrite,
)
except ValidationFailedError as exc:
print(
f" {Colors.RED}✗{Colors.RESET} validation failed for "
f"{yaml_path.name}: {exc.summary()}",
file=sys.stderr,
)
failed_files += 1
continue

if count > 0:
applied_total += count
applied_files += 1
print(
f" {Colors.GREEN}✓{Colors.RESET} {yaml_path.name}: "
f"applied strain_designation to {count} taxa"
)

print(
f"\n{Colors.GREEN}Applied strain_designation to {applied_total} "
f"taxa across {applied_files} community file(s); "
f"{failed_files} file(s) failed validation.{Colors.RESET}"
)

# Print summary
extractor.print_summary()

print(f"\n{Colors.GREEN}{Colors.BOLD}✓ Phase 2 strain extraction complete!{Colors.RESET}")
print(f"\n{Colors.CYAN}Next steps:{Colors.RESET}")
print(f" 1. Review {report_path}")
print(f" 2. Review {snippets_path}")
print(f" 3. Apply strain designations to YAML files (Phase 2B)")
if not args.apply:
print(f" 3. Apply strain designations to YAML files: re-run with --apply")
print(f" 4. Query BacDive/ATCC APIs for additional metadata (Phase 2C)")

if __name__ == '__main__':
Expand Down
39 changes: 35 additions & 4 deletions scripts/intelligent_snippet_fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,19 @@
import shutil
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from typing import Dict, List, Optional, Tuple

import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from communitymech.curate.curation_event import record_curation_event
from communitymech.literature_enhanced import EnhancedLiteratureFetcher
from communitymech.validation.write_validated import (
ValidationFailedError,
write_validated_community,
)


class SnippetSuggestion:
Expand Down Expand Up @@ -471,9 +477,34 @@ def apply_snippet_fix_to_yaml(
print(f" ❌ Could not find evidence item with name='{organism}' and reference='{reference}' in section='{section}'")
return False

# Write back to YAML with nice formatting
with open(yaml_path, 'w', encoding='utf-8') as f:
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120)
# Record curation event for the LLM-driven snippet fix. ``skip_if_recent``
# collapses repeated per-snippet events in the same session into a single
# FIX_SNIPPETS_LLM trail entry so the history doesn't balloon when a user
# auto-approves dozens of fixes in one run.
record_curation_event(
data,
curator="intelligent_snippet_fixer",
action="FIX_SNIPPETS_LLM",
changes=(
f"Replaced evidence snippet for {organism} "
f"(reference={reference}, section={section})"
),
llm_assisted=True,
skip_if_recent=True,
)

# Write back via closed-schema-gated writer (replaces direct yaml.dump).
# The ``.yaml.bak_intelligent`` backup created at the start of
# ``interactive_fix_workflow`` is the safety net if validation refuses
# the doc — the user can restore from it manually, just like before.
try:
write_validated_community(data, yaml_path)
except ValidationFailedError as exc:
print(
f" ✗ validation failed for {yaml_path.name}: {exc.summary()}",
file=sys.stderr,
)
return False

return True

Expand Down