diff --git a/download.yaml b/download.yaml index 991b2700..d91457f5 100644 --- a/download.yaml +++ b/download.yaml @@ -442,3 +442,6 @@ - url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz local_name: mesh.db.gz +- + url: https://s3.amazonaws.com/bbop-sqlite/bto.db.gz + local_name: bto.db.gz diff --git a/kg_microbe/transform_utils/bacdive/bacdive.py b/kg_microbe/transform_utils/bacdive/bacdive.py index 72407576..9596d9bd 100644 --- a/kg_microbe/transform_utils/bacdive/bacdive.py +++ b/kg_microbe/transform_utils/bacdive/bacdive.py @@ -2988,20 +2988,24 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu # ontology. Loaded-ontology targets (UBERON, ENVO, ...) get # their canonical node from the ontologies transform. # - # NCIT and mesh stub nodes are NOT emitted here — the + # NCIT, mesh, and BTO stub nodes are NOT emitted here — the # OntologiesStubsTransform (kg_microbe/transform_utils/ # ontologies_stubs/) writes label+synonym+xref-enriched # stubs from the SemSQL DBs, which is strictly richer # than the label-only fallback below. Emitting both # here and there would produce duplicate node rows # that the merge would have to dedupe. The PRIDE/PCO/ - # GENEPIO/FAO/BTO/SNOMED prefixes stay on the inline - # path because each has 1-3 IDs in the whole repo — - # not worth a SemSQL fetch. + # GENEPIO/FAO/SNOMED prefixes stay on the inline path + # because each has 1-3 IDs in the whole repo — not + # worth a SemSQL fetch. (BTO was originally in that + # group too but moved to the SemSQL path after the + # MIM 2026-05-18 republish added `BTO:0004304 cell + # lysate`, doubling its in-repo footprint.) 
stub_prefix = subject_id.split(":", 1)[0] if ":" in subject_id else "" if stub_prefix in STUB_ONTOLOGY_PREFIXES and stub_prefix not in { "NCIT", "mesh", + "BTO", }: node_writer.writerow( self._create_node_row( diff --git a/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py b/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py index 48f33f47..83054a0d 100644 --- a/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py +++ b/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py @@ -68,6 +68,14 @@ "db_filename": "mesh.db", "knowledge_source": "infores:mesh", }, + "BTO": { + # BRENDA Tissue Ontology. Only ~2 CURIEs in current kg-microbe + # mappings (wound fluid from BacDive isolation_source; cell lysate + # added by the MIM 2026-05-18 republish). Added here so those nodes + # carry full label + synonyms + xrefs instead of label-only stubs. + "db_filename": "bto.db", + "knowledge_source": "infores:bto", + }, } ONTOLOGIES_STUBS_SOURCE_NAME = "ontologies_stubs" @@ -75,7 +83,7 @@ class OntologiesStubsTransform(Transform): - """Emit one labelled stub node per referenced NCIT / MESH CURIE.""" + """Emit one labelled stub node per referenced NCIT / MESH / BTO CURIE.""" def __init__( self, diff --git a/kg_microbe/utils/isolation_source_mapping_utils.py b/kg_microbe/utils/isolation_source_mapping_utils.py index 8c5aa8c7..d4493f64 100644 --- a/kg_microbe/utils/isolation_source_mapping_utils.py +++ b/kg_microbe/utils/isolation_source_mapping_utils.py @@ -90,18 +90,18 @@ # # Two stub-import paths exist for these prefixes: # -# 1. NCIT and mesh: a SemSQL-backed enriched stub source. The +# 1. NCIT, mesh, and BTO: a SemSQL-backed enriched stub source. 
The # OntologiesStubsTransform (kg_microbe/transform_utils/ontologies_stubs/) -# queries data/raw/ncit.db and data/raw/mesh.db via OAK to fetch -# rdfs:label, exact synonyms, and dbxrefs for every NCIT/mesh CURIE that -# appears anywhere under mappings/. Output: -# data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv. This is the -# preferred path — stubs carry full metadata, not just a label. The +# queries data/raw/{ncit,mesh,bto}.db via OAK to fetch rdfs:label, exact +# synonyms, and dbxrefs for every NCIT/mesh/BTO CURIE that appears +# anywhere under mappings/. Output: +# data/transformed/ontologies_stubs/{ncit,mesh,bto}_nodes.tsv. This is +# the preferred path — stubs carry full metadata, not just a label. The # BacDive inline emit at bacdive.py defers to this transform for these -# two prefixes (see the `not in {"NCIT", "mesh"}` branch there). +# three prefixes (see the `not in {"NCIT", "mesh", "BTO"}` branch there). # -# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, BTO, SNOMED): each -# has 1-3 IDs in the whole repo, so the BacDive transform writes a thin +# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, SNOMED): each has +# 1-3 IDs in the whole repo, so the BacDive transform writes a thin # label-only node row inline at edge-emit time using the object_label # from the mapping TSV. Setting up SemSQL DBs for these would be # overkill. 
diff --git a/merge.no_metatraits.yaml b/merge.no_metatraits.yaml index b49599f7..ec3152a9 100644 --- a/merge.no_metatraits.yaml +++ b/merge.no_metatraits.yaml @@ -76,6 +76,7 @@ merged_graph: filename: - data/transformed/ontologies_stubs/ncit_nodes.tsv - data/transformed/ontologies_stubs/mesh_nodes.tsv + - data/transformed/ontologies_stubs/bto_nodes.tsv bacdive: name: "bacdive" input: diff --git a/merge.yaml b/merge.yaml index ec475cf5..dccc420f 100644 --- a/merge.yaml +++ b/merge.yaml @@ -92,6 +92,7 @@ merged_graph: filename: - data/transformed/ontologies_stubs/ncit_nodes.tsv - data/transformed/ontologies_stubs/mesh_nodes.tsv + - data/transformed/ontologies_stubs/bto_nodes.tsv bacdive: name: "bacdive" input: diff --git a/merge_bakta.yaml b/merge_bakta.yaml index da1020f0..fc42b229 100644 --- a/merge_bakta.yaml +++ b/merge_bakta.yaml @@ -92,6 +92,7 @@ merged_graph: filename: - data/transformed/ontologies_stubs/ncit_nodes.tsv - data/transformed/ontologies_stubs/mesh_nodes.tsv + - data/transformed/ontologies_stubs/bto_nodes.tsv bacdive: name: "bacdive" input: diff --git a/tests/test_ontologies_stubs.py b/tests/test_ontologies_stubs.py index 6bd108d3..50ea60cc 100644 --- a/tests/test_ontologies_stubs.py +++ b/tests/test_ontologies_stubs.py @@ -88,9 +88,15 @@ def test_stub_ontology_sources_subset_of_stub_prefixes(): assert set(STUB_ONTOLOGY_SOURCES.keys()).issubset(STUB_ONTOLOGY_PREFIXES) -def test_stub_ontology_sources_covers_ncit_and_mesh(): - """NCIT and mesh are the two prefixes that need full enrichment.""" - assert set(STUB_ONTOLOGY_SOURCES.keys()) == {"NCIT", "mesh"} +def test_stub_ontology_sources_covers_ncit_mesh_bto(): + """ + Cover the three prefixes that need full SemSQL-backed enrichment. + + NCIT and mesh were added in the initial commit; BTO was added after the + MIM 2026-05-18 republish brought in `BTO:0004304 cell lysate`, doubling + the BTO footprint to ~2 CURIEs, which was judged worth a SemSQL fetch. 
+ """ + assert set(STUB_ONTOLOGY_SOURCES.keys()) == {"NCIT", "mesh", "BTO"} # ---------------------------------------------------------------------------