Merge pull request #570 from Knowledge-Graph-Hub/add-bto-to-stub-import

realmarcin · web-flow · commit 02ac4443fde2 · 2026-05-19T23:32:33.000-07:00
Extend stub-import transform to cover BTO
diff --git a/download.yaml b/download.yaml
@@ -442,3 +442,6 @@
 -
   url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz
   local_name: mesh.db.gz
+-
+  url: https://s3.amazonaws.com/bbop-sqlite/bto.db.gz
+  local_name: bto.db.gz
diff --git a/kg_microbe/transform_utils/bacdive/bacdive.py b/kg_microbe/transform_utils/bacdive/bacdive.py
@@ -2988,20 +2988,24 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
                             # ontology. Loaded-ontology targets (UBERON, ENVO, ...) get
                             # their canonical node from the ontologies transform.
                             #
-                            # NCIT and mesh stub nodes are NOT emitted here — the
+                            # NCIT, mesh, and BTO stub nodes are NOT emitted here — the
                             # OntologiesStubsTransform (kg_microbe/transform_utils/
                             # ontologies_stubs/) writes label+synonym+xref-enriched
                             # stubs from the SemSQL DBs, which is strictly richer
                             # than the label-only fallback below. Emitting both
                             # here and there would produce duplicate node rows
                             # that the merge would have to dedupe. The PRIDE/PCO/
-                            # GENEPIO/FAO/BTO/SNOMED prefixes stay on the inline
-                            # path because each has 1-3 IDs in the whole repo —
-                            # not worth a SemSQL fetch.
+                            # GENEPIO/FAO/SNOMED prefixes stay on the inline path
+                            # because each has 1-3 IDs in the whole repo — not
+                            # worth a SemSQL fetch. (BTO was originally in that
+                            # group too but moved to the SemSQL path after the
+                            # MIM 2026-05-18 republish added `BTO:0004304 cell
+                            # lysate`, doubling its in-repo footprint.)
                             stub_prefix = subject_id.split(":", 1)[0] if ":" in subject_id else ""
                             if stub_prefix in STUB_ONTOLOGY_PREFIXES and stub_prefix not in {
                                 "NCIT",
                                 "mesh",
+                                "BTO",
                             }:
                                 node_writer.writerow(
                                     self._create_node_row(
diff --git a/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py b/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py
@@ -68,14 +68,22 @@
         "db_filename": "mesh.db",
         "knowledge_source": "infores:mesh",
     },
+    "BTO": {
+        # BRENDA Tissue Ontology. Only ~2 CURIEs in current kg-microbe
+        # mappings (wound fluid from BacDive isolation_source; cell lysate
+        # added by the MIM 2026-05-18 republish). Added here so those nodes
+        # carry full label + synonyms + xrefs instead of label-only stubs.
+        "db_filename": "bto.db",
+        "knowledge_source": "infores:bto",
+    },
 }
 
 ONTOLOGIES_STUBS_SOURCE_NAME = "ontologies_stubs"
 
 
 class OntologiesStubsTransform(Transform):
 
-    """Emit one labelled stub node per referenced NCIT / MESH CURIE."""
+    """Emit one labelled stub node per referenced NCIT / MESH / BTO CURIE."""
 
     def __init__(
         self,
diff --git a/kg_microbe/utils/isolation_source_mapping_utils.py b/kg_microbe/utils/isolation_source_mapping_utils.py
@@ -90,18 +90,18 @@
 #
 # Two stub-import paths exist for these prefixes:
 #
-# 1. NCIT and mesh: a SemSQL-backed enriched stub source. The
+# 1. NCIT, mesh, and BTO: a SemSQL-backed enriched stub source. The
 #    OntologiesStubsTransform (kg_microbe/transform_utils/ontologies_stubs/)
-#    queries data/raw/ncit.db and data/raw/mesh.db via OAK to fetch
-#    rdfs:label, exact synonyms, and dbxrefs for every NCIT/mesh CURIE that
-#    appears anywhere under mappings/. Output:
-#    data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv. This is the
-#    preferred path — stubs carry full metadata, not just a label. The
+#    queries data/raw/{ncit,mesh,bto}.db via OAK to fetch rdfs:label, exact
+#    synonyms, and dbxrefs for every NCIT/mesh/BTO CURIE that appears
+#    anywhere under mappings/. Output:
+#    data/transformed/ontologies_stubs/{ncit,mesh,bto}_nodes.tsv. This is
+#    the preferred path — stubs carry full metadata, not just a label. The
 #    BacDive inline emit at bacdive.py defers to this transform for these
-#    two prefixes (see the `not in {"NCIT", "mesh"}` branch there).
+#    three prefixes (see the `not in {"NCIT", "mesh", "BTO"}` branch there).
 #
-# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, BTO, SNOMED): each
-#    has 1-3 IDs in the whole repo, so the BacDive transform writes a thin
+# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, SNOMED): each has
+#    1-3 IDs in the whole repo, so the BacDive transform writes a thin
 #    label-only node row inline at edge-emit time using the object_label
 #    from the mapping TSV. Setting up SemSQL DBs for these would be
 #    overkill.
diff --git a/merge.no_metatraits.yaml b/merge.no_metatraits.yaml
@@ -76,6 +76,7 @@ merged_graph:
         filename:
           - data/transformed/ontologies_stubs/ncit_nodes.tsv
           - data/transformed/ontologies_stubs/mesh_nodes.tsv
+          - data/transformed/ontologies_stubs/bto_nodes.tsv
     bacdive:
       name: "bacdive"
       input:
diff --git a/merge.yaml b/merge.yaml
@@ -92,6 +92,7 @@ merged_graph:
         filename:
           - data/transformed/ontologies_stubs/ncit_nodes.tsv
           - data/transformed/ontologies_stubs/mesh_nodes.tsv
+          - data/transformed/ontologies_stubs/bto_nodes.tsv
     bacdive:
       name: "bacdive"
       input:
diff --git a/merge_bakta.yaml b/merge_bakta.yaml
@@ -92,6 +92,7 @@ merged_graph:
         filename:
           - data/transformed/ontologies_stubs/ncit_nodes.tsv
           - data/transformed/ontologies_stubs/mesh_nodes.tsv
+          - data/transformed/ontologies_stubs/bto_nodes.tsv
     bacdive:
       name: "bacdive"
       input:
diff --git a/tests/test_ontologies_stubs.py b/tests/test_ontologies_stubs.py
@@ -88,9 +88,15 @@ def test_stub_ontology_sources_subset_of_stub_prefixes():
     assert set(STUB_ONTOLOGY_SOURCES.keys()).issubset(STUB_ONTOLOGY_PREFIXES)
 
 
-def test_stub_ontology_sources_covers_ncit_and_mesh():
-    """NCIT and mesh are the two prefixes that need full enrichment."""
-    assert set(STUB_ONTOLOGY_SOURCES.keys()) == {"NCIT", "mesh"}
+def test_stub_ontology_sources_covers_ncit_mesh_bto():
+    """
+    Cover the three prefixes that need full SemSQL-backed enrichment.
+
+    NCIT and mesh were added in the initial commit; BTO was added after the
+    MIM 2026-05-18 republish brought in `BTO:0004304 cell lysate`, doubling
+    the BTO footprint and crossing the "worth a SemSQL fetch" threshold.
+    """
+    assert set(STUB_ONTOLOGY_SOURCES.keys()) == {"NCIT", "mesh", "BTO"}
 
 
 # ---------------------------------------------------------------------------

Original file line number	Diff line number	Diff line change
`@@ -442,3 +442,6 @@`
`442`	`442`	` -`
`443`	`443`	`   url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz`
`444`	`444`	`   local_name: mesh.db.gz`
	`445`	`+-`
	`446`	`+  url: https://s3.amazonaws.com/bbop-sqlite/bto.db.gz`
	`447`	`+  local_name: bto.db.gz`