Extend stub-import transform to cover BTO

realmarcin · claude · realmarcin · commit 8f044ac8ea1c · 2026-05-19T23:09:56.000-07:00
The 2026-05-18 MIM SSSOM republish (PR #564) added a `MIM:Cell_Lysate → BTO:0004304 cell lysate` row. Combined with the pre-existing `Wound-fluid → BTO:0003114 wound fluid` row in `mappings/isolation_source_to_ontology.tsv`, the BTO footprint in kg-microbe is now 2 IDs — past the threshold where the original ontologies-stubs design (PR #565) opted to leave BTO on the label-only inline path.

Promote BTO to the same SemSQL-backed enriched-stub treatment as NCIT and mesh, so the merged KG carries full label + synonyms + xrefs on both BTO nodes.

Changes:
- kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py: add `BTO` entry to `STUB_ONTOLOGY_SOURCES` (db_filename=bto.db, knowledge_source=infores:bto). Class docstring updated.
- kg_microbe/transform_utils/bacdive/bacdive.py:2991-3007: extend the inline-emit skip-list from `{"NCIT", "mesh"}` to `{"NCIT", "mesh", "BTO"}` so BacDive defers BTO stub-node emission to the OntologiesStubsTransform (avoids duplicate node rows). Code comment updated to reflect the new partitioning.
- kg_microbe/utils/isolation_source_mapping_utils.py: STUB_ONTOLOGY_PREFIXES comment block updated to document the new partitioning (NCIT/mesh/BTO enriched via SemSQL; PRIDE/PCO/GENEPIO/FAO/SNOMED stay on the label-only inline path).
- download.yaml: add `bto.db.gz` from s3.amazonaws.com/bbop-sqlite (~30 MB, same distribution as the NCIT and mesh SemSQL DBs).
- merge.yaml / merge.no_metatraits.yaml / merge_bakta.yaml: add `data/transformed/ontologies_stubs/bto_nodes.tsv` to the ontologies_stubs source filename list in each variant.
- tests/test_ontologies_stubs.py: rename + update `test_stub_ontology_sources_covers_ncit_and_mesh` → `test_stub_ontology_sources_covers_ncit_mesh_bto`; assert the set is now exactly `{"NCIT", "mesh", "BTO"}`.

Verified:
- `collect_stub_curies(['NCIT', 'mesh', 'BTO'])` finds 73 NCIT + 95 mesh + 2 BTO CURIEs from the committed mappings.
- 13 unit tests pass; integration test still skipped pending real SemSQL DB download.
- ruff clean.

End-to-end (requires `poetry run kg download` to fetch the three DBs, ~400 MB total):

    poetry run kg transform -s ontologies_stubs
    # → data/transformed/ontologies_stubs/{ncit,mesh,bto}_nodes.tsv
    poetry run pytest tests/test_ontologies_stubs.py -v
    # integration test no longer skipped; asserts every collector-discovered
    # CURIE has a corresponding stub-node row.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/download.yaml b/download.yaml
@@ -442,3 +442,6 @@
 -
   url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz
   local_name: mesh.db.gz
+-
+  url: https://s3.amazonaws.com/bbop-sqlite/bto.db.gz
+  local_name: bto.db.gz
diff --git a/kg_microbe/transform_utils/bacdive/bacdive.py b/kg_microbe/transform_utils/bacdive/bacdive.py
@@ -2988,20 +2988,24 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
                             # ontology. Loaded-ontology targets (UBERON, ENVO, ...) get
                             # their canonical node from the ontologies transform.
                             #
-                            # NCIT and mesh stub nodes are NOT emitted here — the
+                            # NCIT, mesh, and BTO stub nodes are NOT emitted here — the
                             # OntologiesStubsTransform (kg_microbe/transform_utils/
                             # ontologies_stubs/) writes label+synonym+xref-enriched
                             # stubs from the SemSQL DBs, which is strictly richer
                             # than the label-only fallback below. Emitting both
                             # here and there would produce duplicate node rows
                             # that the merge would have to dedupe. The PRIDE/PCO/
-                            # GENEPIO/FAO/BTO/SNOMED prefixes stay on the inline
-                            # path because each has 1-3 IDs in the whole repo —
-                            # not worth a SemSQL fetch.
+                            # GENEPIO/FAO/SNOMED prefixes stay on the inline path
+                            # because each has 1-3 IDs in the whole repo — not
+                            # worth a SemSQL fetch. (BTO was originally in that
+                            # group too but moved to the SemSQL path after the
+                            # MIM 2026-05-18 republish added `BTO:0004304 cell
+                            # lysate`, doubling its in-repo footprint.)
                             stub_prefix = subject_id.split(":", 1)[0] if ":" in subject_id else ""
                             if stub_prefix in STUB_ONTOLOGY_PREFIXES and stub_prefix not in {
                                 "NCIT",
                                 "mesh",
+                                "BTO",
                             }:
                                 node_writer.writerow(
                                     self._create_node_row(
diff --git a/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py b/kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py
@@ -68,14 +68,22 @@
         "db_filename": "mesh.db",
         "knowledge_source": "infores:mesh",
     },
+    "BTO": {
+        # BRENDA Tissue Ontology. Only 2 CURIEs in current kg-microbe
+        # mappings (wound fluid from BacDive isolation_source; cell lysate
+        # added by the MIM 2026-05-18 republish). Added here so those nodes
+        # carry full label + synonyms + xrefs instead of label-only stubs.
+        "db_filename": "bto.db",
+        "knowledge_source": "infores:bto",
+    },
 }
 
 ONTOLOGIES_STUBS_SOURCE_NAME = "ontologies_stubs"
 
 
 class OntologiesStubsTransform(Transform):
 
-    """Emit one labelled stub node per referenced NCIT / MESH CURIE."""
+    """Emit one labelled stub node per referenced NCIT / MESH / BTO CURIE."""
 
     def __init__(
         self,
diff --git a/kg_microbe/utils/isolation_source_mapping_utils.py b/kg_microbe/utils/isolation_source_mapping_utils.py
@@ -90,18 +90,18 @@
 #
 # Two stub-import paths exist for these prefixes:
 #
-# 1. NCIT and mesh: a SemSQL-backed enriched stub source. The
+# 1. NCIT, mesh, and BTO: a SemSQL-backed enriched stub source. The
 #    OntologiesStubsTransform (kg_microbe/transform_utils/ontologies_stubs/)
-#    queries data/raw/ncit.db and data/raw/mesh.db via OAK to fetch
-#    rdfs:label, exact synonyms, and dbxrefs for every NCIT/mesh CURIE that
-#    appears anywhere under mappings/. Output:
-#    data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv. This is the
-#    preferred path — stubs carry full metadata, not just a label. The
+#    queries data/raw/{ncit,mesh,bto}.db via OAK to fetch rdfs:label, exact
+#    synonyms, and dbxrefs for every NCIT/mesh/BTO CURIE that appears
+#    anywhere under mappings/. Output:
+#    data/transformed/ontologies_stubs/{ncit,mesh,bto}_nodes.tsv. This is
+#    the preferred path — stubs carry full metadata, not just a label. The
 #    BacDive inline emit at bacdive.py defers to this transform for these
-#    two prefixes (see the `not in {"NCIT", "mesh"}` branch there).
+#    three prefixes (see the `not in {"NCIT", "mesh", "BTO"}` branch there).
 #
-# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, BTO, SNOMED): each
-#    has 1-3 IDs in the whole repo, so the BacDive transform writes a thin
+# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, SNOMED): each has
+#    1-3 IDs in the whole repo, so the BacDive transform writes a thin
 #    label-only node row inline at edge-emit time using the object_label
 #    from the mapping TSV. Setting up SemSQL DBs for these would be
 #    overkill.
diff --git a/merge.no_metatraits.yaml b/merge.no_metatraits.yaml
@@ -76,6 +76,7 @@ merged_graph:
         filename:
           - data/transformed/ontologies_stubs/ncit_nodes.tsv
           - data/transformed/ontologies_stubs/mesh_nodes.tsv
+          - data/transformed/ontologies_stubs/bto_nodes.tsv
     bacdive:
       name: "bacdive"
       input:
diff --git a/merge.yaml b/merge.yaml
@@ -92,6 +92,7 @@ merged_graph:
         filename:
           - data/transformed/ontologies_stubs/ncit_nodes.tsv
           - data/transformed/ontologies_stubs/mesh_nodes.tsv
+          - data/transformed/ontologies_stubs/bto_nodes.tsv
     bacdive:
       name: "bacdive"
       input:
diff --git a/merge_bakta.yaml b/merge_bakta.yaml
@@ -92,6 +92,7 @@ merged_graph:
         filename:
           - data/transformed/ontologies_stubs/ncit_nodes.tsv
           - data/transformed/ontologies_stubs/mesh_nodes.tsv
+          - data/transformed/ontologies_stubs/bto_nodes.tsv
     bacdive:
       name: "bacdive"
       input:
diff --git a/tests/test_ontologies_stubs.py b/tests/test_ontologies_stubs.py
@@ -88,9 +88,15 @@ def test_stub_ontology_sources_subset_of_stub_prefixes():
     assert set(STUB_ONTOLOGY_SOURCES.keys()).issubset(STUB_ONTOLOGY_PREFIXES)
 
 
-def test_stub_ontology_sources_covers_ncit_and_mesh():
-    """NCIT and mesh are the two prefixes that need full enrichment."""
-    assert set(STUB_ONTOLOGY_SOURCES.keys()) == {"NCIT", "mesh"}
+def test_stub_ontology_sources_covers_ncit_mesh_bto():
+    """
+    Cover the three prefixes that need full SemSQL-backed enrichment.
+
+    NCIT and mesh were added in the initial commit; BTO was added after the
+    MIM 2026-05-18 republish brought in `BTO:0004304 cell lysate`, doubling
+    the BTO footprint and crossing the "worth a SemSQL fetch" threshold.
+    """
+    assert set(STUB_ONTOLOGY_SOURCES.keys()) == {"NCIT", "mesh", "BTO"}
 
 
 # ---------------------------------------------------------------------------

Original file line number	Diff line number	Diff line change
`@@ -442,3 +442,6 @@`
`442`	`442`	`-`
`443`	`443`	`url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz`
`444`	`444`	`local_name: mesh.db.gz`
	`445`	`+-`
	`446`	`+ url: https://s3.amazonaws.com/bbop-sqlite/bto.db.gz`
	`447`	`+ local_name: bto.db.gz`