Skip to content

Commit 02ac444

Browse files
authored
Merge pull request #570 from Knowledge-Graph-Hub/add-bto-to-stub-import
Extend stub-import transform to cover BTO
2 parents d2b03ea + 8f044ac commit 02ac444

8 files changed

Lines changed: 41 additions & 17 deletions

File tree

download.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,3 +442,6 @@
442 442
-
443 443
url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz
444 444
local_name: mesh.db.gz
445+
-
446+
url: https://s3.amazonaws.com/bbop-sqlite/bto.db.gz
447+
local_name: bto.db.gz

kg_microbe/transform_utils/bacdive/bacdive.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2988,20 +2988,24 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
2988 2988
# ontology. Loaded-ontology targets (UBERON, ENVO, ...) get
2989 2989
# their canonical node from the ontologies transform.
2990 2990
#
2991-
# NCIT and mesh stub nodes are NOT emitted here — the
2991+
# NCIT, mesh, and BTO stub nodes are NOT emitted here — the
2992 2992
# OntologiesStubsTransform (kg_microbe/transform_utils/
2993 2993
# ontologies_stubs/) writes label+synonym+xref-enriched
2994 2994
# stubs from the SemSQL DBs, which is strictly richer
2995 2995
# than the label-only fallback below. Emitting both
2996 2996
# here and there would produce duplicate node rows
2997 2997
# that the merge would have to dedupe. The PRIDE/PCO/
2998-
# GENEPIO/FAO/BTO/SNOMED prefixes stay on the inline
2999-
# path because each has 1-3 IDs in the whole repo —
3000-
# not worth a SemSQL fetch.
2998+
# GENEPIO/FAO/SNOMED prefixes stay on the inline path
2999+
# because each has 1-3 IDs in the whole repo — not
3000+
# worth a SemSQL fetch. (BTO was originally in that
3001+
# group too but moved to the SemSQL path after the
3002+
# MIM 2026-05-18 republish added `BTO:0004304 cell
3003+
# lysate`, doubling its in-repo footprint.)
3001 3004
stub_prefix = subject_id.split(":", 1)[0] if ":" in subject_id else ""
3002 3005
if stub_prefix in STUB_ONTOLOGY_PREFIXES and stub_prefix not in {
3003 3006
"NCIT",
3004 3007
"mesh",
3008+
"BTO",
3005 3009
}:
3006 3010
node_writer.writerow(
3007 3011
self._create_node_row(

kg_microbe/transform_utils/ontologies_stubs/ontologies_stubs_transform.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,22 @@
68 68
"db_filename": "mesh.db",
69 69
"knowledge_source": "infores:mesh",
70 70
},
71+
"BTO": {
72+
# BRENDA Tissue Ontology. Only ~2 CURIEs in current kg-microbe
73+
# mappings (wound fluid from BacDive isolation_source; cell lysate
74+
# added by the MIM 2026-05-18 republish). Added here so those nodes
75+
# carry full label + synonyms + xrefs instead of label-only stubs.
76+
"db_filename": "bto.db",
77+
"knowledge_source": "infores:bto",
78+
},
71 79
}
72 80

73 81
ONTOLOGIES_STUBS_SOURCE_NAME = "ontologies_stubs"
74 82

75 83

76 84
class OntologiesStubsTransform(Transform):
77 85

78-
"""Emit one labelled stub node per referenced NCIT / MESH CURIE."""
86+
"""Emit one labelled stub node per referenced NCIT / MESH / BTO CURIE."""
79 87

80 88
def __init__(
81 89
self,

kg_microbe/utils/isolation_source_mapping_utils.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,18 +90,18 @@
90 90
#
91 91
# Two stub-import paths exist for these prefixes:
92 92
#
93-
# 1. NCIT and mesh: a SemSQL-backed enriched stub source. The
93+
# 1. NCIT, mesh, and BTO: a SemSQL-backed enriched stub source. The
94 94
# OntologiesStubsTransform (kg_microbe/transform_utils/ontologies_stubs/)
95-
# queries data/raw/ncit.db and data/raw/mesh.db via OAK to fetch
96-
# rdfs:label, exact synonyms, and dbxrefs for every NCIT/mesh CURIE that
97-
# appears anywhere under mappings/. Output:
98-
# data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv. This is the
99-
# preferred path — stubs carry full metadata, not just a label. The
95+
# queries data/raw/{ncit,mesh,bto}.db via OAK to fetch rdfs:label, exact
96+
# synonyms, and dbxrefs for every NCIT/mesh/BTO CURIE that appears
97+
# anywhere under mappings/. Output:
98+
# data/transformed/ontologies_stubs/{ncit,mesh,bto}_nodes.tsv. This is
99+
# the preferred path — stubs carry full metadata, not just a label. The
100 100
# BacDive inline emit at bacdive.py defers to this transform for these
101-
# two prefixes (see the `not in {"NCIT", "mesh"}` branch there).
101+
# three prefixes (see the `not in {"NCIT", "mesh", "BTO"}` branch there).
102 102
#
103-
# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, BTO, SNOMED): each
104-
# has 1-3 IDs in the whole repo, so the BacDive transform writes a thin
103+
# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, SNOMED): each has
104+
# 1-3 IDs in the whole repo, so the BacDive transform writes a thin
105 105
# label-only node row inline at edge-emit time using the object_label
106 106
# from the mapping TSV. Setting up SemSQL DBs for these would be
107 107
# overkill.

merge.no_metatraits.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ merged_graph:
76 76
filename:
77 77
- data/transformed/ontologies_stubs/ncit_nodes.tsv
78 78
- data/transformed/ontologies_stubs/mesh_nodes.tsv
79+
- data/transformed/ontologies_stubs/bto_nodes.tsv
79 80
bacdive:
80 81
name: "bacdive"
81 82
input:

merge.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ merged_graph:
92 92
filename:
93 93
- data/transformed/ontologies_stubs/ncit_nodes.tsv
94 94
- data/transformed/ontologies_stubs/mesh_nodes.tsv
95+
- data/transformed/ontologies_stubs/bto_nodes.tsv
95 96
bacdive:
96 97
name: "bacdive"
97 98
input:

merge_bakta.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ merged_graph:
92 92
filename:
93 93
- data/transformed/ontologies_stubs/ncit_nodes.tsv
94 94
- data/transformed/ontologies_stubs/mesh_nodes.tsv
95+
- data/transformed/ontologies_stubs/bto_nodes.tsv
95 96
bacdive:
96 97
name: "bacdive"
97 98
input:

tests/test_ontologies_stubs.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,15 @@ def test_stub_ontology_sources_subset_of_stub_prefixes():
88 88
assert set(STUB_ONTOLOGY_SOURCES.keys()).issubset(STUB_ONTOLOGY_PREFIXES)
89 89

90 90

91-
def test_stub_ontology_sources_covers_ncit_and_mesh():
92-
"""NCIT and mesh are the two prefixes that need full enrichment."""
93-
assert set(STUB_ONTOLOGY_SOURCES.keys()) == {"NCIT", "mesh"}
91+
def test_stub_ontology_sources_covers_ncit_mesh_bto():
92+
"""
93+
Cover the three prefixes that need full SemSQL-backed enrichment.
94+
95+
NCIT and mesh were added in the initial commit; BTO was added after the
96+
MIM 2026-05-18 republish brought in `BTO:0004304 cell lysate`, doubling
97+
the BTO footprint and crossing the "worth a SemSQL fetch" threshold.
98+
"""
99+
assert set(STUB_ONTOLOGY_SOURCES.keys()) == {"NCIT", "mesh", "BTO"}
94 100

95 101

96 102
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)