Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -442,3 +442,6 @@
-
url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz
local_name: mesh.db.gz
-
url: https://s3.amazonaws.com/bbop-sqlite/bto.db.gz
local_name: bto.db.gz
Comment on lines 443 to +447
12 changes: 8 additions & 4 deletions kg_microbe/transform_utils/bacdive/bacdive.py
Original file line number Diff line number Diff line change
Expand Up @@ -2988,20 +2988,24 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
# ontology. Loaded-ontology targets (UBERON, ENVO, ...) get
# their canonical node from the ontologies transform.
#
# NCIT and mesh stub nodes are NOT emitted here — the
# NCIT, mesh, and BTO stub nodes are NOT emitted here — the
# OntologiesStubsTransform (kg_microbe/transform_utils/
# ontologies_stubs/) writes label+synonym+xref-enriched
# stubs from the SemSQL DBs, which is strictly richer
# than the label-only fallback below. Emitting both
# here and there would produce duplicate node rows
# that the merge would have to dedupe. The PRIDE/PCO/
# GENEPIO/FAO/BTO/SNOMED prefixes stay on the inline
# path because each has 1-3 IDs in the whole repo —
# not worth a SemSQL fetch.
# GENEPIO/FAO/SNOMED prefixes stay on the inline path
# because each has 1-3 IDs in the whole repo — not
# worth a SemSQL fetch. (BTO was originally in that
# group too. It moved to the SemSQL path after the
# MIM 2026-05-18 republish added `BTO:0004304 cell
# lysate`, doubling its in-repo footprint; its stubs
# now carry label + synonyms + xrefs instead of the
# label-only row this branch would have written.)
stub_prefix = subject_id.split(":", 1)[0] if ":" in subject_id else ""
if stub_prefix in STUB_ONTOLOGY_PREFIXES and stub_prefix not in {
"NCIT",
"mesh",
"BTO",
}:
node_writer.writerow(
self._create_node_row(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,22 @@
"db_filename": "mesh.db",
"knowledge_source": "infores:mesh",
},
"BTO": {
# BRENDA Tissue Ontology. Only ~2 CURIEs in current kg-microbe
# mappings (wound fluid from BacDive isolation_source; cell lysate
# added by the MIM 2026-05-18 republish). Added here so those nodes
# carry full label + synonyms + xrefs instead of label-only stubs.
Comment on lines +71 to +75
"db_filename": "bto.db",
"knowledge_source": "infores:bto",
},
}

ONTOLOGIES_STUBS_SOURCE_NAME = "ontologies_stubs"


class OntologiesStubsTransform(Transform):

"""Emit one labelled stub node per referenced NCIT / MESH CURIE."""
"""Emit one labelled stub node per referenced NCIT / MESH / BTO CURIE."""

def __init__(
self,
Expand Down
18 changes: 9 additions & 9 deletions kg_microbe/utils/isolation_source_mapping_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,18 +90,18 @@
#
# Two stub-import paths exist for these prefixes:
#
# 1. NCIT and mesh: a SemSQL-backed enriched stub source. The
# 1. NCIT, mesh, and BTO: a SemSQL-backed enriched stub source. The
# OntologiesStubsTransform (kg_microbe/transform_utils/ontologies_stubs/)
# queries data/raw/ncit.db and data/raw/mesh.db via OAK to fetch
# rdfs:label, exact synonyms, and dbxrefs for every NCIT/mesh CURIE that
# appears anywhere under mappings/. Output:
# data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv. This is the
# preferred path — stubs carry full metadata, not just a label. The
# queries data/raw/{ncit,mesh,bto}.db via OAK to fetch rdfs:label, exact
# synonyms, and dbxrefs for every NCIT/mesh/BTO CURIE that appears
# anywhere under mappings/. Output:
Comment on lines +93 to +97
# data/transformed/ontologies_stubs/{ncit,mesh,bto}_nodes.tsv. This is
# the preferred path — stubs carry full metadata, not just a label. The
# BacDive inline emit at bacdive.py defers to this transform for these
# two prefixes (see the `not in {"NCIT", "mesh"}` branch there).
# three prefixes (see the `not in {"NCIT", "mesh", "BTO"}` branch there).
#
# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, BTO, SNOMED): each
# has 1-3 IDs in the whole repo, so the BacDive transform writes a thin
# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, SNOMED): each has
# 1-3 IDs in the whole repo, so the BacDive transform writes a thin
# label-only node row inline at edge-emit time using the object_label
# from the mapping TSV. Setting up SemSQL DBs for these would be
# overkill.
Expand Down
1 change: 1 addition & 0 deletions merge.no_metatraits.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ merged_graph:
filename:
- data/transformed/ontologies_stubs/ncit_nodes.tsv
- data/transformed/ontologies_stubs/mesh_nodes.tsv
- data/transformed/ontologies_stubs/bto_nodes.tsv
bacdive:
name: "bacdive"
input:
Expand Down
1 change: 1 addition & 0 deletions merge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ merged_graph:
filename:
- data/transformed/ontologies_stubs/ncit_nodes.tsv
- data/transformed/ontologies_stubs/mesh_nodes.tsv
- data/transformed/ontologies_stubs/bto_nodes.tsv
bacdive:
name: "bacdive"
input:
Expand Down
1 change: 1 addition & 0 deletions merge_bakta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ merged_graph:
filename:
- data/transformed/ontologies_stubs/ncit_nodes.tsv
- data/transformed/ontologies_stubs/mesh_nodes.tsv
- data/transformed/ontologies_stubs/bto_nodes.tsv
bacdive:
name: "bacdive"
input:
Expand Down
12 changes: 9 additions & 3 deletions tests/test_ontologies_stubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,15 @@ def test_stub_ontology_sources_subset_of_stub_prefixes():
assert set(STUB_ONTOLOGY_SOURCES.keys()).issubset(STUB_ONTOLOGY_PREFIXES)


def test_stub_ontology_sources_covers_ncit_and_mesh():
    """Check that STUB_ONTOLOGY_SOURCES is keyed by exactly NCIT and mesh."""
    expected_prefixes = {"NCIT", "mesh"}
    # Iterating a mapping yields its keys, so no explicit .keys() call is needed.
    registered_prefixes = set(STUB_ONTOLOGY_SOURCES)
    assert registered_prefixes == expected_prefixes
def test_stub_ontology_sources_covers_ncit_mesh_bto():
    """
    Check that STUB_ONTOLOGY_SOURCES is keyed by exactly NCIT, mesh and BTO.

    NCIT and mesh shipped with the initial commit. BTO joined them once the
    MIM 2026-05-18 republish introduced `BTO:0004304 cell lysate`, which
    doubled the BTO footprint and pushed it past the "worth a SemSQL fetch"
    threshold.
    """
    expected_prefixes = {"NCIT", "mesh", "BTO"}
    # Iterating a mapping yields its keys, so no explicit .keys() call is needed.
    registered_prefixes = set(STUB_ONTOLOGY_SOURCES)
    assert registered_prefixes == expected_prefixes
Comment on lines +91 to +99


# ---------------------------------------------------------------------------
Expand Down
Loading