Skip to content

Commit d2b03ea

Browse files
authored
Merge pull request #565 from Knowledge-Graph-Hub/ncit-mesh-stub-import
Selective per-CURIE NCIT/MESH stub-import transform
2 parents 852360d + 770865d commit d2b03ea

13 files changed

Lines changed: 812 additions & 7 deletions

download.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,3 +423,22 @@
423423
-
424424
url: https://raw.githubusercontent.com/biolink/kgx/master/docs/kgx_format.md
425425
local_name: kgx-format.md
426+
427+
428+
#
429+
# **** Selective stub-import ontologies (NCIT, MESH) ****
430+
#
431+
# KG-Microbe does NOT load the full NCIT or MESH ontologies — those belong to
432+
# kg-microbe-biomedical. But the chemical-mapping consolidator and BacDive
433+
# isolation-source mapper reference ~150 NCIT/MESH IDs as canonical xrefs for
434+
# ingredients (e.g. NCIT:C29298 'Oatmeal', mesh:D011136 'Tween'). The
435+
# OntologiesStubsTransform queries these SemSQL DBs to harvest just the
436+
# referenced IDs (label + synonyms + xrefs), emitting one labelled stub node
437+
# each. The DBs themselves are never loaded into the merged KG.
438+
#
439+
-
440+
url: https://s3.amazonaws.com/bbop-sqlite/ncit.db.gz
441+
local_name: ncit.db.gz
442+
-
443+
url: https://s3.amazonaws.com/bbop-sqlite/mesh.db.gz
444+
local_name: mesh.db.gz

kg_microbe/transform.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
METATRAITS,
2121
METATRAITS_GTDB,
2222
ONTOLOGIES,
23+
ONTOLOGIES_STUBS,
2324
RHEAMAPPINGS,
2425
)
2526
from kg_microbe.transform_utils.gtdb.gtdb import GTDBTransform
@@ -32,6 +33,9 @@
3233
ONTOLOGIES_MAP,
3334
OntologiesTransform,
3435
)
36+
from kg_microbe.transform_utils.ontologies_stubs.ontologies_stubs_transform import (
37+
OntologiesStubsTransform,
38+
)
3539
from kg_microbe.transform_utils.rhea_mappings.rhea_mappings import RheaMappingsTransform
3640

3741
DATA_SOURCES = {
@@ -44,6 +48,10 @@
4448
# "ProteinAtlasTransform": ProteinAtlasTransform,
4549
# "STRINGTransform": STRINGTransform,
4650
ONTOLOGIES: OntologiesTransform,
51+
# Run ontologies_stubs after ontologies so the SemSQL DBs are present and
52+
# so the stub-node TSVs land in data/transformed/ontologies_stubs/ before
53+
# the merge step picks them up.
54+
ONTOLOGIES_STUBS: OntologiesStubsTransform,
4755
BACDIVE: BacDiveTransform,
4856
BAKTA: BaktaTransform,
4957
COG: COGTransform,

kg_microbe/transform_utils/bacdive/bacdive.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2987,8 +2987,22 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
29872987
# emit a thin node row here instead of pulling in the full
29882988
# ontology. Loaded-ontology targets (UBERON, ENVO, ...) get
29892989
# their canonical node from the ontologies transform.
2990+
#
2991+
# NCIT and mesh stub nodes are NOT emitted here — the
2992+
# OntologiesStubsTransform (kg_microbe/transform_utils/
2993+
# ontologies_stubs/) writes label+synonym+xref-enriched
2994+
# stubs from the SemSQL DBs, which is strictly richer
2995+
# than the label-only fallback below. Emitting both
2996+
# here and there would produce duplicate node rows
2997+
# that the merge would have to dedupe. The PRIDE/PCO/
2998+
# GENEPIO/FAO/BTO/SNOMED prefixes stay on the inline
2999+
# path because each has 1-3 IDs in the whole repo —
3000+
# not worth a SemSQL fetch.
29903001
stub_prefix = subject_id.split(":", 1)[0] if ":" in subject_id else ""
2991-
if stub_prefix in STUB_ONTOLOGY_PREFIXES:
3002+
if stub_prefix in STUB_ONTOLOGY_PREFIXES and stub_prefix not in {
3003+
"NCIT",
3004+
"mesh",
3005+
}:
29923006
node_writer.writerow(
29933007
self._create_node_row(
29943008
subject_id,

kg_microbe/transform_utils/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
KEGG = "kegg"
1414
RHEAMAPPINGS = "rhea_mappings"
1515
ONTOLOGIES = "ontologies"
16+
ONTOLOGIES_STUBS = "ontologies_stubs"
1617
WALLEN_ETAL = "wallen_etal"
1718
CTD = "ctd"
1819
DISBIOME = "disbiome"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""Ontologies-stubs transform package."""
2+
3+
from kg_microbe.transform_utils.ontologies_stubs.ontologies_stubs_transform import (
4+
OntologiesStubsTransform,
5+
)
6+
7+
__all__ = ["OntologiesStubsTransform"]
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
"""
2+
Ontologies-stubs transform.
3+
4+
KG-Microbe deliberately does NOT load the full NCIT or MESH ontologies — those
5+
belong to the sibling ``kg-microbe-biomedical`` pipeline. But the
6+
chemical-mapping consolidator and the BacDive isolation-source mapper reference
7+
~150 NCIT and MESH IDs as canonical xrefs for ingredients (e.g.
8+
``NCIT:C29298 'Oatmeal'``, ``mesh:D011136 'Tween'``). Without this transform
9+
those CURIEs would appear as dangling node ids in the merged KG: edges point at
10+
them but no node row carries the label.
11+
12+
This transform:
13+
14+
1. Calls :func:`~kg_microbe.utils.stub_curie_collection.collect_stub_curies` to
15+
discover every NCIT and MESH CURIE referenced anywhere under ``mappings/``.
16+
2. For each CURIE, queries the local SemSQL DB (``data/raw/ncit.db``,
17+
``data/raw/mesh.db``) via OAK to fetch its ``rdfs:label``, exact synonyms,
18+
and dbxrefs. The same pattern is used by the chemical-mapping consolidator
19+
for ChEBI in ``scripts/consolidate_chemical_mappings.py``.
20+
3. Writes one KGX node TSV per stub ontology to
21+
``data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv`` carrying
22+
``id, category, name, description, xref, provided_by, synonym, deprecated, same_as`` (``knowledge_source`` is only reported in the run log, not written as a column).
23+
No edges file — stubs are isolated nodes; edges arrive from the source
24+
transforms (BacDive, MediaDive ingredients via the chemical-mapping path,
25+
etc.).
26+
27+
Note for downstream consumers: if a KG built with this transform is ever
28+
merged with a kg-microbe-biomedical KG that loads NCIT/MESH fully, biolink
29+
merge semantics will union nodes — the stub node here is a strict subset of
30+
what the full ontology would emit (label/synonym/xref only; no edges, no
31+
deprecated flag, no parent classes), so the union will simply pick the
32+
fuller record.
33+
"""
34+
35+
from __future__ import annotations
36+
37+
import csv
38+
import gzip
39+
import shutil
40+
from pathlib import Path
41+
from typing import Dict, Iterable, List, Optional, Set
42+
43+
from kg_microbe.transform_utils.constants import (
44+
CATEGORY_COLUMN,
45+
DEPRECATED_COLUMN,
46+
DESCRIPTION_COLUMN,
47+
ID_COLUMN,
48+
NAME_COLUMN,
49+
PROVIDED_BY_COLUMN,
50+
SAME_AS_COLUMN,
51+
SYNONYM_COLUMN,
52+
XREF_COLUMN,
53+
)
54+
from kg_microbe.transform_utils.transform import Transform
55+
from kg_microbe.utils.isolation_source_mapping_utils import STUB_ONTOLOGY_CATEGORY
56+
from kg_microbe.utils.stub_curie_collection import collect_stub_curies
57+
58+
# Stub ontologies handled by this transform. Each entry maps the canonical
59+
# CURIE prefix (case-sensitive — must match how the prefix appears in
60+
# existing mapping rows) to the local SemSQL DB and the InforES knowledge
61+
# source string.
62+
STUB_ONTOLOGY_SOURCES: Dict[str, Dict[str, str]] = {
63+
"NCIT": {
64+
"db_filename": "ncit.db",
65+
"knowledge_source": "infores:ncit",
66+
},
67+
"mesh": {
68+
"db_filename": "mesh.db",
69+
"knowledge_source": "infores:mesh",
70+
},
71+
}
72+
73+
ONTOLOGIES_STUBS_SOURCE_NAME = "ontologies_stubs"
74+
75+
76+
class OntologiesStubsTransform(Transform):

    """
    Emit one labelled stub node per referenced NCIT / MESH CURIE.

    For every prefix configured in ``STUB_ONTOLOGY_SOURCES`` the transform
    writes ``<output_dir>/<prefix.lower()>_nodes.tsv``. No edges file is
    produced by this class.
    """

    def __init__(
        self,
        input_dir: Optional[Path] = None,
        output_dir: Optional[Path] = None,
    ):
        """
        Instantiate transform.

        :param input_dir: Where the SemSQL DBs live (defaults to ``data/raw/``).
        :param output_dir: Where ``ontologies_stubs/{ncit,mesh}_nodes.tsv`` are
            written (defaults to ``data/transformed/``).
        """
        super().__init__(ONTOLOGIES_STUBS_SOURCE_NAME, input_dir, output_dir)

    def run(self, data_file=None) -> None:  # noqa: D401 — base class signature
        """
        Collect stub CURIEs, fetch metadata via OAK, write per-ontology node TSVs.

        :param data_file: Unused (kept for the base-class signature). The
            transform discovers its inputs from the mapping TSVs and the
            SemSQL DBs in ``input_base_dir``.
        """
        curies_by_prefix = collect_stub_curies(list(STUB_ONTOLOGY_SOURCES)) or {}

        # Drive the loop from the configured prefixes rather than from whatever
        # keys the collector returned: a prefix with no referenced CURIEs must
        # still get its header-only TSV (see ``_write_stub_nodes``), and a key
        # that is not configured here must not raise ``KeyError``.
        for prefix, cfg in STUB_ONTOLOGY_SOURCES.items():
            self._write_stub_nodes(
                prefix=prefix,
                curies=sorted(curies_by_prefix.get(prefix, ())),
                db_path=self.input_base_dir / cfg["db_filename"],
                knowledge_source=cfg["knowledge_source"],
                output_file=self.output_dir / f"{prefix.lower()}_nodes.tsv",
            )

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------

    def _write_stub_nodes(
        self,
        prefix: str,
        curies: List[str],
        db_path: Path,
        knowledge_source: str,
        output_file: Path,
    ) -> None:
        """
        Fetch label/synonyms/xrefs per CURIE and write the node TSV.

        :param prefix: CURIE prefix being processed (used in log lines only).
        :param curies: CURIEs to emit, one node row each.
        :param db_path: Expected location of the unzipped SemSQL DB.
        :param knowledge_source: InforES string for this ontology; it is
            reported in the summary log line and not written to the TSV.
        :param output_file: Destination node TSV.
        :raises FileNotFoundError: If CURIEs were requested but neither the
            ``.db`` nor its ``.db.gz`` sibling exists.
        """
        if not curies:
            print(f" [{prefix}] no CURIEs to import; skipping {output_file.name}")
            # Write an empty file with header so the merge step doesn't fail
            # on a missing file declared in merge.yaml.
            self._write_node_file(output_file, [])
            return

        adapter = self._open_adapter(prefix, db_path)
        if adapter is None:
            raise FileNotFoundError(
                f"OAK adapter for {prefix} could not be opened (expected SemSQL DB at "
                f"{db_path}). Run `poetry run kg download` to fetch it. The stub "
                f"transform refuses to silently emit unlabelled nodes — that would "
                f"reintroduce the dangling-xref hazard this transform exists to fix."
            )

        rows: List[List[Optional[str]]] = []
        missing: List[str] = []
        for curie in curies:
            label, synonyms, xrefs = self._fetch_metadata(adapter, curie)
            if not label:
                # Last-resort fallback: use the CURIE as the name. Log it so
                # curators can chase down obsolete or missing entries upstream.
                missing.append(curie)
                label = curie
            # Cell order must match the header in ``_write_node_file``.
            rows.append(
                [
                    curie,  # id
                    STUB_ONTOLOGY_CATEGORY,  # category
                    label,  # name
                    None,  # description
                    _join_pipe(xrefs),  # xref
                    ONTOLOGIES_STUBS_SOURCE_NAME,  # provided_by
                    _join_pipe(synonyms),  # synonym
                    None,  # deprecated
                    None,  # same_as
                ]
            )

        self._write_node_file(output_file, rows)
        print(
            f" [{prefix}] wrote {len(rows)} stub nodes to {output_file.name} "
            f"(knowledge_source={knowledge_source}, missing labels: {len(missing)})"
        )
        if missing:
            print(f" [{prefix}] CURIEs with no SemSQL label (used CURIE as name): {missing}")

    def _open_adapter(self, prefix: str, db_path: Path):
        """
        Open an OAK SemSQL adapter against the local DB.

        ``download.yaml`` fetches the SemSQL DBs in gzipped form. If the
        unzipped ``.db`` is missing but a sibling ``.db.gz`` is present, it is
        decompressed first and the result is used.

        :param prefix: CURIE prefix, used in log and error messages.
        :param db_path: Expected location of the unzipped SemSQL DB.
        :return: The OAK adapter, or ``None`` when neither the ``.db`` nor the
            ``.db.gz`` file exists. Errors raised while decompressing or by
            ``get_adapter`` itself propagate to the caller.
        :raises RuntimeError: If ``oaklib`` cannot be imported.
        """
        if not db_path.is_file():
            gz_path = db_path.with_suffix(db_path.suffix + ".gz")
            if not gz_path.is_file():
                return None
            print(f" [{prefix}] decompressing {gz_path.name} -> {db_path.name}")
            self._decompress_atomically(gz_path, db_path)
        try:
            from oaklib import get_adapter
        except ImportError as exc:  # pragma: no cover — oaklib is a dep
            raise RuntimeError(
                f"oaklib import failed while opening SemSQL adapter for {prefix}: {exc}"
            ) from exc
        return get_adapter(f"sqlite:{db_path}")

    @staticmethod
    def _decompress_atomically(gz_path: Path, db_path: Path) -> None:
        """
        Gunzip ``gz_path`` to ``db_path`` without ever exposing a partial file.

        The data is streamed to a ``.part`` sibling and renamed into place only
        after the copy finished. An interrupted or failed decompression thus
        never leaves a truncated ``.db`` that a later run would accept as valid.
        """
        tmp_path = db_path.with_suffix(db_path.suffix + ".part")
        try:
            with gzip.open(gz_path, "rb") as src, tmp_path.open("wb") as dst:
                shutil.copyfileobj(src, dst)
            tmp_path.replace(db_path)
        finally:
            # After a successful rename the temp file no longer exists; on any
            # failure remove the partial output.
            if tmp_path.exists():
                tmp_path.unlink()

    def _fetch_metadata(self, adapter, curie: str):
        """
        Return ``(label, synonyms, xrefs)`` for ``curie`` via the OAK adapter.

        Each lookup is best-effort: an exception from the adapter leaves that
        piece empty rather than aborting the whole transform.

        :param adapter: OAK adapter returned by ``_open_adapter``.
        :param curie: CURIE to look up.
        :return: ``label`` as a string (``""`` when absent), then the synonyms
            and the xrefs as sorted lists of strings.
        """
        label = ""
        synonyms: Set[str] = set()
        xrefs: Set[str] = set()
        try:
            label = adapter.label(curie) or ""
        except Exception:  # noqa: S110 — obsolete CURIEs are expected to miss
            pass
        try:
            synonyms = {s for s in adapter.entity_aliases(curie) if s}
        except Exception:  # noqa: S110
            pass
        # Drop the canonical label out of the synonym set to keep them disjoint.
        synonyms.discard(label)
        try:
            metadata = adapter.entity_metadata_map(curie) or {}
        except Exception:  # noqa: S110
            metadata = {}
        # OAK returns metadata keyed by short-form predicate. dbxref entries
        # land under "oio:hasDbXref" (or "oboInOwl:hasDbXref" on older
        # adapters). Accept both.
        for predicate_key in ("oio:hasDbXref", "oboInOwl:hasDbXref"):
            for value in metadata.get(predicate_key, []) or []:
                if value:
                    xrefs.add(str(value))
        return label, sorted(synonyms), sorted(xrefs)

    def _write_node_file(self, path: Path, rows: Iterable[Iterable[Optional[str]]]) -> None:
        """
        Write ``rows`` to ``path`` using the standard Transform node header.

        Parent directories are created as needed and ``None`` cells are written
        as empty strings.

        :param path: Destination TSV file (overwritten if present).
        :param rows: Row cells in header order; may be empty, in which case
            only the header line is written.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        # Use the canonical 9-column node header from the Transform base class.
        header = [
            ID_COLUMN,
            CATEGORY_COLUMN,
            NAME_COLUMN,
            DESCRIPTION_COLUMN,
            XREF_COLUMN,
            PROVIDED_BY_COLUMN,
            SYNONYM_COLUMN,
            DEPRECATED_COLUMN,
            SAME_AS_COLUMN,
        ]
        with path.open("w", newline="", encoding="utf-8") as fh:
            writer = csv.writer(fh, delimiter="\t", lineterminator="\n")
            writer.writerow(header)
            for row in rows:
                writer.writerow(["" if cell is None else cell for cell in row])
248+
249+
250+
def _join_pipe(values: Iterable[str]) -> str:
251+
"""Pipe-join a sequence; return ``""`` when empty (matches existing TSV convention)."""
252+
items = [v for v in values if v]
253+
return "|".join(items) if items else ""

kg_microbe/utils/isolation_source_mapping_utils.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,30 @@
8686
# but that are NOT loaded by the ontologies transform (see ONTOLOGIES_MAP in
8787
# kg_microbe/transform_utils/ontologies/ontologies_transform.py). Each prefix
8888
# either has only a tiny number of distinct IDs in use, or its full load is
89-
# impractical (mesh and NCIT are huge clinical thesauri), so the BacDive
90-
# transform writes a thin node row per resolved CURIE using the object_label
91-
# from the mapping TSV. The category is biolink:OntologyClass for all stubs
92-
# because they're typically categorical terms (host body site, microbial
93-
# community, abscess, etc.) rather than specific anatomy / environmental
94-
# features whose canonical metadata would come from a loaded ontology.
89+
# impractical (mesh and NCIT are huge clinical thesauri).
90+
#
91+
# Two stub-import paths exist for these prefixes:
92+
#
93+
# 1. NCIT and mesh: a SemSQL-backed enriched stub source. The
94+
# OntologiesStubsTransform (kg_microbe/transform_utils/ontologies_stubs/)
95+
# queries data/raw/ncit.db and data/raw/mesh.db via OAK to fetch
96+
# rdfs:label, exact synonyms, and dbxrefs for every NCIT/mesh CURIE that
97+
# appears anywhere under mappings/. Output:
98+
# data/transformed/ontologies_stubs/{ncit,mesh}_nodes.tsv. This is the
99+
# preferred path — stubs carry full metadata, not just a label. The
100+
# BacDive inline emit at bacdive.py defers to this transform for these
101+
# two prefixes (see the `not in {"NCIT", "mesh"}` branch there).
102+
#
103+
# 2. The long-tail prefixes (PRIDE, PCO, GENEPIO, FAO, BTO, SNOMED): each
104+
# has 1-3 IDs in the whole repo, so the BacDive transform writes a thin
105+
# label-only node row inline at edge-emit time using the object_label
106+
# from the mapping TSV. Setting up SemSQL DBs for these would be
107+
# overkill.
108+
#
109+
# The category is biolink:OntologyClass for all stubs because they're
110+
# typically categorical terms (host body site, microbial community,
111+
# abscess, etc.) rather than specific anatomy / environmental features
112+
# whose canonical metadata would come from a loaded ontology.
95113
#
96114
# Codex adversarial review #558 found that without stubs for these prefixes
97115
# the BacDive transform was emitting edges to dangling node IDs because the

0 commit comments

Comments
 (0)