Skip to content

Commit 476f485

Browse files
realmarcinclaude
andcommitted
Drop owl:deprecated ontology terms before KG ingest
The ontologies transform loaded every class from each obograph, including those flagged owl:deprecated. METPO alone retains ~1,216 deprecated classes from an ID-scheme migration alongside its ~405 active terms, so the merged graph was inflated with obsolete orphan nodes (and, in METPO 2026-06-12, ~1,163 edges touching them). Add _drop_deprecated_terms(), which removes owl:deprecated nodes (both the meta.deprecated and basicPropertyValues encodings) and any edges touching them from the obograph JSON before the KGX load. Applied to every ontology, so retired terms from any source are filtered, not just METPO. Verified on METPO 2026-06-12: 1,621 -> 405 metpo nodes, 0 deprecated remaining. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent 4c23ed9 commit 476f485

2 files changed

Lines changed: 180 additions & 0 deletions

File tree

kg_microbe/transform_utils/ontologies/ontologies_transform.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,14 @@ def parse(self, name: str, data_file: Optional[Path], source: str) -> None:
207207
# entries so the load proceeds.
208208
self._sanitize_obograph_synonyms(Path(data_file))
209209

210+
# Drop owl:deprecated (obsolete) classes so retired terms do not enter
211+
# the KG as orphan nodes. METPO, for example, retains ~1,200 deprecated
212+
# classes from an ID-scheme migration alongside its ~405 active terms;
213+
# ingesting all of them inflates the graph with disconnected obsolete
214+
# nodes. Removal happens before the KGX load so neither the nodes nor
215+
# any edges touching them reach the output.
216+
self._drop_deprecated_terms(Path(data_file))
217+
210218
transform(
211219
inputs=[data_file],
212220
input_format="obojson",
@@ -262,6 +270,71 @@ def _sanitize_obograph_synonyms(self, json_path: Path) -> None:
262270
with open(json_path, "w", encoding="utf-8") as f:
263271
json.dump(data, f)
264272

273+
@staticmethod
def _is_deprecated_node(node: dict) -> bool:
    """
    Report whether an obograph node carries ``owl:deprecated true``.

    Two encodings are recognised:

    * ``meta.deprecated`` set to the boolean ``True``;
    * an entry in ``meta.basicPropertyValues`` whose ``pred`` ends with
      ``owl#deprecated`` and whose ``val`` is the string ``"true"``
      (compared case-insensitively).

    :param node: A single node dict taken from an obograph ``nodes`` list.
    :return: True when either encoding marks the node as deprecated.
    """
    # A missing or null ``meta`` is treated the same as an empty one.
    meta = node.get("meta") or {}
    if meta.get("deprecated") is True:
        return True

    # A null ``basicPropertyValues`` is treated as an empty list.
    property_values = meta.get("basicPropertyValues", []) or []
    return any(
        str(entry.get("pred", "")).endswith("owl#deprecated")
        and str(entry.get("val", "")).lower() == "true"
        for entry in property_values
    )
291+
292+
def _drop_deprecated_terms(self, json_path: Path) -> None:
    """
    Strip ``owl:deprecated`` (obsolete) classes from an obograph JSON file in place.

    For every graph in the file, nodes flagged as deprecated are removed
    together with each edge whose ``sub`` or ``obj`` is one of those nodes,
    so the subsequent KGX load sees neither the retired terms nor edges
    pointing at them. Graphs without deprecated nodes are left as they are,
    and the file is written back only when at least one node was removed.

    :param json_path: Path to the obograph JSON file. Anything that is not
        an existing ``.json`` file, or that cannot be read or parsed, is
        skipped silently.
    """
    if not (json_path.is_file() and json_path.suffix == ".json"):
        return

    try:
        with open(json_path, "r", encoding="utf-8") as handle:
            data = json.load(handle)
    except (OSError, json.JSONDecodeError):
        # Leave the file alone; the downstream KGX load reports the
        # problem with a more informative error.
        return

    removed_node_count = 0
    removed_edge_count = 0
    for graph in data.get("graphs", []) or []:
        nodes = graph.get("nodes", []) or []
        obsolete_ids = {
            node["id"] for node in nodes if node.get("id") and self._is_deprecated_node(node)
        }
        if not obsolete_ids:
            continue

        surviving_nodes = [node for node in nodes if node.get("id") not in obsolete_ids]
        removed_node_count += len(nodes) - len(surviving_nodes)
        graph["nodes"] = surviving_nodes

        edges = graph.get("edges", []) or []
        surviving_edges = [
            edge
            for edge in edges
            if not (edge.get("sub") in obsolete_ids or edge.get("obj") in obsolete_ids)
        ]
        removed_edge_count += len(edges) - len(surviving_edges)
        graph["edges"] = surviving_edges

    if not removed_node_count:
        return

    print(
        f" Dropped {removed_node_count} deprecated (owl:deprecated) terms "
        f"and {removed_edge_count} edges touching them from {json_path.name}"
    )
    with open(json_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle)
337+
265338
def _add_kgx_metadata_to_edges(self, edges_file_path: Path):
266339
"""
267340
Add knowledge_level and agent_type columns to ontology edge files.
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""Tests for the owl:deprecated term filter in the ontologies transform."""
2+
3+
import json
4+
from pathlib import Path
5+
from unittest import TestCase
6+
7+
from kg_microbe.transform_utils.ontologies.ontologies_transform import OntologiesTransform
8+
9+
10+
def _make_obograph():
    """Return a one-graph obograph fixture: one active node, two deprecated nodes, two edges."""
    active_id = "https://w3id.org/metpo/1000001"
    deprecated_bpv_id = "https://w3id.org/metpo/0000001"
    deprecated_meta_id = "https://w3id.org/metpo/0000002"

    active_node = {"id": active_id, "lbl": "active term", "type": "CLASS"}
    # Deprecated through a basicPropertyValues entry with the owl#deprecated predicate.
    deprecated_bpv_node = {
        "id": deprecated_bpv_id,
        "lbl": "obsolete term (basicPropertyValues)",
        "type": "CLASS",
        "meta": {
            "basicPropertyValues": [
                {"pred": "http://www.w3.org/2002/07/owl#deprecated", "val": "true"}
            ]
        },
    }
    # Deprecated through the boolean meta.deprecated flag.
    deprecated_meta_node = {
        "id": deprecated_meta_id,
        "lbl": "obsolete term (meta.deprecated)",
        "type": "CLASS",
        "meta": {"deprecated": True},
    }

    # Subject and object are both the active node, so the filter keeps this edge.
    active_only_edge = {"sub": active_id, "pred": "is_a", "obj": active_id}
    # The object is a deprecated node, so the filter drops this edge.
    edge_to_deprecated = {"sub": active_id, "pred": "is_a", "obj": deprecated_bpv_id}

    graph = {
        "id": "https://w3id.org/metpo/test.json",
        "nodes": [active_node, deprecated_bpv_node, deprecated_meta_node],
        "edges": [active_only_edge, edge_to_deprecated],
    }
    return {"graphs": [graph]}
52+
53+
54+
class TestDeprecatedFilter(TestCase):

    """Test owl:deprecated term removal from obograph JSON before KGX load."""

    def setUp(self):
        """Instantiate the transform without running __init__ side effects."""
        # The filter methods only rely on self._is_deprecated_node; bypass the
        # base Transform.__init__ (which sets up source dirs) to keep the test
        # isolated and side-effect free.
        self.transform = OntologiesTransform.__new__(OntologiesTransform)

    def tmp_json(self, data=None, indent=None):
        """
        Write an obograph dict to a temp file and return its path.

        The file is registered for removal when the test finishes, so runs
        do not leave stray files in the system temp directory.

        :param data: Obograph dict to serialise; defaults to ``_make_obograph()``.
        :param indent: Optional ``json.dump`` indent. Pass a non-default value
            when a test needs to detect whether the file was rewritten.
        :return: Path of the temp file, as a string.
        """
        import tempfile

        if data is None:
            data = _make_obograph()
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".json", delete=False, encoding="utf-8"
        ) as handle:
            json.dump(data, handle, indent=indent)
        # delete=False keeps the file after close so the transform can reopen
        # it by path; remove it explicitly once the test is done.
        self.addCleanup(Path(handle.name).unlink, missing_ok=True)
        return handle.name

    def test_is_deprecated_node_detects_both_encodings(self):
        """Both basicPropertyValues and meta.deprecated encodings are detected."""
        graph = _make_obograph()["graphs"][0]
        active, dep_bpv, dep_meta = graph["nodes"]
        self.assertFalse(self.transform._is_deprecated_node(active))
        self.assertTrue(self.transform._is_deprecated_node(dep_bpv))
        self.assertTrue(self.transform._is_deprecated_node(dep_meta))

    def test_drop_deprecated_terms_removes_nodes_and_dangling_edges(self):
        """Deprecated nodes and any edges touching them are removed in place."""
        path = Path(self.tmp_json())
        self.transform._drop_deprecated_terms(path)

        result = json.loads(path.read_text(encoding="utf-8"))
        graph = result["graphs"][0]
        node_ids = {n["id"] for n in graph["nodes"]}

        # Only the active term survives.
        self.assertEqual(node_ids, {"https://w3id.org/metpo/1000001"})
        # No node flagged deprecated remains.
        self.assertFalse(any(self.transform._is_deprecated_node(n) for n in graph["nodes"]))
        # The edge touching a deprecated node is gone; the active-only edge stays.
        self.assertEqual(len(graph["edges"]), 1)
        self.assertEqual(graph["edges"][0]["obj"], "https://w3id.org/metpo/1000001")

    def test_active_only_graph_is_untouched(self):
        """A graph with no deprecated terms is left unchanged (no rewrite)."""
        data = {"graphs": [{"nodes": [{"id": "https://w3id.org/metpo/1000001", "lbl": "x"}], "edges": []}]}
        # Indented output differs from the compact form the transform would
        # write, so a byte-for-byte match proves the file was not rewritten.
        path = Path(self.tmp_json(data, indent=2))
        before = path.read_text(encoding="utf-8")
        self.transform._drop_deprecated_terms(path)
        self.assertEqual(path.read_text(encoding="utf-8"), before)

0 commit comments

Comments
 (0)