Skip to content

Commit cb783ab

Browse files
authored
Merge pull request #575 from Knowledge-Graph-Hub/fix/metpo-drop-deprecated-terms
Drop owl:deprecated ontology terms before KG ingest
2 parents 4c23ed9 + 476f485 commit cb783ab

2 files changed

Lines changed: 180 additions & 0 deletions

File tree

kg_microbe/transform_utils/ontologies/ontologies_transform.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,14 @@ def parse(self, name: str, data_file: Optional[Path], source: str) -> None:
207207
# entries so the load proceeds.
208208
self._sanitize_obograph_synonyms(Path(data_file))
209209

210+
# Drop owl:deprecated (obsolete) classes so retired terms do not enter
211+
# the KG as orphan nodes. METPO, for example, retains ~1,200 deprecated
212+
# classes from an ID-scheme migration alongside its ~370 active terms;
213+
# ingesting all of them inflates the graph with disconnected obsolete
214+
# nodes. Removal happens before the KGX load so neither the nodes nor
215+
# any edges touching them reach the output.
216+
self._drop_deprecated_terms(Path(data_file))
217+
210218
transform(
211219
inputs=[data_file],
212220
input_format="obojson",
@@ -262,6 +270,71 @@ def _sanitize_obograph_synonyms(self, json_path: Path) -> None:
262270
with open(json_path, "w", encoding="utf-8") as f:
263271
json.dump(data, f)
264272

273+
@staticmethod
274+
def _is_deprecated_node(node: dict) -> bool:
275+
"""
276+
Return True if an obograph node is flagged ``owl:deprecated true``.
277+
278+
Obograph JSON encodes deprecation either as a top-level
279+
``meta.deprecated`` boolean or as a ``meta.basicPropertyValues`` entry
280+
whose predicate is ``owl#deprecated`` with the literal value ``"true"``.
281+
Both forms are checked.
282+
"""
283+
meta = node.get("meta") or {}
284+
if meta.get("deprecated") is True:
285+
return True
286+
for bpv in meta.get("basicPropertyValues", []) or []:
287+
pred = str(bpv.get("pred", ""))
288+
if pred.endswith("owl#deprecated") and str(bpv.get("val", "")).lower() == "true":
289+
return True
290+
return False
291+
292+
def _drop_deprecated_terms(self, json_path: Path) -> None:
293+
"""
294+
Remove ``owl:deprecated`` (obsolete) classes from an obograph JSON in place.
295+
296+
Retired terms are dropped before the KGX load so they never become KG
297+
nodes, and any edges referencing a dropped term are removed too so no
298+
dangling edges remain. Active terms are untouched. The file is only
299+
rewritten when something is actually removed.
300+
"""
301+
if not json_path.is_file() or json_path.suffix != ".json":
302+
return
303+
try:
304+
with open(json_path, "r", encoding="utf-8") as f:
305+
data = json.load(f)
306+
except (OSError, json.JSONDecodeError):
307+
return # let downstream KGX raise the more informative error
308+
309+
dropped_nodes = 0
310+
dropped_edges = 0
311+
for graph in data.get("graphs", []) or []:
312+
deprecated_ids = {
313+
node["id"]
314+
for node in graph.get("nodes", []) or []
315+
if node.get("id") and self._is_deprecated_node(node)
316+
}
317+
if not deprecated_ids:
318+
continue
319+
kept_nodes = [n for n in graph.get("nodes", []) or [] if n.get("id") not in deprecated_ids]
320+
dropped_nodes += len(graph.get("nodes", []) or []) - len(kept_nodes)
321+
graph["nodes"] = kept_nodes
322+
323+
edges = graph.get("edges", []) or []
324+
kept_edges = [
325+
e for e in edges if e.get("sub") not in deprecated_ids and e.get("obj") not in deprecated_ids
326+
]
327+
dropped_edges += len(edges) - len(kept_edges)
328+
graph["edges"] = kept_edges
329+
330+
if dropped_nodes:
331+
print(
332+
f" Dropped {dropped_nodes} deprecated (owl:deprecated) terms "
333+
f"and {dropped_edges} edges touching them from {json_path.name}"
334+
)
335+
with open(json_path, "w", encoding="utf-8") as f:
336+
json.dump(data, f)
337+
265338
def _add_kgx_metadata_to_edges(self, edges_file_path: Path):
266339
"""
267340
Add knowledge_level and agent_type columns to ontology edge files.
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""Tests for the owl:deprecated term filter in the ontologies transform."""
2+
3+
import json
4+
from pathlib import Path
5+
from unittest import TestCase
6+
7+
from kg_microbe.transform_utils.ontologies.ontologies_transform import OntologiesTransform
8+
9+
10+
def _make_obograph():
11+
"""Build a tiny obograph with one active node, two deprecated nodes, and edges."""
12+
return {
13+
"graphs": [
14+
{
15+
"id": "https://w3id.org/metpo/test.json",
16+
"nodes": [
17+
{"id": "https://w3id.org/metpo/1000001", "lbl": "active term", "type": "CLASS"},
18+
{
19+
"id": "https://w3id.org/metpo/0000001",
20+
"lbl": "obsolete term (basicPropertyValues)",
21+
"type": "CLASS",
22+
"meta": {
23+
"basicPropertyValues": [
24+
{"pred": "http://www.w3.org/2002/07/owl#deprecated", "val": "true"}
25+
]
26+
},
27+
},
28+
{
29+
"id": "https://w3id.org/metpo/0000002",
30+
"lbl": "obsolete term (meta.deprecated)",
31+
"type": "CLASS",
32+
"meta": {"deprecated": True},
33+
},
34+
],
35+
"edges": [
36+
# edge between two active-ish nodes (subject active, object active) -> kept
37+
{
38+
"sub": "https://w3id.org/metpo/1000001",
39+
"pred": "is_a",
40+
"obj": "https://w3id.org/metpo/1000001",
41+
},
42+
# edge touching a deprecated node -> dropped
43+
{
44+
"sub": "https://w3id.org/metpo/1000001",
45+
"pred": "is_a",
46+
"obj": "https://w3id.org/metpo/0000001",
47+
},
48+
],
49+
}
50+
]
51+
}
52+
53+
54+
class TestDeprecatedFilter(TestCase):

    """Test owl:deprecated term removal from obograph JSON before KGX load."""

    def setUp(self):
        """Instantiate the transform without running __init__ side effects."""
        # The filter methods only rely on self._is_deprecated_node; bypass the
        # base Transform.__init__ (which sets up source dirs) to keep the test
        # isolated and side-effect free.
        self.transform = OntologiesTransform.__new__(OntologiesTransform)

    def test_is_deprecated_node_detects_both_encodings(self):
        """Both basicPropertyValues and meta.deprecated encodings are detected."""
        graph = _make_obograph()["graphs"][0]
        active, dep_bpv, dep_meta = graph["nodes"]
        self.assertFalse(self.transform._is_deprecated_node(active))
        self.assertTrue(self.transform._is_deprecated_node(dep_bpv))
        self.assertTrue(self.transform._is_deprecated_node(dep_meta))

    def test_drop_deprecated_terms_removes_nodes_and_dangling_edges(self):
        """Deprecated nodes and any edges touching them are removed in place."""
        path = Path(self.tmp_json())
        self.transform._drop_deprecated_terms(path)

        result = json.loads(path.read_text(encoding="utf-8"))
        graph = result["graphs"][0]
        node_ids = {n["id"] for n in graph["nodes"]}

        # Only the active term survives.
        self.assertEqual(node_ids, {"https://w3id.org/metpo/1000001"})
        # No node flagged deprecated remains.
        self.assertFalse(any(self.transform._is_deprecated_node(n) for n in graph["nodes"]))
        # The edge touching a deprecated node is gone; the active-only edge stays.
        self.assertEqual(len(graph["edges"]), 1)
        self.assertEqual(graph["edges"][0]["obj"], "https://w3id.org/metpo/1000001")

    def test_active_only_graph_is_untouched(self):
        """A graph with no deprecated terms is left unchanged (no rewrite)."""
        data = {"graphs": [{"nodes": [{"id": "https://w3id.org/metpo/1000001", "lbl": "x"}], "edges": []}]}
        path = Path(self.tmp_json(data))
        before = path.read_text(encoding="utf-8")
        self.transform._drop_deprecated_terms(path)
        self.assertEqual(path.read_text(encoding="utf-8"), before)

    def tmp_json(self, data=None):
        """
        Write an obograph dict to a temporary ``.json`` file and return its path.

        The file lives in a per-call temporary directory that is removed by a
        registered cleanup once the test finishes, so no temp files leak
        between runs.

        :param data: Obograph dict to serialise; defaults to ``_make_obograph()``.
        :return: The path of the written file, as a string.
        """
        import tempfile

        if data is None:
            data = _make_obograph()
        tmp_dir = tempfile.TemporaryDirectory()
        # Registered before writing so the directory is removed even if the
        # dump below fails.
        self.addCleanup(tmp_dir.cleanup)
        json_file = Path(tmp_dir.name) / "obograph.json"
        with open(json_file, "w", encoding="utf-8") as handle:
            json.dump(data, handle)
        return str(json_file)

0 commit comments

Comments
 (0)