Skip to content

Commit 7108f5b

Browse files
committed
Support ontology id filtering
1 parent bd33e35 commit 7108f5b

2 files changed

Lines changed: 113 additions & 0 deletions

File tree

opensyndrome/filter.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,28 @@ def _build_text_expr(
229229
return pl.any_horizontal(exprs)
230230

231231

232+
def _build_ontology_id_expr(
    criterion: dict, columns: list[ColumnSpec], concept: str
) -> pl.Expr:
    """Build a Polars expression matching a criterion's ``ontology_id`` against text columns.

    Searches all columns mapped to *concept* for the exact ``ontology_id`` value
    and combines the per-column comparisons so that a match in any one of them
    is sufficient. Matching is case-insensitive to tolerate prefix casing
    differences (e.g. ``HP:0002045`` vs ``hp:0002045``).

    A criterion without an ``ontology_id`` key is compared against the empty
    string.

    Raises :exc:`UnresolvableCriterion` if no column is mapped to *concept*, or
    if the criterion's ``ontology_id`` is present but is not a string (for
    example ``None``).
    """
    matching_cols = [column for column in columns if column.concept == concept]
    if not matching_cols:
        raise UnresolvableCriterion(f"No column mapped to concept '{concept}'.")

    ontology_id = criterion.get("ontology_id", "")
    if not isinstance(ontology_id, str):
        # A null / non-text id cannot be lowercased or compared; report it
        # with the module's own exception instead of leaking AttributeError.
        raise UnresolvableCriterion(
            f"Criterion 'ontology_id' must be a string, "
            f"got {type(ontology_id).__name__}."
        )

    # Lowercase the needle once; only the column side varies per expression.
    target = ontology_id.lower()
    exprs = [
        pl.col(column.col_name).str.to_lowercase().eq(target)
        for column in matching_cols
    ]
    return pl.any_horizontal(exprs)
252+
253+
232254
def _build_attr_expr(
233255
criterion: dict,
234256
columns: list[ColumnSpec],
@@ -321,6 +343,9 @@ def _parse_criterion(
321343
if "attribute" in criterion and "operator" in criterion:
322344
return _build_attr_expr(criterion, columns, value_encodings, df_schema)
323345

346+
if "ontology_id" in criterion and ctype in _VALID_CONCEPTS:
347+
return _build_ontology_id_expr(criterion, columns, ctype)
348+
324349
if ctype in _VALID_CONCEPTS:
325350
return _build_text_expr(criterion, columns, ctype)
326351

tests/test_filter.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
_build_code_expr,
1717
_build_text_expr,
1818
_build_attr_expr,
19+
_build_ontology_id_expr,
1920
_parse_criterion,
2021
)
2122

@@ -393,6 +394,68 @@ def test_raises_when_no_column_mapped_to_concept(self, icd_columns):
393394
_build_text_expr(criterion, icd_columns, "epidemiological_history")
394395

395396

397+
class TestBuildOntologyIdExpr:
    """Tests for ``_build_ontology_id_expr`` called directly."""

    @pytest.fixture
    def ontology_df(self):
        """One ``phenotype`` column holding three lowercase ids and a null."""
        phenotypes = ["hpo:0002045", "hpo:0000975", "hpo:0001649", None]
        return pl.DataFrame({"phenotype": phenotypes})

    @pytest.fixture
    def symptom_columns(self):
        """Column specs mapping ``phenotype`` to the ``symptom`` concept."""
        spec = ColumnSpec("phenotype", concept="symptom")
        return [spec]

    def test_matches_exact_ontology_id(self, ontology_df, symptom_columns):
        criterion = {
            "type": "symptom",
            "name": "Hyperthermia",
            "ontology_id": "hpo:0002045",
        }
        expr = _build_ontology_id_expr(criterion, symptom_columns, "symptom")
        matched = ontology_df.filter(expr)
        assert matched.height == 1
        assert matched["phenotype"][0] == "hpo:0002045"

    def test_no_match_returns_empty(self, ontology_df, symptom_columns):
        criterion = {"type": "symptom", "name": "Unknown", "ontology_id": "hpo:9999999"}
        expr = _build_ontology_id_expr(criterion, symptom_columns, "symptom")
        matched = ontology_df.filter(expr)
        assert matched.is_empty()

    def test_matching_is_case_insensitive(self, symptom_columns):
        # The data carries an upper-case prefix, the criterion a lower-case one.
        mixed_case_df = pl.DataFrame({"phenotype": ["HPO:0002045", "hpo:0000975"]})
        criterion = {"type": "symptom", "ontology_id": "hpo:0002045"}
        expr = _build_ontology_id_expr(criterion, symptom_columns, "symptom")
        matched = mixed_case_df.filter(expr)
        assert matched.height == 1
        assert matched["phenotype"][0] == "HPO:0002045"

    def test_matches_across_multiple_columns(self):
        # Only the second row, second column holds the requested id.
        two_column_df = pl.DataFrame(
            {
                "phenotype_1": ["hpo:0002045", "hpo:0000001"],
                "phenotype_2": ["hpo:0000001", "hpo:0001649"],
            }
        )
        specs = [
            ColumnSpec("phenotype_1", concept="symptom"),
            ColumnSpec("phenotype_2", concept="symptom"),
        ]
        criterion = {"type": "symptom", "ontology_id": "hpo:0001649"}
        expr = _build_ontology_id_expr(criterion, specs, "symptom")
        matched = two_column_df.filter(expr)
        assert matched.height == 1
        assert matched["phenotype_2"][0] == "hpo:0001649"

    def test_raises_when_no_column_mapped_to_concept(self):
        # The only available column belongs to a different concept.
        diagnosis_only = [ColumnSpec("icd_code", concept="diagnosis")]
        criterion = {"type": "symptom", "ontology_id": "hpo:0002045"}
        with pytest.raises(UnresolvableCriterion, match="symptom"):
            _build_ontology_id_expr(criterion, diagnosis_only, "symptom")
457+
458+
396459
class TestBuildAttrExpr:
397460
def test_numeric_greater_than(self, fake_dataset, demographic_columns):
398461
assert (fake_dataset["age"] > 60).sum() == 190 # expected value
@@ -624,6 +687,31 @@ def test_epidemiological_history_with_attribute_uses_attr_expr(self, fake_datase
624687
result = fake_dataset.filter(_parse_criterion(criterion, columns))
625688
assert (result["age"].le(14)).all()
626689

690+
def test_ontology_id_routes_to_ontology_expr(self):
    """A criterion carrying ``ontology_id`` is filtered by that id."""
    phenotypes = ["hpo:0002045", "hpo:0000975", "hpo:0001649"]
    frame = pl.DataFrame({"phenotype": phenotypes})
    specs = [ColumnSpec("phenotype", concept="symptom")]
    criterion = {
        "type": "symptom",
        "name": "Tachycardia",
        "ontology_id": "hpo:0001649",
    }
    expr = _parse_criterion(criterion, specs)
    matched = frame.filter(expr)
    assert matched.height == 1
    assert matched["phenotype"][0] == "hpo:0001649"
701+
702+
def test_ontology_id_takes_precedence_over_text_matching(self):
    """When both ``name`` and ``ontology_id`` are given, the id decides."""
    # The criterion's name also appears verbatim in the column, so a
    # name-based text match would presumably select the "Frankfurt" row;
    # the assertions require the row holding the ontology id instead.
    frame = pl.DataFrame({"phenotype": ["hpo:0001649", "Frankfurt"]})
    specs = [ColumnSpec("phenotype", concept="symptom")]
    criterion = {
        "type": "symptom",
        "name": "Frankfurt",
        "ontology_id": "hpo:0001649",
    }
    expr = _parse_criterion(criterion, specs)
    matched = frame.filter(expr)
    assert matched.height == 1
    assert matched["phenotype"][0] == "hpo:0001649"
714+
627715
def test_syndrome_type_raises(self, all_columns):
628716
with pytest.raises(UnresolvableCriterion, match="syndrome"):
629717
_parse_criterion({"type": "syndrome", "name": "Dengue"}, all_columns)

0 commit comments

Comments
 (0)