Support ontology id filtering

anapaulagomes · anapaulagomes · commit 7108f5ba5f33 · 2026-03-17T14:23:19.000+01:00
diff --git a/opensyndrome/filter.py b/opensyndrome/filter.py
@@ -229,6 +229,28 @@ def _build_text_expr(
     return pl.any_horizontal(exprs)
 
 
+def _build_ontology_id_expr(
+    criterion: dict, columns: list[ColumnSpec], concept: str
+) -> pl.Expr:
+    """Build a Polars expression matching a criterion's ``ontology_id`` against text columns.
+
+    A row matches when any column mapped to *concept* equals the ``ontology_id``,
+    ignoring case (e.g. ``HP:0002045`` matches ``hp:0002045``).
+
+    Raises :exc:`UnresolvableCriterion` if ``ontology_id`` is not a non-empty
+    string, or if no column is mapped to *concept*.
+    """
+    ontology_id = criterion.get("ontology_id")
+    if not isinstance(ontology_id, str) or not ontology_id:
+        raise UnresolvableCriterion(f"Invalid ontology_id: {ontology_id!r}.")
+    matching_cols = [column for column in columns if column.concept == concept]
+    if not matching_cols:
+        raise UnresolvableCriterion(f"No column mapped to concept '{concept}'.")
+    target = ontology_id.lower()  # lowered once; identical for every column
+    exprs = [pl.col(c.col_name).str.to_lowercase().eq(target) for c in matching_cols]
+    return pl.any_horizontal(exprs)
+
+
 def _build_attr_expr(
     criterion: dict,
     columns: list[ColumnSpec],
@@ -321,6 +343,9 @@ def _parse_criterion(
     if "attribute" in criterion and "operator" in criterion:
         return _build_attr_expr(criterion, columns, value_encodings, df_schema)
 
+    if "ontology_id" in criterion and ctype in _VALID_CONCEPTS:
+        return _build_ontology_id_expr(criterion, columns, ctype)
+
     if ctype in _VALID_CONCEPTS:
         return _build_text_expr(criterion, columns, ctype)
 
diff --git a/tests/test_filter.py b/tests/test_filter.py
@@ -16,6 +16,7 @@
     _build_code_expr,
     _build_text_expr,
     _build_attr_expr,
+    _build_ontology_id_expr,
     _parse_criterion,
 )
 
@@ -393,6 +394,68 @@ def test_raises_when_no_column_mapped_to_concept(self, icd_columns):
             _build_text_expr(criterion, icd_columns, "epidemiological_history")
 
 
+class TestBuildOntologyIdExpr:
+    """Tests for ``_build_ontology_id_expr`` run against small in-memory frames."""
+
+    @pytest.fixture
+    def ontology_df(self):
+        """Single ``phenotype`` column holding three ontology ids and one null."""
+        ids = ["hpo:0002045", "hpo:0000975", "hpo:0001649", None]
+        return pl.DataFrame({"phenotype": ids})
+
+    @pytest.fixture
+    def symptom_columns(self):
+        """Column specs mapping ``phenotype`` to the ``symptom`` concept."""
+        return [ColumnSpec("phenotype", concept="symptom")]
+
+    def test_matches_exact_ontology_id(self, ontology_df, symptom_columns):
+        """Only the row holding the requested id survives the filter."""
+        wanted = "hpo:0002045"
+        criterion = {"type": "symptom", "name": "Hyperthermia", "ontology_id": wanted}
+        expr = _build_ontology_id_expr(criterion, symptom_columns, "symptom")
+        matched = ontology_df.filter(expr)
+        assert matched.height == 1
+        assert matched["phenotype"][0] == wanted
+
+    def test_no_match_returns_empty(self, ontology_df, symptom_columns):
+        """An id absent from the column filters out every row."""
+        criterion = {"type": "symptom", "name": "Unknown", "ontology_id": "hpo:9999999"}
+        expr = _build_ontology_id_expr(criterion, symptom_columns, "symptom")
+        assert ontology_df.filter(expr).is_empty()
+
+    def test_matching_is_case_insensitive(self, symptom_columns):
+        """An upper-case stored prefix still matches a lower-case criterion id."""
+        frame = pl.DataFrame({"phenotype": ["HPO:0002045", "hpo:0000975"]})
+        criterion = {"type": "symptom", "ontology_id": "hpo:0002045"}
+        expr = _build_ontology_id_expr(criterion, symptom_columns, "symptom")
+        matched = frame.filter(expr)
+        assert matched.height == 1
+        assert matched["phenotype"][0] == "HPO:0002045"
+
+    def test_matches_across_multiple_columns(self):
+        """A hit in any one of the concept's columns is enough to keep the row."""
+        frame = pl.DataFrame(
+            {
+                "phenotype_1": ["hpo:0002045", "hpo:0000001"],
+                "phenotype_2": ["hpo:0000001", "hpo:0001649"],
+            }
+        )
+        names = ("phenotype_1", "phenotype_2")
+        specs = [ColumnSpec(name, concept="symptom") for name in names]
+        criterion = {"type": "symptom", "ontology_id": "hpo:0001649"}
+        expr = _build_ontology_id_expr(criterion, specs, "symptom")
+        matched = frame.filter(expr)
+        assert matched.height == 1
+        assert matched["phenotype_2"][0] == "hpo:0001649"
+
+    def test_raises_when_no_column_mapped_to_concept(self):
+        """The only column belongs to ``diagnosis``, so ``symptom`` cannot resolve."""
+        specs = [ColumnSpec("icd_code", concept="diagnosis")]
+        criterion = {"type": "symptom", "ontology_id": "hpo:0002045"}
+        with pytest.raises(UnresolvableCriterion, match="symptom"):
+            _build_ontology_id_expr(criterion, specs, "symptom")
+
+
 class TestBuildAttrExpr:
     def test_numeric_greater_than(self, fake_dataset, demographic_columns):
         assert (fake_dataset["age"] > 60).sum() == 190  # expected value
@@ -624,6 +687,31 @@ def test_epidemiological_history_with_attribute_uses_attr_expr(self, fake_datase
         result = fake_dataset.filter(_parse_criterion(criterion, columns))
         assert (result["age"].le(14)).all()
 
+    def test_ontology_id_routes_to_ontology_expr(self):
+        """A symptom criterion with an ``ontology_id`` selects the matching row."""
+        wanted = "hpo:0001649"
+        frame = pl.DataFrame({"phenotype": ["hpo:0002045", "hpo:0000975", wanted]})
+        specs = [ColumnSpec("phenotype", concept="symptom")]
+        criterion = {"type": "symptom", "name": "Tachycardia", "ontology_id": wanted}
+        expr = _parse_criterion(criterion, specs)
+        matched = frame.filter(expr)
+        # Exactly one row survives, and it is the one holding the requested id.
+        assert matched.height == 1
+        assert matched["phenotype"][0] == wanted
+
+    def test_ontology_id_takes_precedence_over_text_matching(self):
+        """With both ``name`` and ``ontology_id`` set, only the id is matched."""
+        # Row 0 holds the ontology id; row 1 holds text equal to the ``name``.
+        # Only the id row may survive, so the name must not drive the match.
+        wanted = "hpo:0001649"
+        frame = pl.DataFrame({"phenotype": [wanted, "Frankfurt"]})
+        specs = [ColumnSpec("phenotype", concept="symptom")]
+        criterion = {"type": "symptom", "name": "Frankfurt", "ontology_id": wanted}
+        expr = _parse_criterion(criterion, specs)
+        matched = frame.filter(expr)
+        assert matched.height == 1
+        assert matched["phenotype"][0] == wanted
+
     def test_syndrome_type_raises(self, all_columns):
         with pytest.raises(UnresolvableCriterion, match="syndrome"):
             _parse_criterion({"type": "syndrome", "name": "Dengue"}, all_columns)