|
| 1 | +import pytest |
| 2 | + |
| 3 | +from cytetype.preprocessing.validation import _is_gene_id_like, _id_like_percentage |
| 4 | + |
| 5 | + |
class TestIsGeneIdLike:
    """Per-value checks of ``_is_gene_id_like`` across identifier families."""

    # Inputs are grouped by the identifier family each test exercises.
    _ENSEMBL = (
        "ENSG00000000003",
        "ENSG00000000003.14",
        "ENSMUSG00000000001",
        "ensg00000000003",
    )
    _REFSEQ = (
        "NM_001301",
        "NR_046018",
        "XM_011541",
        "XR_001737",
    )
    _ENTREZ_INT = (
        "7157",
        "672",
        "11286",
        "0",
    )
    _ENTREZ_FLOAT = (
        "7157.0",
        "672.0",
        "11286.0",
        "0.0",
    )
    _LONG_DOTTED = (
        "AFFY_HG_U133A.207163_S_AT",
        "ILLUMINA_HUMANHT_12_V4.ILMN_1762337",
    )
    _SYMBOLS = (
        "TSPAN6",
        "DPM1",
        "SCYL3",
        "TP53",
        "BRCA1",
        "CD8A",
        "MS4A1",
    )
    _EDGE_CASES = (
        "",
        " ",
        "7157.5",
    )

    @pytest.mark.parametrize("value", _ENSEMBL)
    def test_ensembl_ids(self, value: str) -> None:
        """Ensembl-style IDs (versioned, mouse, lower-case) are flagged."""
        verdict = _is_gene_id_like(value)
        assert verdict is True

    @pytest.mark.parametrize("value", _REFSEQ)
    def test_refseq_ids(self, value: str) -> None:
        """RefSeq-style accessions (NM_/NR_/XM_/XR_) are flagged."""
        verdict = _is_gene_id_like(value)
        assert verdict is True

    @pytest.mark.parametrize("value", _ENTREZ_INT)
    def test_integer_entrez_ids(self, value: str) -> None:
        """Purely numeric strings are flagged."""
        verdict = _is_gene_id_like(value)
        assert verdict is True

    @pytest.mark.parametrize("value", _ENTREZ_FLOAT)
    def test_float_stringified_entrez_ids(self, value: str) -> None:
        """Integers rendered with a trailing ``.0`` are flagged."""
        verdict = _is_gene_id_like(value)
        assert verdict is True

    @pytest.mark.parametrize("value", _LONG_DOTTED)
    def test_long_dotted_ids(self, value: str) -> None:
        """Long dotted probe-style identifiers are flagged."""
        verdict = _is_gene_id_like(value)
        assert verdict is True

    @pytest.mark.parametrize("value", _SYMBOLS)
    def test_gene_symbols_not_flagged(self, value: str) -> None:
        """Ordinary gene symbols must not be flagged."""
        verdict = _is_gene_id_like(value)
        assert verdict is False

    @pytest.mark.parametrize("value", _EDGE_CASES)
    def test_edge_cases(self, value: str) -> None:
        """Empty, blank and non-integral decimal strings are not flagged."""
        verdict = _is_gene_id_like(value)
        assert verdict is False
| 70 | + |
| 71 | + |
class TestIdLikePercentage:
    """List-level checks of ``_id_like_percentage``.

    The helper returns a float percentage, so expectations are compared with
    ``pytest.approx`` rather than exact ``==``; this keeps the tests from
    depending on the order of the floating-point operations it uses.
    """

    def test_all_gene_symbols(self) -> None:
        """A list made only of gene symbols scores 0 percent."""
        values = ["TSPAN6", "DPM1", "SCYL3", "TP53", "BRCA1"]
        assert _id_like_percentage(values) == pytest.approx(0.0)

    def test_all_ensembl_ids(self) -> None:
        """Twenty generated ``ENSG`` + 11-digit IDs score 100 percent."""
        values = [f"ENSG{i:011d}" for i in range(20)]
        assert _id_like_percentage(values) == pytest.approx(100.0)

    def test_all_integer_entrez_ids(self) -> None:
        """A list of purely numeric strings scores 100 percent."""
        values = ["7157", "672", "3845", "11286", "9952"]
        assert _id_like_percentage(values) == pytest.approx(100.0)

    def test_all_float_entrez_ids(self) -> None:
        """A list of ``.0``-suffixed numeric strings scores 100 percent."""
        values = ["7157.0", "672.0", "3845.0", "11286.0", "9952.0"]
        assert _id_like_percentage(values) == pytest.approx(100.0)

    def test_mixed_float_entrez_and_symbols(self) -> None:
        """Seven ID-like values out of ten score 70 percent."""
        symbols = ["TSPAN6", "DPM1", "SCYL3"]
        entrez = ["7157.0", "672.0", "3845.0", "11286.0", "9952.0", "904.0", "405.0"]
        pct = _id_like_percentage(symbols + entrez)
        assert pct == pytest.approx(70.0)

    def test_empty_list(self) -> None:
        """An empty input is expected to report 100 percent."""
        assert _id_like_percentage([]) == pytest.approx(100.0)
0 commit comments