Skip to content

Commit f1b0d09

Browse files
authored
Gene symbols patch (#76)
* clean unused arg * Update validation.py * Create test_validation.py * comma * qodo fix 1 * Update validation.py * revert arg cleanup
1 parent 6afd980 commit f1b0d09

2 files changed

Lines changed: 112 additions & 2 deletions

File tree

cytetype/preprocessing/validation.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def _is_gene_id_like(value: str) -> bool:
2828
if re.match(r"^[NX][MR]_\d+$", value):
2929
return True
3030

31-
if re.match(r"^\d+$", value):
31+
if re.match(r"^\d+(?:\.0)?$", value):
3232
return True
3333

3434
if re.match(r"^[A-Z0-9]+[._][A-Z0-9._]+$", value) and len(value) > 10:
@@ -102,7 +102,20 @@ def materialize_canonical_gene_symbols_column(
102102
source_name = f"column '{gene_symbols_column}'"
103103

104104
canonical_column = _temporary_gene_symbols_column_name(adata)
105-
adata.var[canonical_column] = clean_gene_names(source_values)
105+
cleaned = clean_gene_names(source_values)
106+
107+
id_pct = _id_like_percentage(cleaned)
108+
if id_pct > 49:
109+
raise ValueError(
110+
f"\n\nGene Symbol Detection Error\n"
111+
f"{'─' * 50}\n"
112+
f"CyteType requires human-readable gene symbols (e.g., TSPAN6, DPM1, SCYL3)\n"
113+
f"To fix this, either:\n"
114+
f" 1. Set gene_symbols_column to a column in adata.var that contains gene symbols\n"
115+
f" 2. Convert your gene identifiers to symbols before running CyteType\n"
116+
)
117+
118+
adata.var[canonical_column] = cleaned
106119
logger.info(
107120
f"Materialized canonical gene symbols in temporary column '{canonical_column}' "
108121
f"from {source_name}."

tests/test_validation.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import pytest
2+
3+
from cytetype.preprocessing.validation import _is_gene_id_like, _id_like_percentage
4+
5+
6+
class TestIsGeneIdLike:
    """Pin which identifier strings ``_is_gene_id_like`` flags as gene IDs."""

    # Parameter sets are named once here so each test reads as a one-liner.
    ENSEMBL_IDS = (
        "ENSG00000000003",
        "ENSG00000000003.14",
        "ENSMUSG00000000001",
        "ensg00000000003",
    )
    REFSEQ_IDS = (
        "NM_001301",
        "NR_046018",
        "XM_011541",
        "XR_001737",
    )
    INTEGER_ENTREZ_IDS = (
        "7157",
        "672",
        "11286",
        "0",
    )
    FLOAT_ENTREZ_IDS = (
        "7157.0",
        "672.0",
        "11286.0",
        "0.0",
    )
    LONG_DOTTED_IDS = (
        "AFFY_HG_U133A.207163_S_AT",
        "ILLUMINA_HUMANHT_12_V4.ILMN_1762337",
    )
    GENE_SYMBOLS = (
        "TSPAN6",
        "DPM1",
        "SCYL3",
        "TP53",
        "BRCA1",
        "CD8A",
        "MS4A1",
    )
    EDGE_CASES = (
        "",
        " ",
        "7157.5",
    )

    @pytest.mark.parametrize("value", ENSEMBL_IDS)
    def test_ensembl_ids(self, value: str) -> None:
        """Ensembl-style accessions (versioned or lower-case too) are flagged."""
        flagged = _is_gene_id_like(value)
        assert flagged is True

    @pytest.mark.parametrize("value", REFSEQ_IDS)
    def test_refseq_ids(self, value: str) -> None:
        """NM_/NR_/XM_/XR_-prefixed accessions are flagged."""
        flagged = _is_gene_id_like(value)
        assert flagged is True

    @pytest.mark.parametrize("value", INTEGER_ENTREZ_IDS)
    def test_integer_entrez_ids(self, value: str) -> None:
        """Purely numeric strings are flagged."""
        flagged = _is_gene_id_like(value)
        assert flagged is True

    @pytest.mark.parametrize("value", FLOAT_ENTREZ_IDS)
    def test_float_stringified_entrez_ids(self, value: str) -> None:
        """Numeric strings carrying a trailing ``.0`` are flagged."""
        flagged = _is_gene_id_like(value)
        assert flagged is True

    @pytest.mark.parametrize("value", LONG_DOTTED_IDS)
    def test_long_dotted_ids(self, value: str) -> None:
        """Long dotted/underscored identifiers are flagged."""
        flagged = _is_gene_id_like(value)
        assert flagged is True

    @pytest.mark.parametrize("value", GENE_SYMBOLS)
    def test_gene_symbols_not_flagged(self, value: str) -> None:
        """Ordinary gene symbols must not be mistaken for IDs."""
        flagged = _is_gene_id_like(value)
        assert flagged is False

    @pytest.mark.parametrize("value", EDGE_CASES)
    def test_edge_cases(self, value: str) -> None:
        """Empty, whitespace-only and non-integral decimal strings are not flagged."""
        flagged = _is_gene_id_like(value)
        assert flagged is False
70+
71+
72+
class TestIdLikePercentage:
    """Pin the percentage of ID-like values reported by ``_id_like_percentage``.

    Expected percentages are compared through ``pytest.approx`` rather than
    ``==``: the result is a computed float, and an exact comparison would
    couple these tests to the order of the arithmetic inside the
    implementation.
    """

    def test_all_gene_symbols(self) -> None:
        """A list made only of gene symbols reports 0 %."""
        values = ["TSPAN6", "DPM1", "SCYL3", "TP53", "BRCA1"]
        assert _id_like_percentage(values) == pytest.approx(0.0)

    def test_all_ensembl_ids(self) -> None:
        """A list made only of Ensembl-style IDs reports 100 %."""
        values = [f"ENSG{i:011d}" for i in range(20)]
        assert _id_like_percentage(values) == pytest.approx(100.0)

    def test_all_integer_entrez_ids(self) -> None:
        """A list made only of integer strings reports 100 %."""
        values = ["7157", "672", "3845", "11286", "9952"]
        assert _id_like_percentage(values) == pytest.approx(100.0)

    def test_all_float_entrez_ids(self) -> None:
        """A list made only of ``<int>.0`` strings reports 100 %."""
        values = ["7157.0", "672.0", "3845.0", "11286.0", "9952.0"]
        assert _id_like_percentage(values) == pytest.approx(100.0)

    def test_mixed_float_entrez_and_symbols(self) -> None:
        """Seven ID-like values out of ten report 70 %."""
        symbols = ["TSPAN6", "DPM1", "SCYL3"]
        entrez = ["7157.0", "672.0", "3845.0", "11286.0", "9952.0", "904.0", "405.0"]
        pct = _id_like_percentage(symbols + entrez)
        assert pct == pytest.approx(70.0)

    def test_empty_list(self) -> None:
        """An empty input is expected to report 100 %."""
        assert _id_like_percentage([]) == pytest.approx(100.0)

0 commit comments

Comments
 (0)