Skip to content

Commit 7490e8c

Browse files
Fix add_peak_annotation crash on empty distance values (#182)
* Fix add_peak_annotation crash on empty distance values When the distance column in peak annotation TSV contains empty values (intergenic peaks), pandas may infer a numeric dtype. This caused LossySetitemError when trying to assign "" into the numeric column. Fix by converting distance to string via object dtype early, and using pd.to_numeric with errors="coerce" for the final int conversion, which gracefully handles empty/NaN values as 0. Add regression tests for empty distance values and semicolon-separated multi-gene annotations. Closes #181 * rework add_peak_annotation to make use of Pandas' pd.NA Pandas 1.0 has been released sufficiently long ago that we can assume muon users have it. --------- Co-authored-by: Ilia Kats <ilia-kats@gmx.net>
1 parent 889d1ba commit 7490e8c

3 files changed

Lines changed: 82 additions & 33 deletions

File tree

muon/_atac/tools.py

Lines changed: 30 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pathlib import Path
77
from datetime import datetime
88
from warnings import warn
9+
from contextlib import suppress
910

1011
import numpy as np
1112
import pandas as pd
@@ -81,7 +82,7 @@ def lsi(data: Union[AnnData, MuData], scale_embeddings=True, n_comps=50):
8182

8283
def add_peak_annotation(
8384
data: Union[AnnData, MuData],
84-
annotation: Union[str, pd.DataFrame],
85+
annotation: Union[str, Path, pd.DataFrame],
8586
sep: str = "\t",
8687
return_annotation: bool = False,
8788
):
@@ -108,15 +109,12 @@ def add_peak_annotation(
108109
else:
109110
raise TypeError("Expected AnnData or MuData object with 'atac' modality")
110111

111-
if isinstance(annotation, str):
112-
pa = pd.read_csv(annotation, sep=sep)
113-
else:
112+
if isinstance(annotation, pd.DataFrame):
114113
pa = annotation
114+
else:
115+
pa = pd.read_csv(annotation, sep=sep)
115116

116-
# Convert null values to empty strings
117-
pa.loc[pa.gene.isnull(), "gene"] = ""
118-
pa.loc[pa.distance.isnull(), "distance"] = ""
119-
pa.loc[pa.peak_type.isnull(), "peak_type"] = ""
117+
pa = pa.convert_dtypes()
120118

121119
# If peak name is not in the annotation table, reconstruct it:
122120
# peak = chrom:start-end
@@ -133,38 +131,38 @@ def add_peak_annotation(
133131
raise AttributeError(
134132
f"Peak annotation does not in contain neighter peak column nor chrom, start, and end columns."
135133
)
134+
else:
135+
# chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN
136+
pa["peak"] = pa["peak"].str.replace("_", ":", 1).str.replace("_", "-", 1)
136137

137138
# Split genes, distances, and peaks into individual records
138-
pa_g = pd.DataFrame(pa.gene.str.split(";").tolist(), index=pa.peak).stack()
139-
pa_d = pd.DataFrame(pa.distance.astype(str).str.split(";").tolist(), index=pa.peak).stack()
140-
pa_p = pd.DataFrame(pa.peak_type.str.split(";").tolist(), index=pa.peak).stack()
141139

142-
# Make a long dataframe indexed by gene
143-
pa_long = pd.concat(
144-
[pa_g.reset_index()[["peak", 0]], pa_d.reset_index()[[0]], pa_p.reset_index()[[0]]], axis=1
145-
)
146-
pa_long.columns = ["peak", "gene", "distance", "peak_type"]
147-
pa_long = pa_long.set_index("gene")
148-
149-
# chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN
150-
pa_long.peak = [peak.replace("_", ":", 1).replace("_", "-", 1) for peak in pa_long.peak]
151-
152-
# Make distance values integers with 0 for intergenic peaks
153-
# DEPRECATED: Make distance values nullable integers
154-
# See https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html
155-
null_distance = pa_long.distance == ""
156-
pa_long.loc[null_distance, "distance"] = 0
157-
pa_long.distance = pa_long.distance.astype(float).astype(int)
158-
# DEPRECATED: Int64 is not recognized when saving HDF5 files with scanpy.write
159-
# pa_long.distance = pa_long.distance.astype(int).astype("Int64")
160-
# pa_long.distance[null_distance] = np.nan
140+
if pd.api.types.is_string_dtype(pa.distance):
141+
pa = pa.set_index("peak")
142+
pa_g = pa.gene.str.split(";").explode()
143+
pa_d = pa.distance.str.split(";").explode().astype(int)
144+
pa_p = pa.peak_type.str.split(";").explode()
145+
146+
# Make a long dataframe indexed by gene
147+
pa = pd.concat((pa_g, pa_d, pa_p), axis=1).reset_index()
148+
else:
149+
pa = pa[["peak", "gene", "distance", "peak_type"]]
150+
151+
with suppress(ValueError): # missing values
152+
pa["distance"] = pa["distance"].astype(int)
153+
154+
# TODO: nullable strings work with anndata >= 0.13
155+
for col in ("peak", "gene", "peak_type"):
156+
pa[col] = pa[col].fillna("").astype(object)
157+
158+
pa = pa.set_index("gene")
161159

162160
if "atac" not in adata.uns:
163161
adata.uns["atac"] = dict()
164-
adata.uns["atac"]["peak_annotation"] = pa_long
162+
adata.uns["atac"]["peak_annotation"] = pa
165163

166164
if return_annotation:
167-
return pa_long
165+
return pa
168166

169167

170168
def add_peak_annotation_gene_names(

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ classifiers = [
1919
requires-python = ">= 3.10"
2020
requires = [
2121
"numpy",
22-
"pandas",
22+
"pandas>=1",
2323
"matplotlib",
2424
"seaborn",
2525
"h5py",

tests/test_atac_tools.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import unittest
2+
from io import StringIO
3+
4+
import numpy as np
5+
import pandas as pd
6+
from anndata import AnnData
7+
import muon
8+
9+
10+
class TestAddPeakAnnotation(unittest.TestCase):
11+
"""Regression tests for add_peak_annotation with empty distance values (#181)."""
12+
13+
def test_empty_distance_values(self):
14+
"""Intergenic peaks with empty distance should not raise."""
15+
tsv = StringIO(
16+
"chrom\tstart\tend\tgene\tdistance\tpeak_type\n"
17+
"chr1\t100\t200\t\t\tintergenic\n"
18+
"chr1\t300\t400\tGeneA\t-173268\tdistal\n"
19+
)
20+
pa = pd.read_csv(tsv, sep="\t")
21+
peaks = ["chr1:100-200", "chr1:300-400"]
22+
adata = AnnData(np.zeros((2, 2)))
23+
adata.var_names = peaks
24+
25+
result = muon.atac.tl.add_peak_annotation(adata, pa, return_annotation=True)
26+
27+
assert result.distance.dtype == pd.Int64Dtype()
28+
assert result.distance.iloc[0] is pd.NA
29+
assert result.distance.iloc[1] == -173268
30+
assert (result.peak == peaks).all()
31+
32+
def test_semicolon_separated_distances(self):
33+
"""Multi-gene peaks with semicolon-separated distances should work."""
34+
tsv = StringIO(
35+
"chrom\tstart\tend\tgene\tdistance\tpeak_type\n"
36+
"chr1\t100\t200\tGeneA;GeneB\t-100;200\tpromoter;distal\n"
37+
)
38+
pa = pd.read_csv(tsv, sep="\t")
39+
adata = AnnData(np.zeros((1, 1)))
40+
adata.var_names = ["chr1:100-200"]
41+
42+
result = muon.atac.tl.add_peak_annotation(adata, pa, return_annotation=True)
43+
44+
assert result.distance.dtype == np.int64
45+
assert result.distance.iloc[0] == -100
46+
assert result.distance.iloc[1] == 200
47+
assert (result.peak.iloc[0] == result.peak.iloc[1] == adata.var_names).all()
48+
49+
50+
if __name__ == "__main__":
51+
unittest.main()

0 commit comments

Comments
 (0)