Skip to content

Commit eca6756

Browse files
committed
rework add_peak_annotation to make use of Pandas' pd.NA
Pandas 1.0 has been released sufficiently long ago that we can assume muon users have it.
1 parent 0cf0a36 commit eca6756

File tree

3 files changed

+39
-33
lines changed

3 files changed

+39
-33
lines changed

muon/_atac/tools.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pathlib import Path
77
from datetime import datetime
88
from warnings import warn
9+
from contextlib import suppress
910

1011
import numpy as np
1112
import pandas as pd
@@ -113,12 +114,7 @@ def add_peak_annotation(
113114
else:
114115
pa = annotation
115116

116-
# Convert null values to empty strings
117-
pa.loc[pa.gene.isnull(), "gene"] = ""
118-
# Convert distance to string via object dtype — pandas may infer a numeric
119-
# or nullable StringDtype, both of which break on direct "" assignment
120-
pa["distance"] = pa["distance"].astype(object).fillna("").astype(str)
121-
pa.loc[pa.peak_type.isnull(), "peak_type"] = ""
117+
pa = pa.convert_dtypes()
122118

123119
# If peak name is not in the annotation table, reconstruct it:
124120
# peak = chrom:start-end
@@ -135,31 +131,41 @@ def add_peak_annotation(
135131
raise AttributeError(
136132
f"Peak annotation does not in contain neighter peak column nor chrom, start, and end columns."
137133
)
134+
else:
135+
# chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN
136+
import ipdb
137+
138+
ipdb.set_trace()
139+
pa["peak"] = pa["peak"].str.replace("_", ":", 1).str.replace("_", "-", 1)
138140

139141
# Split genes, distances, and peaks into individual records
140-
pa_g = pd.DataFrame(pa.gene.str.split(";").tolist(), index=pa.peak).stack()
141-
pa_d = pd.DataFrame(pa.distance.astype(str).str.split(";").tolist(), index=pa.peak).stack()
142-
pa_p = pd.DataFrame(pa.peak_type.str.split(";").tolist(), index=pa.peak).stack()
143142

144-
# Make a long dataframe indexed by gene
145-
pa_long = pd.concat(
146-
[pa_g.reset_index()[["peak", 0]], pa_d.reset_index()[[0]], pa_p.reset_index()[[0]]], axis=1
147-
)
148-
pa_long.columns = ["peak", "gene", "distance", "peak_type"]
149-
pa_long = pa_long.set_index("gene")
143+
if pd.api.types.is_string_dtype(pa.distance):
144+
pa = pa.set_index("peak")
145+
pa_g = pa.gene.str.split(";").explode()
146+
pa_d = pa.distance.str.split(";").explode().astype(int)
147+
pa_p = pa.peak_type.str.split(";").explode()
148+
149+
# Make a long dataframe indexed by gene
150+
pa = pd.concat((pa_g, pa_d, pa_p), axis=1).reset_index()
151+
else:
152+
pa = pa[["peak", "gene", "distance", "peak_type"]]
153+
154+
with suppress(ValueError): # missing values
155+
pa["distance"] = pa["distance"].astype(int)
150156

151-
# chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN
152-
pa_long.peak = [peak.replace("_", ":", 1).replace("_", "-", 1) for peak in pa_long.peak]
157+
# TODO: nullable strings work with anndata >= 0.13
158+
for col in ("peak", "gene", "peak_type"):
159+
pa[col] = pa[col].fillna("").astype(object)
153160

154-
# Make distance values integers with 0 for intergenic peaks (empty/NaN → 0)
155-
pa_long["distance"] = pd.to_numeric(pa_long["distance"], errors="coerce").fillna(0).astype(int)
161+
pa = pa.set_index("gene")
156162

157163
if "atac" not in adata.uns:
158164
adata.uns["atac"] = dict()
159-
adata.uns["atac"]["peak_annotation"] = pa_long
165+
adata.uns["atac"]["peak_annotation"] = pa
160166

161167
if return_annotation:
162-
return pa_long
168+
return pa
163169

164170

165171
def add_peak_annotation_gene_names(

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ classifiers = [
1919
requires-python = ">= 3.10"
2020
requires = [
2121
"numpy",
22-
"pandas",
22+
"pandas>=1",
2323
"matplotlib",
2424
"seaborn",
2525
"h5py",

tests/test_atac_tools.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55
import pandas as pd
66
from anndata import AnnData
7-
from muon._atac.tools import add_peak_annotation
7+
import muon
88

99

1010
class TestAddPeakAnnotation(unittest.TestCase):
@@ -22,13 +22,12 @@ def test_empty_distance_values(self):
2222
adata = AnnData(np.zeros((2, 2)))
2323
adata.var_names = peaks
2424

25-
result = add_peak_annotation(adata, pa, return_annotation=True)
25+
result = muon.atac.tl.add_peak_annotation(adata, pa, return_annotation=True)
2626

27-
self.assertEqual(result.distance.dtype, np.int64)
28-
# Intergenic peak distance should be 0
29-
self.assertIn(0, result.distance.values)
30-
# Distal peak distance should be preserved
31-
self.assertIn(-173268, result.distance.values)
27+
assert result.distance.dtype == pd.Int64Dtype()
28+
assert result.distance.iloc[0] is pd.NA
29+
assert result.distance.iloc[1] == -173268
30+
assert (result.peak == peaks).all()
3231

3332
def test_semicolon_separated_distances(self):
3433
"""Multi-gene peaks with semicolon-separated distances should work."""
@@ -40,11 +39,12 @@ def test_semicolon_separated_distances(self):
4039
adata = AnnData(np.zeros((1, 1)))
4140
adata.var_names = ["chr1:100-200"]
4241

43-
result = add_peak_annotation(adata, pa, return_annotation=True)
42+
result = muon.atac.tl.add_peak_annotation(adata, pa, return_annotation=True)
4443

45-
self.assertEqual(result.distance.dtype, np.int64)
46-
self.assertIn(-100, result.distance.values)
47-
self.assertIn(200, result.distance.values)
44+
assert result.distance.dtype == np.int64
45+
assert result.distance.iloc[0] == -100
46+
assert result.distance.iloc[1] == 200
47+
assert (result.peak.iloc[0] == result.peak.iloc[1] == adata.var_names).all()
4848

4949

5050
if __name__ == "__main__":

0 commit comments

Comments
 (0)