rework add_peak_annotation to make use of Pandas' pd.NA

ilia-kats · ilia-kats · commit eca6756f333c · 2026-03-12T15:32:50.000+01:00
Pandas 1.0 was released long enough ago that we can assume
muon users have it.
diff --git a/muon/_atac/tools.py b/muon/_atac/tools.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 from datetime import datetime
 from warnings import warn
+from contextlib import suppress
 
 import numpy as np
 import pandas as pd
@@ -113,12 +114,7 @@ def add_peak_annotation(
     else:
         pa = annotation
 
-    # Convert null values to empty strings
-    pa.loc[pa.gene.isnull(), "gene"] = ""
-    # Convert distance to string via object dtype — pandas may infer a numeric
-    # or nullable StringDtype, both of which break on direct "" assignment
-    pa["distance"] = pa["distance"].astype(object).fillna("").astype(str)
-    pa.loc[pa.peak_type.isnull(), "peak_type"] = ""
+    pa = pa.convert_dtypes()
 
     # If peak name is not in the annotation table, reconstruct it:
     # peak = chrom:start-end
@@ -135,31 +131,38 @@ def add_peak_annotation(
             raise AttributeError(
                 f"Peak annotation does not in contain neighter peak column nor chrom, start, and end columns."
             )
+    else:
+        # chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN
+        pa["peak"] = pa["peak"].str.replace("_", ":", 1).str.replace("_", "-", 1)
 
     # Split genes, distances, and peaks into individual records
-    pa_g = pd.DataFrame(pa.gene.str.split(";").tolist(), index=pa.peak).stack()
-    pa_d = pd.DataFrame(pa.distance.astype(str).str.split(";").tolist(), index=pa.peak).stack()
-    pa_p = pd.DataFrame(pa.peak_type.str.split(";").tolist(), index=pa.peak).stack()
 
-    # Make a long dataframe indexed by gene
-    pa_long = pd.concat(
-        [pa_g.reset_index()[["peak", 0]], pa_d.reset_index()[[0]], pa_p.reset_index()[[0]]], axis=1
-    )
-    pa_long.columns = ["peak", "gene", "distance", "peak_type"]
-    pa_long = pa_long.set_index("gene")
+    if pd.api.types.is_string_dtype(pa.distance):
+        pa = pa.set_index("peak")
+        pa_g = pa.gene.str.split(";").explode()
+        pa_d = pa.distance.str.split(";").explode().astype(int)
+        pa_p = pa.peak_type.str.split(";").explode()
+
+        # Make a long dataframe with one record per row (indexed by gene further below)
+        pa = pd.concat((pa_g, pa_d, pa_p), axis=1).reset_index()
+    else:
+        pa = pa[["peak", "gene", "distance", "peak_type"]].copy()  # copy: columns are assigned below
+
+    with suppress(ValueError):  # pd.NA cannot be cast to a plain int; keep the nullable dtype then
+        pa["distance"] = pa["distance"].astype(int)
 
-    # chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN
-    pa_long.peak = [peak.replace("_", ":", 1).replace("_", "-", 1) for peak in pa_long.peak]
+    # TODO: nullable strings work with anndata >= 0.13
+    for col in ("peak", "gene", "peak_type"):
+        pa[col] = pa[col].fillna("").astype(object)
 
-    # Make distance values integers with 0 for intergenic peaks (empty/NaN → 0)
-    pa_long["distance"] = pd.to_numeric(pa_long["distance"], errors="coerce").fillna(0).astype(int)
+    pa = pa.set_index("gene")
 
     if "atac" not in adata.uns:
         adata.uns["atac"] = dict()
-    adata.uns["atac"]["peak_annotation"] = pa_long
+    adata.uns["atac"]["peak_annotation"] = pa
 
     if return_annotation:
-        return pa_long
+        return pa
 
 
 def add_peak_annotation_gene_names(
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 requires-python = ">= 3.10"
 requires = [
     "numpy",
-    "pandas",
+    "pandas>=1",
     "matplotlib",
     "seaborn",
     "h5py",
diff --git a/tests/test_atac_tools.py b/tests/test_atac_tools.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 from anndata import AnnData
-from muon._atac.tools import add_peak_annotation
+import muon
 
 
 class TestAddPeakAnnotation(unittest.TestCase):
@@ -22,13 +22,12 @@ def test_empty_distance_values(self):
         adata = AnnData(np.zeros((2, 2)))
         adata.var_names = peaks
 
-        result = add_peak_annotation(adata, pa, return_annotation=True)
+        result = muon.atac.tl.add_peak_annotation(adata, pa, return_annotation=True)
 
-        self.assertEqual(result.distance.dtype, np.int64)
-        # Intergenic peak distance should be 0
-        self.assertIn(0, result.distance.values)
-        # Distal peak distance should be preserved
-        self.assertIn(-173268, result.distance.values)
+        assert result.distance.dtype == pd.Int64Dtype()
+        assert result.distance.iloc[0] is pd.NA
+        assert result.distance.iloc[1] == -173268
+        assert (result.peak == peaks).all()
 
     def test_semicolon_separated_distances(self):
         """Multi-gene peaks with semicolon-separated distances should work."""
@@ -40,11 +39,12 @@ def test_semicolon_separated_distances(self):
         adata = AnnData(np.zeros((1, 1)))
         adata.var_names = ["chr1:100-200"]
 
-        result = add_peak_annotation(adata, pa, return_annotation=True)
+        result = muon.atac.tl.add_peak_annotation(adata, pa, return_annotation=True)
 
-        self.assertEqual(result.distance.dtype, np.int64)
-        self.assertIn(-100, result.distance.values)
-        self.assertIn(200, result.distance.values)
+        assert result.distance.dtype == np.int64
+        assert result.distance.iloc[0] == -100
+        assert result.distance.iloc[1] == 200
+        assert (result.peak.iloc[0] == result.peak.iloc[1] == adata.var_names).all()
 
 
 if __name__ == "__main__":