66from pathlib import Path
77from datetime import datetime
88from warnings import warn
9+ from contextlib import suppress
910
1011import numpy as np
1112import pandas as pd
@@ -113,12 +114,7 @@ def add_peak_annotation(
113114 else :
114115 pa = annotation
115116
116- # Convert null values to empty strings
117- pa .loc [pa .gene .isnull (), "gene" ] = ""
118- # Convert distance to string via object dtype — pandas may infer a numeric
119- # or nullable StringDtype, both of which break on direct "" assignment
120- pa ["distance" ] = pa ["distance" ].astype (object ).fillna ("" ).astype (str )
121- pa .loc [pa .peak_type .isnull (), "peak_type" ] = ""
117+ pa = pa .convert_dtypes ()
122118
123119 # If peak name is not in the annotation table, reconstruct it:
124120 # peak = chrom:start-end
@@ -135,31 +131,41 @@ def add_peak_annotation(
135131 raise AttributeError (
136132 f"Peak annotation does not in contain neighter peak column nor chrom, start, and end columns."
137133 )
134+ else :
135+ # chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN
136+ import ipdb
137+
138+ ipdb .set_trace ()
139+ pa ["peak" ] = pa ["peak" ].str .replace ("_" , ":" , 1 ).str .replace ("_" , "-" , 1 )
138140
139141 # Split genes, distances, and peaks into individual records
140- pa_g = pd .DataFrame (pa .gene .str .split (";" ).tolist (), index = pa .peak ).stack ()
141- pa_d = pd .DataFrame (pa .distance .astype (str ).str .split (";" ).tolist (), index = pa .peak ).stack ()
142- pa_p = pd .DataFrame (pa .peak_type .str .split (";" ).tolist (), index = pa .peak ).stack ()
143142
144- # Make a long dataframe indexed by gene
145- pa_long = pd .concat (
146- [pa_g .reset_index ()[["peak" , 0 ]], pa_d .reset_index ()[[0 ]], pa_p .reset_index ()[[0 ]]], axis = 1
147- )
148- pa_long .columns = ["peak" , "gene" , "distance" , "peak_type" ]
149- pa_long = pa_long .set_index ("gene" )
143+ if pd .api .types .is_string_dtype (pa .distance ):
144+ pa = pa .set_index ("peak" )
145+ pa_g = pa .gene .str .split (";" ).explode ()
146+ pa_d = pa .distance .str .split (";" ).explode ().astype (int )
147+ pa_p = pa .peak_type .str .split (";" ).explode ()
148+
149+ # Make a long dataframe indexed by gene
150+ pa = pd .concat ((pa_g , pa_d , pa_p ), axis = 1 ).reset_index ()
151+ else :
152+ pa = pa [["peak" , "gene" , "distance" , "peak_type" ]]
153+
154+ with suppress (ValueError ): # missing values
155+ pa ["distance" ] = pa ["distance" ].astype (int )
150156
151- # chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN
152- pa_long .peak = [peak .replace ("_" , ":" , 1 ).replace ("_" , "-" , 1 ) for peak in pa_long .peak ]
157+ # TODO: nullable strings work with anndata >= 0.13
158+ for col in ("peak" , "gene" , "peak_type" ):
159+ pa [col ] = pa [col ].fillna ("" ).astype (object )
153160
154- # Make distance values integers with 0 for intergenic peaks (empty/NaN → 0)
155- pa_long ["distance" ] = pd .to_numeric (pa_long ["distance" ], errors = "coerce" ).fillna (0 ).astype (int )
161+ pa = pa .set_index ("gene" )
156162
157163 if "atac" not in adata .uns :
158164 adata .uns ["atac" ] = dict ()
159- adata .uns ["atac" ]["peak_annotation" ] = pa_long
165+ adata .uns ["atac" ]["peak_annotation" ] = pa
160166
161167 if return_annotation :
162- return pa_long
168+ return pa
163169
164170
165171def add_peak_annotation_gene_names (
0 commit comments