Skip to content

Commit 904f312

Browse files
authored
Merge pull request #2 from czbiohub-sf/add_spectronaut
Add spectronaut reader
2 parents a6aae4e + a9c1b13 commit 904f312

File tree

7 files changed

+968
-3
lines changed

7 files changed

+968
-3
lines changed

data/spectronaut_normal.tsv

Lines changed: 685 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
protdata.io.read\_spectronaut
2+
=============================
3+
4+
.. currentmodule:: protdata.io
5+
6+
.. autofunction:: read_spectronaut

docs/source/api/index.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,5 @@ Input/Output
2323
read_maxquant
2424
read_fragpipe
2525
read_diann
26-
read_mztab
26+
read_mztab
27+
read_spectronaut

protdata/io/__init__.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1-
from .maxquant_loader import read_maxquant
21
from .diann_loader import read_diann
32
from .fragpipe_loader import read_fragpipe
3+
from .maxquant_loader import read_maxquant
44
from .mztab_loader import read_mztab
5+
from .spectronaut_loader import read_spectronaut
56

6-
__all__ = ["read_maxquant", "read_diann", "read_fragpipe", "read_mztab"]
7+
__all__ = [
8+
"read_maxquant",
9+
"read_diann",
10+
"read_fragpipe",
11+
"read_mztab",
12+
"read_spectronaut",
13+
]

protdata/io/spectronaut_loader.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import warnings
2+
from typing import Any, Dict, List, Optional, Tuple
3+
4+
import anndata as ad
5+
import pandas as pd
6+
7+
8+
def read_spectronaut(
    file: str,
    intensity_columns: Optional[List[str] | str] = None,
    index_column: Optional[str] = "PG.ProteinGroups",
    sample_column: Optional[str] = "R.FileName",
    sep="\t",
):
    """
    Load Spectronaut results into an AnnData object.

    Parameters
    ----------
    file
        Path to the Spectronaut results file.
    intensity_columns
        Name, or list of names, of the intensity column(s). The first entry
        becomes ``X``; every further entry is stored in ``layers`` under its
        column name. All entries must share the level prefix (the part before
        the first ``.``) of ``index_column``. Defaults to ``["PG.Quantity"]``.
    index_column
        Name of the column to use as protein index. Its level prefix must be
        one of ``PG``, ``PEP``, ``EG``, ``FG`` or ``F``.
    sample_column
        Name of the column to use as sample index. Its level prefix must be
        ``E`` or ``R``.
    sep
        File separator.

    Returns
    -------
    :class:`anndata.AnnData` object with:

    - ``X``: intensity matrix (samples x proteins)
    - ``var``: protein metadata (indexed by protein group IDs)
    - ``obs``: sample metadata (indexed by sample names)
    - ``layers``: one matrix per additional intensity column

    Raises
    ------
    ValueError
        If no intensity column is given, if the intensity columns are not at
        the same level as ``index_column``, or if the level of ``index_column``
        or ``sample_column`` is not supported.
    """
    sample_levels = ("E", "R")
    possible_levels = ("PG", "PEP", "EG", "FG", "F")

    # Normalise to a fresh list so a caller-owned list is never aliased.
    if intensity_columns is None:
        intensity_columns = ["PG.Quantity"]
    elif isinstance(intensity_columns, str):
        intensity_columns = [intensity_columns]
    else:
        intensity_columns = list(intensity_columns)
    if not intensity_columns:
        raise ValueError("At least one intensity column is required")

    # Check that intensity columns are all at the same level as the index column
    index_level = index_column.split(".")[0]
    if any(col.split(".")[0] != index_level for col in intensity_columns):
        raise ValueError(
            f"Intensity columns {intensity_columns} are not all at the same level as the index column {index_column}"
        )

    # Validate both levels up front so a bad argument fails before the file is
    # read, with a message that names the offending column.
    if index_level not in possible_levels:
        raise ValueError(
            f"Index column {index_column} has unsupported level {index_level!r}; "
            f"expected one of {possible_levels}"
        )
    sample_level = sample_column.split(".")[0]
    if sample_level not in sample_levels:
        raise ValueError(
            f"Sample column {sample_column} has unsupported level {sample_level!r}; "
            f"expected one of {sample_levels}"
        )

    df = _read_csv_auto_decimal(file, sep=sep)

    # Sample-describing columns: every level up to and including the level of
    # ``sample_column``, skipping columns that are entirely empty.
    sample_prefixes = tuple(
        f"{level}."
        for level in sample_levels[: sample_levels.index(sample_level) + 1]
    )
    obs_cols = [
        col
        for col in df.columns
        if col.startswith(sample_prefixes) and df[col].notna().sum() > 0
    ]

    # After the transpose, rows are (intensity column, *obs_cols) and columns
    # are the values of ``index_column``.
    dfp = df.pivot_table(
        index=index_column,
        columns=obs_cols,
        values=intensity_columns,
        aggfunc="mean",
        observed=True,
        dropna=True,
    ).T
    dfpx = dfp.loc[intensity_columns[0]]

    # Report if intensities were not unique for pivoting and aggregated. This might not be what the user wants.
    # NOTE(review): this is a heuristic (distinct raw values vs. matrix cells),
    # not an exact duplicate check.
    for intensity_column in intensity_columns:
        if df[intensity_column].nunique() > dfpx.size:
            warnings.warn(
                f"{intensity_column} is not unique within {index_column} and {sample_column}, pivoting with mean aggregation. Make sure this is what you want!",
                stacklevel=2,
            )

    # Construct obs
    obs = dfpx.index.to_frame().set_index(sample_column)

    # Construct var
    # Only columns that are coarser than the value column make sense to keep in var
    var_prefixes = tuple(
        f"{level}."
        for level in possible_levels[: possible_levels.index(index_level) + 1]
    )
    varg = df.loc[:, df.columns.str.startswith(var_prefixes)].groupby(
        index_column, observed=True, dropna=False
    )
    # Keep only columns holding exactly one distinct value per index entry.
    uniquecols = varg.nunique().eq(1).all(axis=0)
    uniquecols = uniquecols[uniquecols].index
    var = varg[uniquecols].first().reindex(dfpx.columns)

    # Get the layers. ``dropna=True`` drops all-NaN rows per intensity column,
    # so each layer is aligned to X's rows and columns to keep shapes equal.
    layers = {
        intensity_column: dfp.loc[intensity_column].reindex(
            index=dfpx.index, columns=dfpx.columns
        )
        for intensity_column in intensity_columns[1:]
    }

    # Create AnnData
    uns = {
        "RawInfo": {
            "Search_Engine": "Spectronaut",
        },
    }
    adata = ad.AnnData(X=dfpx.to_numpy(), obs=obs, var=var, uns=uns, layers=layers)
    return adata
110+
111+
112+
def _read_csv_auto_decimal(
    path: str,
    *,
    sample_rows: int = 2000,
    trials: Optional[List[Dict[str, Optional[str]]]] = None,
    **kwargs: Any,
) -> pd.DataFrame:
    """
    Read a CSV by trying several number-format configurations on a sample.

    Each trial is read with ``nrows=sample_rows`` and scored; the best-scoring
    configuration is then used to read the full file once. The separator is
    never inferred: pass ``sep`` through ``kwargs``.

    Parameters
    ----------
    path : str
        CSV file path.
    sample_rows : int
        Number of rows to sample for scoring.
    trials : list of dict, optional
        Keyword sets to try, e.g. ``{'decimal': ','}`` or
        ``{'decimal': ',', 'thousands': '.'}``. Defaults to trying
        ``decimal='.'`` and then ``decimal=','``. On equal scores the earlier
        trial wins. An empty list disables detection.
    **kwargs : Any
        Passed through to `pd.read_csv`.

    Returns
    -------
    DataFrame
    """
    if trials is None:
        trials = [{"decimal": "."}, {"decimal": ","}]
    if not trials:
        # Nothing to compare: read once with the caller's settings.
        return pd.read_csv(path, **kwargs)

    scores: List[Tuple[float, Dict[str, Optional[str]]]] = []
    for cfg in trials:
        try:
            sample = pd.read_csv(path, nrows=sample_rows, **kwargs, **cfg)
        except Exception:
            # Deliberately best-effort: a failing trial (for example a keyword
            # that clashes with ``kwargs``) is ranked last instead of aborting.
            scores.append((-1.0, cfg))
            continue

        numeric = sample.select_dtypes(include="number")
        if numeric.size:
            frac_valid = 1.0 - float(numeric.isna().to_numpy().mean())
        else:
            # Without numeric cells the mean would be NaN, and a NaN score
            # cannot be ordered against the other trials.
            frac_valid = 0.0
        # Score = (fraction of non-NaN numeric cells) * 10000 + (# numeric cols)
        scores.append((frac_valid * 10000 + numeric.shape[1], cfg))

    best_score, best_cfg = max(scores, key=lambda item: item[0])
    if best_score < 0:
        # All trials failed; fall back to single read with caller's sep and defaults
        return pd.read_csv(path, **kwargs)

    return pd.read_csv(path, **kwargs, **best_cfg)

tests/test_loaders.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from protdata.io.fragpipe_loader import read_fragpipe
88
from protdata.io.maxquant_loader import read_maxquant
99
from protdata.io.mztab_loader import read_mztab
10+
from protdata.io.spectronaut_loader import read_spectronaut
1011

1112
data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data"))
1213

@@ -18,6 +19,7 @@
1819
("combined_protein.tsv", read_fragpipe),
1920
("SILAC_SQ.mzTab", read_mztab),
2021
("report.pg_matrix.tsv", read_diann),
22+
("spectronaut_normal.tsv", read_spectronaut),
2123
],
2224
)
2325
def test_loader(filename, loader, tmp_path):

tests/test_spectronaut_loader.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import os
2+
import warnings
3+
4+
import numpy as np
5+
import pandas as pd
6+
import pytest
7+
8+
from protdata.io.spectronaut_loader import read_spectronaut
9+
10+
# Absolute path provided by the user for local validation
11+
LOCAL_DATA_PATH = os.path.abspath(
12+
os.path.join(os.path.dirname(__file__), "../data/spectronaut_normal.tsv")
13+
)
14+
15+
16+
def test_spectronaut_pivot_warning_and_shape():
    """Indexing by ``EG.Workflow`` must warn about mean aggregation and give a (9, 1) matrix."""
    loader_kwargs = {
        "index_column": "EG.Workflow",
        "intensity_columns": ["EG.Cscore", "EG.Qvalue"],
    }
    with pytest.warns(UserWarning, match="pivoting with mean aggregation"):
        result = read_spectronaut(LOCAL_DATA_PATH, **loader_kwargs)

    expected_shape = (9, 1)
    assert result.shape == expected_shape
24+
25+
26+
def test_spectronaut_no_warning_and_shape():
    """Indexing by ``EG.ModifiedSequence`` must load silently and give a (9, 14) matrix."""
    loader_kwargs = {
        "index_column": "EG.ModifiedSequence",
        "intensity_columns": ["EG.Cscore", "EG.Qvalue"],
    }
    with warnings.catch_warnings(record=True) as recorded:
        # Record every warning, including ones that would normally be deduplicated.
        warnings.simplefilter("always")
        result = read_spectronaut(LOCAL_DATA_PATH, **loader_kwargs)

    assert len(recorded) == 0
    assert result.shape == (9, 14)
36+
37+
38+
def _count_nonnull_grouped_values(
    df: pd.DataFrame, index_col: str, sample_col: str, intensity_col: str
) -> int:
    """Count the (index, sample) groups that hold at least one non-null intensity."""
    group_keys = [index_col, sample_col]
    relevant = df[[*group_keys, intensity_col]]
    intensities_by_group = relevant.groupby(
        group_keys, observed=True, dropna=False
    )[intensity_col]
    has_value = intensities_by_group.apply(lambda values: pd.notna(values).any())
    return int(has_value.sum())
47+
48+
49+
def _count_nonnull_in_matrix(matrix: np.ndarray) -> int:
    """Return how many entries of ``matrix`` are not NaN."""
    present = ~np.isnan(matrix)
    return int(np.sum(present))
51+
52+
53+
def test_spectronaut_pivot_nonnull_counts_match_local_data():
    """Non-null cells in ``X`` must equal the non-null (protein, sample) groups in the raw table."""
    assert os.path.isfile(LOCAL_DATA_PATH)

    # Read raw for ground truth counting (file uses comma decimals)
    raw_table = pd.read_csv(LOCAL_DATA_PATH, sep="\t", decimal=",")
    expected_count = _count_nonnull_grouped_values(
        raw_table,
        index_col="PG.ProteinGroups",
        sample_col="R.FileName",
        intensity_col="PG.Quantity",
    )

    loaded = read_spectronaut(LOCAL_DATA_PATH)
    observed_count = _count_nonnull_in_matrix(loaded.X)

    assert observed_count == expected_count
68+
69+
70+
def test_spectronaut_layers_shape_and_presence():
    """A second intensity column must appear as a layer shaped like ``X``."""
    assert os.path.isfile(LOCAL_DATA_PATH)

    extra_column = "PG.Cscore"
    loaded = read_spectronaut(
        LOCAL_DATA_PATH, intensity_columns=["PG.Quantity", "PG.Cscore"]
    )

    assert extra_column in loaded.layers
    layer = loaded.layers[extra_column]
    assert layer.shape == loaded.X.shape
78+
79+
80+
def test_spectronaut_uns_and_obs_metadata():
    """Default loading must record the search engine and index ``obs`` by ``R.FileName``."""
    assert os.path.isfile(LOCAL_DATA_PATH)
    loaded = read_spectronaut(LOCAL_DATA_PATH)

    raw_info = loaded.uns["RawInfo"]
    assert raw_info["Search_Engine"] == "Spectronaut"

    # obs index is the sample identifier column
    assert loaded.obs.index.name == "R.FileName"

    # expect at least one other sample-level column to be present
    obs_columns = loaded.obs.columns
    assert any(name in obs_columns for name in ("R.Replicate", "R.Condition"))
88+
89+
90+
def test_spectronaut_level_mismatch_raises():
    """An ``EG``-level index combined with a ``PG``-level intensity must raise ``ValueError``."""
    assert os.path.isfile(LOCAL_DATA_PATH)

    mismatched_kwargs = {
        "index_column": "EG.Workflow",
        "intensity_columns": ["PG.Quantity"],
    }
    with pytest.raises(ValueError):
        read_spectronaut(LOCAL_DATA_PATH, **mismatched_kwargs)

0 commit comments

Comments
 (0)