Merge pull request #2 from czbiohub-sf/add_spectronaut

mffrank · web-flow · commit 904f31216514 · 2025-11-19T16:33:50.000-08:00
Add spectronaut reader
diff --git a/data/spectronaut_normal.tsv b/data/spectronaut_normal.tsv
diff --git a/docs/source/api/generated/protdata.io.read_spectronaut.rst b/docs/source/api/generated/protdata.io.read_spectronaut.rst
@@ -0,0 +1,6 @@
+﻿protdata.io.read\_spectronaut
+=============================
+
+.. currentmodule:: protdata.io
+
+.. autofunction:: read_spectronaut
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
@@ -23,4 +23,5 @@ Input/Output
    read_maxquant
    read_fragpipe
    read_diann
-   read_mztab
+   read_mztab
+   read_spectronaut
diff --git a/protdata/io/__init__.py b/protdata/io/__init__.py
@@ -1,6 +1,13 @@
-from .maxquant_loader import read_maxquant
 from .diann_loader import read_diann
 from .fragpipe_loader import read_fragpipe
+from .maxquant_loader import read_maxquant
 from .mztab_loader import read_mztab
+from .spectronaut_loader import read_spectronaut
 
-__all__ = ["read_maxquant", "read_diann", "read_fragpipe", "read_mztab"]
+__all__ = [
+    "read_maxquant",
+    "read_diann",
+    "read_fragpipe",
+    "read_mztab",
+    "read_spectronaut",
+]
diff --git a/protdata/io/spectronaut_loader.py b/protdata/io/spectronaut_loader.py
@@ -0,0 +1,167 @@
+import warnings
+from typing import Any, Dict, List, Optional, Tuple
+
+import anndata as ad
+import pandas as pd
+
+
+def read_spectronaut(
+    file: str,
+    intensity_columns: Optional[List[str] | str] = ["PG.Quantity"],
+    index_column: Optional[str] = "PG.ProteinGroups",
+    sample_column: Optional[str] = "R.FileName",
+    sep="\t",
+):
+    """
+    Load Spectronaut results into an AnnData object.
+
+    Parameters
+    ----------
+    file
+        Path to the Spectronaut results file.
+    intensity_columns
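+        Name(s) of the intensity column(s). A single string is treated as a one-element list; the first column fills ``X`` and any further columns are stored in ``layers``.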
+    index_column
+        Name of the column to use as protein index.
+    sample_column
+        Name of the column to use as sample index.
+    sep
+        File separator.
+
+    Returns
+    -------
+    :class:`anndata.AnnData` object with:
+
+        - ``X``: intensity matrix (samples x proteins)
+        - ``var``: protein metadata (indexed by protein group IDs)
+        - ``obs``: sample metadata (indexed by sample names)
+    """
+
+    if isinstance(intensity_columns, str):
+        intensity_columns = [intensity_columns]
+    # Check that intensity columns are all at the same level as the index column
+    index_level = index_column.split(".")[0]
+    intensity_levels = [col.split(".")[0] for col in intensity_columns]
+    if not all(level == index_level for level in intensity_levels):
+        raise ValueError(
+            f"Intensity columns {intensity_columns} are not all at the same level as the index column {index_column}"
+        )
+
+    sample_levels = ("E", "R")
+    possible_levels = ("PG", "PEP", "EG", "FG", "F")
+    df = _read_csv_auto_decimal(file, sep=sep)
+
+    sample_level = sample_column.split(".")[0]
+    sample_idx = sample_levels.index(sample_level)
+    var_levels = sample_levels[: sample_idx + 1]
+    var_levels = tuple(f"{level}." for level in var_levels)
+    var_cols = [
+        col
+        for col in df.columns
+        if col.startswith(var_levels) and df[col].notna().sum() > 0
+    ]
+
+    dfp = df.pivot_table(
+        index=index_column,
+        columns=var_cols,
+        values=intensity_columns,
+        aggfunc="mean",
+        observed=True,
+        dropna=True,
+    ).T
+    dfpx = dfp.loc[intensity_columns[0]]
+
+    # Warn if intensities were not unique per pivot cell and were therefore mean-aggregated. This might not be what the user wants.
+    for intensity_column in intensity_columns:
+        if df[intensity_column].nunique() > dfpx.size:
+            warnings.warn(
+                f"{intensity_column} is not unique within {index_column} and {sample_column}, pivoting with mean aggregation. Make sure this is what you want!"
+            )
+
+    # Construct obs
+    obs = dfpx.index.to_frame().set_index(sample_column)
+
+    # Construct var
+    # Only columns at the index column's level or coarser make sense to keep in var
+    var_levels = tuple(
+        f"{level}."
+        for level in possible_levels[: possible_levels.index(index_level) + 1]
+    )
+    varg = df.loc[:, df.columns.str.startswith(var_levels)].groupby(
+        index_column, observed=True, dropna=False
+    )
+    uniquecols = varg.nunique().eq(1).all(axis=0)
+    uniquecols = uniquecols[uniquecols].index
+    var = varg[uniquecols].first().reindex(dfpx.columns)
+
+    # Get the layers
+    layers = {}
+    for intensity_column in intensity_columns[1:]:
+        layers[intensity_column] = dfp.loc[intensity_column]
+
+    # Create AnnData
+    uns = {
+        "RawInfo": {
+            "Search_Engine": "Spectronaut",
+        },
+    }
+    adata = ad.AnnData(X=dfpx.to_numpy(), obs=obs, var=var, uns=uns, layers=layers)
+    return adata
+
+
+def _read_csv_auto_decimal(
+    path: str,
+    *,
+    sample_rows: int = 2000,
+    trials: List[Dict[str, Optional[str]]] = [
+        {
+            "decimal": ".",
+        },
+        {
+            "decimal": ",",
+        },
+    ],
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """
+    Read a CSV by trying multiple (decimal, thousands) configurations on a sample
+    using a fixed `sep` (no separator inference). Picks the best config and
+    re-reads the full file once.
+
+    Parameters
+    ----------
+    path : str
+        CSV file path.
+    sample_rows : int
+        Number of rows to sample for scoring.
+    trials : list of dict
+        List of ``pd.read_csv`` keyword dicts to try, e.g. {'decimal': <'.' or ','>, 'thousands': <'.' or ',' or None>}; the defaults vary only 'decimal'.
+    **kwargs : Any
+        Passed through to `pd.read_csv`.
+
+    Returns
+    -------
+    DataFrame
+    """
+
+    scores: List[Tuple[int, Dict[str, Optional[str]]]] = []
+    base_kwargs = dict(kwargs)
+
+    for cfg in trials:
+        try:
+            samp = pd.read_csv(path, nrows=sample_rows, **base_kwargs, **cfg)
+            num = samp.select_dtypes(include="number")
+            # Score = (fraction of non-NaN numeric cells) * 10000 + (# numeric cols)
+            frac_valid = 1.0 - num.isna().to_numpy().mean()
+            score = frac_valid * 10000 + num.shape[1]  # or any large scale
+
+            scores.append((score, cfg))
+        except Exception:
+            scores.append((-1, cfg))
+
+    best_score, best_cfg = max(scores, key=lambda x: x[0])
+    if best_score < 0:
+        # All trials failed; fall back to single read with caller's sep and defaults
+        return pd.read_csv(path, **base_kwargs)
+
+    return pd.read_csv(path, **base_kwargs, **best_cfg)
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
@@ -7,6 +7,7 @@
 from protdata.io.fragpipe_loader import read_fragpipe
 from protdata.io.maxquant_loader import read_maxquant
 from protdata.io.mztab_loader import read_mztab
+from protdata.io.spectronaut_loader import read_spectronaut
 
 data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data"))
 
@@ -18,6 +19,7 @@
         ("combined_protein.tsv", read_fragpipe),
         ("SILAC_SQ.mzTab", read_mztab),
         ("report.pg_matrix.tsv", read_diann),
+        ("spectronaut_normal.tsv", read_spectronaut),
     ],
 )
 def test_loader(filename, loader, tmp_path):
diff --git a/tests/test_spectronaut_loader.py b/tests/test_spectronaut_loader.py
@@ -0,0 +1,97 @@
+import os
+import warnings
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from protdata.io.spectronaut_loader import read_spectronaut
+
+# Absolute path to the bundled sample file, resolved relative to this test module
+LOCAL_DATA_PATH = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "../data/spectronaut_normal.tsv")
+)
+
+
+def test_spectronaut_pivot_warning_and_shape():
+    with pytest.warns(UserWarning, match="pivoting with mean aggregation"):
+        adata = read_spectronaut(
+            LOCAL_DATA_PATH,
+            index_column="EG.Workflow",
+            intensity_columns=["EG.Cscore", "EG.Qvalue"],
+        )
+    assert adata.shape == (9, 1)
+
+
+def test_spectronaut_no_warning_and_shape():
+    with warnings.catch_warnings(record=True) as caught:
+        warnings.simplefilter("always")
+        adata = read_spectronaut(
+            LOCAL_DATA_PATH,
+            index_column="EG.ModifiedSequence",
+            intensity_columns=["EG.Cscore", "EG.Qvalue"],
+        )
+        assert len(caught) == 0
+    assert adata.shape == (9, 14)
+
+
+def _count_nonnull_grouped_values(
+    df: pd.DataFrame, index_col: str, sample_col: str, intensity_col: str
+) -> int:
+    grouped = (
+        df[[index_col, sample_col, intensity_col]]
+        .groupby([index_col, sample_col], observed=True, dropna=False)[intensity_col]
+        .apply(lambda s: pd.notna(s).any())
+    )
+    return int(grouped.sum())
+
+
+def _count_nonnull_in_matrix(matrix: np.ndarray) -> int:
+    return int(np.count_nonzero(~np.isnan(matrix)))
+
+
+def test_spectronaut_pivot_nonnull_counts_match_local_data():
+    assert os.path.isfile(LOCAL_DATA_PATH)
+    # Read raw for ground truth counting (file uses comma decimals)
+    df = pd.read_csv(LOCAL_DATA_PATH, sep="\t", decimal=",")
+    adata = read_spectronaut(LOCAL_DATA_PATH)
+
+    grouped_nonnull = _count_nonnull_grouped_values(
+        df,
+        index_col="PG.ProteinGroups",
+        sample_col="R.FileName",
+        intensity_col="PG.Quantity",
+    )
+    matrix_nonnull = _count_nonnull_in_matrix(adata.X)
+
+    assert matrix_nonnull == grouped_nonnull
+
+
+def test_spectronaut_layers_shape_and_presence():
+    assert os.path.isfile(LOCAL_DATA_PATH)
+    adata = read_spectronaut(
+        LOCAL_DATA_PATH,
+        intensity_columns=["PG.Quantity", "PG.Cscore"],
+    )
+    assert "PG.Cscore" in adata.layers
+    assert adata.layers["PG.Cscore"].shape == adata.X.shape
+
+
+def test_spectronaut_uns_and_obs_metadata():
+    assert os.path.isfile(LOCAL_DATA_PATH)
+    adata = read_spectronaut(LOCAL_DATA_PATH)
+    assert adata.uns["RawInfo"]["Search_Engine"] == "Spectronaut"
+    # obs index is the sample identifier column
+    assert adata.obs.index.name == "R.FileName"
+    # expect at least one other sample-level column to be present
+    assert "R.Replicate" in adata.obs.columns or "R.Condition" in adata.obs.columns
+
+
+def test_spectronaut_level_mismatch_raises():
+    assert os.path.isfile(LOCAL_DATA_PATH)
+    with pytest.raises(ValueError):
+        read_spectronaut(
+            LOCAL_DATA_PATH,
+            index_column="EG.Workflow",
+            intensity_columns=["PG.Quantity"],
+        )