Skip to content

various fixes and additions for IO #19

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Mar 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ default_stages:
minimum_pre_commit_version: 2.16.0
repos:
- repo: https://github.com/psf/black
rev: 22.12.0
rev: 23.1.0
hooks:
- id: black
- repo: https://github.com/pre-commit/mirrors-prettier
Expand All @@ -19,11 +19,11 @@ repos:
hooks:
- id: blacken-docs
- repo: https://github.com/PyCQA/isort
rev: 5.11.4
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.991
rev: v1.0.1
hooks:
- id: mypy
additional_dependencies: [numpy==1.24.0]
Expand All @@ -50,7 +50,7 @@ repos:
- id: trailing-whitespace
- id: check-case-conflict
- repo: https://github.com/PyCQA/autoflake
rev: v2.0.0
rev: v2.0.1
hooks:
- id: autoflake
args:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"joblib",
"imagecodecs",
"dask-image",
"pyarrow",
]

[project.optional-dependencies]
Expand Down
4 changes: 4 additions & 0 deletions src/spatialdata_io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from importlib.metadata import version

from spatialdata_io.readers.cosmx import cosmx
from spatialdata_io.readers.mcmicro import mcmicro
from spatialdata_io.readers.steinbock import steinbock
from spatialdata_io.readers.visium import visium
from spatialdata_io.readers.xenium import xenium

__all__ = [
"visium",
"xenium",
"cosmx",
"mcmicro",
"steinbock",
]

__version__ = version("spatialdata-io")
37 changes: 36 additions & 1 deletion src/spatialdata_io/_constants/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ class XeniumKeys(ModeEnum):
CELL_METADATA_FILE = "cells.parquet"
CELL_X = "x_centroid"
CELL_Y = "y_centroid"
CELL_AREA = 'cell_area'
CELL_AREA = "cell_area"
CELL_NUCLEUS_AREA = "nucleus_area"

# morphology images
MORPHOLOGY_MIP_FILE = "morphology_mip.ome.tif"
Expand Down Expand Up @@ -85,3 +86,37 @@ class VisiumKeys(ModeEnum):
SPOTS_FILE = "spatial/tissue_positions.csv"
SPOTS_X = "pxl_row_in_fullres"
SPOTS_Y = "pxl_col_in_fullres"


@unique
class SteinbockKeys(ModeEnum):
    """Keys for *Steinbock* formatted dataset.

    File names, directory names, and file-name suffixes expected in the
    output directory produced by the steinbock pipeline.
    """

    # files and directories
    CELLS_FILE = "cells.h5ad"  # single-cell table (.h5ad, AnnData format by extension)
    DEEPCELL_MASKS_DIR = "masks_deepcell"  # segmentation masks produced by DeepCell
    ILASTIK_MASKS_DIR = "masks_ilastik"  # segmentation masks produced by Ilastik
    IMAGES_DIR = "ome"  # directory holding the OME-TIFF images

    # suffixes for images and labels
    IMAGE_SUFFIX = ".ome.tiff"  # suffix of image files inside IMAGES_DIR
    LABEL_SUFFIX = ".tiff"  # suffix of label (mask) files inside the masks directories


@unique
class McmicroKeys(ModeEnum):
    """Keys for *Mcmicro* formatted dataset.

    File names, directory names, suffixes/prefixes, and column names expected
    in the output directory produced by the MCMICRO pipeline.
    """

    # files and directories
    CELL_FEATURES_SUFFIX = "--unmicst_cell.csv"  # per-cell quantification table suffix
    QUANTIFICATION_DIR = "quantification"  # directory with the quantification CSVs
    MARKERS_FILE = "markers.csv"  # marker/channel metadata file
    IMAGES_DIR = "registration"  # directory with registered images
    IMAGE_SUFFIX = ".ome.tif"  # suffix of image files inside IMAGES_DIR
    LABELS_DIR = "segmentation"  # directory with segmentation label images
    LABELS_PREFIX = "unmicst-"  # file-name prefix of UnMICST segmentation outputs

    # metadata
    COORDS_X = "X_centroid"  # column with cell centroid x coordinate
    COORDS_Y = "Y_centroid"  # column with cell centroid y coordinate
    INSTANCE_KEY = "CellID"  # column with the per-cell integer identifier
73 changes: 47 additions & 26 deletions src/spatialdata_io/readers/cosmx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,29 @@

import os
import re
import tempfile
from collections.abc import Mapping
from pathlib import Path
from types import MappingProxyType
from typing import Any, Optional
import pyarrow as pa
import pyarrow.parquet as pq

import dask.array as da
import numpy as np
import pandas as pd
import pyarrow as pa
from anndata import AnnData
from dask_image.imread import imread
import dask.array as da
from dask.dataframe.core import DataFrame as DaskDataFrame
from dask_image.imread import imread
from scipy.sparse import csr_matrix

# from spatialdata._core.core_utils import xy_cs
from skimage.transform import estimate_transform
from spatialdata import SpatialData
from spatialdata._core.models import Image2DModel, Labels2DModel, TableModel, PointsModel
from spatialdata._core.models import (
Image2DModel,
Labels2DModel,
PointsModel,
TableModel,
)

# from spatialdata._core.ngff.ngff_coordinate_system import NgffAxis # , CoordinateSystem
from spatialdata._core.transformations import Affine, Identity
Expand All @@ -41,7 +44,6 @@
def cosmx(
path: str | Path,
dataset_id: Optional[str] = None,
# shape_size: float | int = 1,
transcripts: bool = True,
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
Expand All @@ -67,8 +69,8 @@ def cosmx(
Path to the root directory containing *Nanostring* files.
dataset_id
Name of the dataset.
shape_size
Size of the shape to be used for the centroids of the labels.
transcripts
Whether to also read in transcripts information.
imread_kwargs
Keyword arguments passed to :func:`dask_image.imread.imread`.
image_models_kwargs
Expand Down Expand Up @@ -118,7 +120,7 @@ def cosmx(

obs = pd.read_csv(path / meta_file, header=0, index_col=CosmxKeys.INSTANCE_KEY)
obs[CosmxKeys.FOV] = pd.Categorical(obs[CosmxKeys.FOV].astype(str))
obs[CosmxKeys.REGION_KEY] = pd.Categorical(obs[CosmxKeys.FOV].astype(str).apply(lambda s: "/labels/" + s))
obs[CosmxKeys.REGION_KEY] = pd.Categorical(obs[CosmxKeys.FOV].astype(str).apply(lambda s: s + "_labels"))
obs[CosmxKeys.INSTANCE_KEY] = obs.index.astype(np.int64)
obs.rename_axis(None, inplace=True)
obs.index = obs.index.astype(str).str.cat(obs[CosmxKeys.FOV].values, sep="_")
Expand All @@ -141,12 +143,6 @@ def cosmx(

fovs_counts = list(map(str, adata.obs.fov.astype(int).unique()))

# TODO(giovp): uncomment once transform is ready
# input_cs = CoordinateSystem("cxy", axes=[c_axis, y_axis, x_axis])
# input_cs_labels = CoordinateSystem("cxy", axes=[y_axis, x_axis])
# output_cs = CoordinateSystem("global", axes=[c_axis, y_axis, x_axis])
# output_cs_labels = CoordinateSystem("global", axes=[y_axis, x_axis])

affine_transforms_to_global = {}

for fov in fovs_counts:
Expand All @@ -163,7 +159,10 @@ def cosmx(

table.obsm["global"] = table.obs[[CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL]].to_numpy()
table.obsm["spatial"] = table.obs[[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL]].to_numpy()
table.obs.drop(columns=[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL, CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL], inplace=True)
table.obs.drop(
columns=[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL, CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL],
inplace=True,
)

# prepare to read images and labels
file_extensions = (".jpg", ".png", ".jpeg", ".tif", ".tiff")
Expand Down Expand Up @@ -200,7 +199,6 @@ def cosmx(
flipped_im = da.flip(im, axis=0)
parsed_im = Image2DModel.parse(
flipped_im,
name=fov,
transformations={
fov: Identity(),
"global": aff,
Expand All @@ -209,7 +207,7 @@ def cosmx(
dims=("y", "x", "c"),
**image_models_kwargs,
)
images[fov] = parsed_im
images[f"{fov}_image"] = parsed_im
else:
logger.warning(f"FOV {fov} not found in counts file. Skipping image {fname}.")

Expand All @@ -224,7 +222,6 @@ def cosmx(
flipped_la = da.flip(la, axis=0)
parsed_la = Labels2DModel.parse(
flipped_la,
name=fov,
transformations={
fov: Identity(),
"global": aff,
Expand All @@ -233,15 +230,40 @@ def cosmx(
dims=("y", "x"),
**image_models_kwargs,
)
labels[fov] = parsed_la
labels[f"{fov}_labels"] = parsed_la
else:
logger.warning(f"FOV {fov} not found in counts file. Skipping labels {fname}.")

points: dict[str, DaskDataFrame] = {}
if transcripts:
# assert transcripts_file is not None
# from pyarrow.csv import read_csv
#
# ptable = read_csv(path / transcripts_file) # , header=0)
# for fov in fovs_counts:
# aff = affine_transforms_to_global[fov]
# sub_table = ptable.filter(pa.compute.equal(ptable.column(CosmxKeys.FOV), int(fov))).to_pandas()
# sub_table[CosmxKeys.INSTANCE_KEY] = sub_table[CosmxKeys.INSTANCE_KEY].astype("category")
# # we rename z because we want to treat the data as 2d
# sub_table.rename(columns={"z": "z_raw"}, inplace=True)
# points[fov] = PointsModel.parse(
# sub_table,
# coordinates={"x": CosmxKeys.X_LOCAL_TRANSCRIPT, "y": CosmxKeys.Y_LOCAL_TRANSCRIPT},
# feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT,
# instance_key=CosmxKeys.INSTANCE_KEY,
# transformations={
# fov: Identity(),
# "global": aff,
# "global_only_labels": aff,
# },
# )
# let's convert the .csv to .parquet and let's read it with pyarrow.parquet for faster subsetting
import tempfile

import pyarrow.parquet as pq

with tempfile.TemporaryDirectory() as tmpdir:
print("converting .csv to .parquet... ", end="")
print("converting .csv to .parquet to improve the speed of the slicing operations... ", end="")
assert transcripts_file is not None
transcripts_data = pd.read_csv(path / transcripts_file, header=0)
transcripts_data.to_parquet(Path(tmpdir) / "transcripts.parquet")
Expand All @@ -251,10 +273,10 @@ def cosmx(
for fov in fovs_counts:
aff = affine_transforms_to_global[fov]
sub_table = ptable.filter(pa.compute.equal(ptable.column(CosmxKeys.FOV), int(fov))).to_pandas()
sub_table[CosmxKeys.INSTANCE_KEY] = sub_table[CosmxKeys.INSTANCE_KEY].astype('category')
sub_table[CosmxKeys.INSTANCE_KEY] = sub_table[CosmxKeys.INSTANCE_KEY].astype("category")
# we rename z because we want to treat the data as 2d
sub_table.rename(columns={'z': 'z_raw'}, inplace=True)
points[fov] = PointsModel.parse(
sub_table.rename(columns={"z": "z_raw"}, inplace=True)
points[f"{fov}_points"] = PointsModel.parse(
sub_table,
coordinates={"x": CosmxKeys.X_LOCAL_TRANSCRIPT, "y": CosmxKeys.Y_LOCAL_TRANSCRIPT},
feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT,
Expand All @@ -266,7 +288,6 @@ def cosmx(
},
)


# TODO: what to do with fov file?
# if fov_file is not None:
# fov_positions = pd.read_csv(path / fov_file, header=0, index_col=CosmxKeys.FOV)
Expand Down
Loading