From 9412eaa1eba66467cd6381ce80b86836a1f2778d Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Tue, 18 Feb 2025 21:14:11 +0000 Subject: [PATCH 01/13] Add G4X reader for samples and full runs --- src/spatialdata_io/__init__.py | 2 + src/spatialdata_io/_constants/_constants.py | 47 ++ src/spatialdata_io/readers/g4x.py | 621 ++++++++++++++++++++ 3 files changed, 670 insertions(+) create mode 100644 src/spatialdata_io/readers/g4x.py diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index 48f784bd..d857e2f8 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -16,8 +16,10 @@ xenium_aligned_image, xenium_explorer_selection, ) +from spatialdata_io.readers.g4x import g4x __all__ = [ + "g4x", "curio", "seqfish", "visium", diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index e4f77d5f..204bbfed 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -392,3 +392,50 @@ class VisiumHDKeys(ModeEnum): MICROSCOPE_COLROW_TO_SPOT_COLROW = ("microscope_colrow_to_spot_colrow",) SPOT_COLROW_TO_MICROSCOPE_COLROW = ("spot_colrow_to_microscope_colrow",) FILE_FORMAT = "file_format" + + +class G4XKeys(str, ModeEnum): + + # H&E + HE_DIR = "h_and_e" + HE_PATTERN = "*.jp2" + HE_IMG2DMODEL_KWARGS = { + "dims": ["y", "x", "c"], + "scale_factors": [2, 2, 2], + "chunks": "auto" + } + + # Nuclei + NUCLEI_BOUNDARIES_KEY = "nuclei" + CELL_BOUNDARIES_KEY = "nuclei_exp" + SEGMENTATION_DIR = "segmentation" + SEGMENTATION_PATTERN = "segmentation_mask.npz" + SEG_IMG2DMODEL_KWARGS = { + "dims": ["y", "x"], + "chunks": "auto" + } + + # Protein + PROTEIN_KEY = "protein" + PROTEIN_DIR = "protein" + PROTEIN_PATTERN = "*.jp2" + PROTEIN_IMG2DMODEL_KWARGS = { + "dims": ["c", "y", "x"], + "scale_factors": [2, 2, 2], + "chunks": "auto" + } + + # Transcripts + TRANSCRIPTS_DIR = "rna" + TRANSCRIPTS_PATTERN = "*transcript_table.csv.gz" + TRANSCRIPTS_COORDS = { + "x": "x_pixel_coordinate", + "y": "y_pixel_coordinate" + } + TRANSCRIPTS_FEATURE_KEY = "gene_name" + TRANSCRIPTS_SWAP_XY = True + + # Tables + TABLES_DIR = "single_cell_data" + TABLE_PATTERN = "feature_matrix.h5" + TABLE_KEY = "table" diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py new file mode 100644 index 00000000..78c7f007 --- /dev/null +++ b/src/spatialdata_io/readers/g4x.py @@ -0,0 +1,621 @@ +from __future__ import annotations + +import re +from pathlib import Path +from typing import Union + +import numpy as np +from anndata.io import read_h5ad +from dask_image.imread import imread +from spatialdata import SpatialData, to_polygons +from spatialdata._logging import logger +from spatialdata.models import ( + Image2DModel, + Labels2DModel, + PointsModel, + TableModel, +) +import dask.dataframe as dd +from tqdm.auto import tqdm + +from spatialdata_io._constants._constants import G4XKeys +from spatialdata_io._docs import inject_docs + +__all__ = ["g4x_sample", "g4x_run"] + + +@inject_docs(xx=G4XKeys) +def g4x_run( + input_path: Union[str, Path], + output_path: Union[str, Path, None] = None, + include_he: bool = True, + include_segmentation: bool = True, + include_protein: bool = True, + include_transcripts: bool = True, + include_tables: bool = True, + mode: str = "append", +): + """ + Create SpatialData objects for each sample in a run directory. + + See :func:`g4x_sample` for more details. 
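+
+    A minimal usage sketch (the paths are illustrative, not part of the dataset):
+
+    >>> sdatas = g4x_run("/data/g4x_run", output_path="/data/zarr_out")
+    >>> sdatas[0]  # SpatialData object for the first sample found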
+ + Parameters + ---------- + input_path : Union[str, Path] + Path to input directory containing run data. Assumes each subdirectory contains a sample. e.g. `input_path/A01`, `input_path/B01`, etc. + output_path : Union[str, Path] + Path to directory where SpatialData zarr stores will be written. If None, zarr stores will be written to each sample directory found in `input_path`. + include_he : bool + Include H&E image if available. + include_segmentation : bool + Include segmentation if available. + include_protein : bool + Include protein images if available. + include_transcripts : bool + Include transcript data if available. + include_tables : bool + Include tables if available. + mode : str + Mode for handling existing elements. Options: + - "append": Skip existing elements (default) + - "overwrite": Replace existing elements + Returns + ------- + sdatas : list[SpatialData] + List of SpatialData objects + """ + if isinstance(input_path, str): + input_path = Path(input_path) + if isinstance(output_path, str): + output_path = Path(output_path) + + # Make sure paths match expected format e.g. A01, B01 + sample_input_paths = [] + for p in input_path.iterdir(): + if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name): + sample_input_paths.append(p) + logger.debug(f"Found {len(sample_input_paths)} samples.") + + if output_path is None: + sample_output_paths = [ + input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths + ] + else: + sample_output_paths = [ + output_path / f"{p.name}.zarr" for p in sample_input_paths + ] + + kwargs = { + "include_he": include_he, + "include_segmentation": include_segmentation, + "include_protein": include_protein, + "include_transcripts": include_transcripts, + "include_tables": include_tables, + "mode": mode, + } + + sdatas = [] + for sample_input_path, sample_output_path in tqdm( + zip(sample_input_paths, sample_output_paths), + total=len(sample_input_paths), + desc="Processing samples", + ): + sdata = g4x_sample( + input_path=sample_input_path, + output_zarr_path=sample_output_path, + **kwargs, + ) + sdatas.append(sdata) + return sdatas + + +def g4x_sample( + input_path: Union[str, Path], + output_zarr_path: Union[str, Path], + include_he: bool = True, + include_segmentation: bool = True, + include_protein: bool = True, + include_transcripts: bool = True, + include_tables: bool = True, + mode: str = "append", +) -> SpatialData: + """ + Create a SpatialData object from a G4X sample dataset. + + This function looks for the following files: + + - ``{xx.HE_DIR!r}/{xx.HE_PATTERN!r}``: H&E images. + - ``{xx.NUCLEI_DIR!r}/{xx.NUCLEI_PATTERN!r}``: Segmentation files. + - ``{xx.PROTEIN_DIR!r}/{xx.PROTEIN_PATTERN!r}``: Protein images. + - ``{xx.TRANSCRIPTS_DIR!r}/{xx.TRANSCRIPTS_PATTERN!r}``: Transcript tables. + - ``{xx.TABLES_DIR!r}/{xx.TABLE_PATTERN!r}``: Table file. + + Parameters + ---------- + input_path : str + Path to input directory containing G4X data + output_path : str + Writes/appends to a SpatialData zarr store at this path + include_he : bool + Include H&E image if available. + include_segmentation : bool + Include segmentation if available. + include_protein : bool + Include protein images if available. + include_transcripts : bool + Include transcript data if available. + include_tables : bool + Include tables if available. 
+ mode : str + Mode for creating SpatialData object ('new' or 'append') + + Returns + ------- + SpatialData + SpatialData object containing requested data elements + """ + if isinstance(input_path, str): + input_path = Path(input_path) + if isinstance(output_zarr_path, str): + output_zarr_path = Path(output_zarr_path) + if output_zarr_path.suffix != ".zarr": + logger.error(f"Output path must end with '.zarr'. Got {output_zarr_path}") + raise ValueError( + f"Output path must end with '.zarr'. Got {output_zarr_path}" + ) + + if mode not in ["append", "overwrite"]: + msg = f"Invalid mode '{mode}'. Must be one of: 'append', 'overwrite'" + logger.error(msg) + raise ValueError(msg) + + if output_zarr_path.exists(): + logger.debug(f"Found existing {output_zarr_path}") + sdata = SpatialData.read(output_zarr_path) + else: + logger.debug(f"Creating new SpatialData object at {output_zarr_path}") + sdata = SpatialData() + sdata.write(output_zarr_path) + + # Create progress bar for main steps + steps = [] + steps.append("H&E") if include_he else None + steps.append("Segmentation") if include_segmentation else None + steps.append("Protein Images") if include_protein else None + steps.append("Transcripts") if include_transcripts else None + steps.append("Tables") if include_tables else None + with tqdm(total=len(steps)) as pbar: + if include_he: + pbar.set_description(steps[pbar.n]) + _write_he( + sdata, + he_dir=G4XKeys.HE_DIR, + pattern=G4XKeys.HE_PATTERN, + mode=mode, + **G4XKeys.HE_IMG2DMODEL_KWARGS, + ) + pbar.update(1) + + if include_segmentation: + pbar.set_description(steps[pbar.n]) + _write_segmentation( + sdata, + nuclei_dir=G4XKeys.SEGMENTATION_DIR, + pattern=G4XKeys.SEGMENTATION_PATTERN, + nuclei_key=G4XKeys.NUCLEI_BOUNDARIES_KEY, + nuclei_exp_key=G4XKeys.CELL_BOUNDARIES_KEY, + mode=mode, + **G4XKeys.SEG_IMG2DMODEL_KWARGS, + ) + pbar.update(1) + + if include_protein: + pbar.set_description(steps[pbar.n]) + _write_protein_images( + sdata, + protein_dir=G4XKeys.PROTEIN_DIR, + pattern=G4XKeys.PROTEIN_PATTERN, + mode=mode, + **G4XKeys.PROTEIN_IMG2DMODEL_KWARGS, + ) + pbar.update(1) + + if include_transcripts: + pbar.set_description(steps[pbar.n]) + _write_transcripts( + sdata, + transcripts_dir=G4XKeys.TRANSCRIPTS_DIR, + pattern=G4XKeys.TRANSCRIPTS_PATTERN, + coordinates=G4XKeys.TRANSCRIPTS_COORDS, + feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY, + swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY, + mode=mode, + ) + pbar.update(1) + + if include_tables: + pbar.set_description(steps[pbar.n]) + _write_table( + sdata, + table_path=G4XKeys.TABLE_PATTERN, + mode=mode, + ) + pbar.update(1) + + logger.debug("Done!") + + # Read back to enable lazy loading + sdata = SpatialData.read(output_zarr_path) + return sdata + + +def _write_he( + sdata: SpatialData, + he_dir: Union[str, None], + pattern: str, + mode: str = "append", + **kwargs, +): + """ + Write H&E images to SpatialData object. Each H&E image is stored as a separate object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + he_dir : Union[str, None] + Path to directory containing H&E images. If None, this step will be skipped. + pattern : str + Glob pattern for selecting H&E images. + mode : str, optional + Mode for handling existing elements. Options: + - "append": Skip if element exists (default) + - "overwrite": Replace if element exists + kwargs : dict + Additional arguments passed to Image2DModel.parse() + + Modifies + ------- + sdata : SpatialData + SpatialData object with H&E images stored in sdata["{img_name}"] e.g. 
"h_and_e" + """ + if he_dir is None: + logger.debug("H&E skipped...") + return + + # Get list of H&E images + he_dir = Path(he_dir) + if he_dir.is_file(): + he_files = [he_dir] + else: + he_files = list(Path(he_dir).glob(pattern)) + if not he_files: + logger.warning(f"No H&E images found in {he_dir}") + return + he_files.sort() + + logger.debug(f"Found {len(he_files)} H&E images") + + # Process each H&E image + for he_file in tqdm(he_files, desc="Processing H&E images", leave=False): + # Extract sample ID from filename (e.g., "C02" from "C02_digital_he.jp2") + logger.debug(f"Processing {he_file}") + img_key = he_file.stem + + # Check if element exists + if f"images/{img_key}" in sdata.elements_paths_on_disk(): + if mode == "append": + logger.debug(f"H&E image '{img_key}' already exists. Skipping...") + continue + elif mode == "overwrite": + logger.debug(f"Deleting existing H&E image '{img_key}'") + if img_key in sdata: + del sdata[img_key] + sdata.delete_element_from_disk(img_key) + + # Load and process image + logger.debug(f"Loading H&E image from {he_file}") + img = imread(str(he_file)) + if len(img.shape) == 4: + img = img[0] # [0] to remove extra dimension + elif len(img.shape) == 3: + img = img.transpose(1, 2, 0) # move first dimension to last + logger.debug(f"H&E image shape: {img.shape}") + logger.debug(f"H&E image dtype: {img.dtype}") + + # Create Image2DModel and write + logger.debug(f"Creating Image2DModel for {img_key}") + sdata[img_key] = Image2DModel.parse(img, **kwargs) + logger.debug(f"Writing Image2DModel for {img_key}") + sdata.write_element(img_key) + + +def _write_segmentation( + sdata: SpatialData, + nuclei_dir: Union[str, None], + pattern: str, + nuclei_key: str, + nuclei_exp_key: str, + mode: str = "append", + **kwargs, +): + """ + Write segmentation labels to SpatialData object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + nuclei_dir : Union[str, None] + Path to directory containing nuclei segmentation files. + If None, this step will be skipped. + pattern : str + Glob pattern for selecting nuclei segmentation files. + nuclei_key : str + Key for nuclei segmentation array in the NPZ file + nuclei_exp_key : str + Key for expanded nuclei segmentation array in the NPZ file + mode : str, optional + Mode for handling existing elements. 
Options: + - "append": Skip if elements exist (default) + - "overwrite": Replace if elements exist + kwargs : dict + Additional arguments passed to Labels2DModel.parse() + + Modifies + -------- + sdata : SpatialData + Adds the following elements: + - {nuclei_key}: Labels2DModel of nuclei segmentation + - {nuclei_exp_key}: Labels2DModel of expanded nuclei segmentation + - {nuclei_key}_shapes: Polygon shapes derived from nuclei segmentation + - {nuclei_exp_key}_shapes: Polygon shapes derived from expanded segmentation + """ + if nuclei_dir is None: + logger.debug("Segmentation skipped...") + return + + # Get list of nuclei files + nuclei_dir = Path(nuclei_dir) + nuclei_file = nuclei_dir / pattern + if not nuclei_file.exists(): + logger.warning(f"No segmentation files matching {pattern} in {nuclei_dir}") + return + + # Process each nuclei file + shapes_seg_key = f"{nuclei_key}_shapes" + shapes_exp_key = f"{nuclei_exp_key}_shapes" + + # Check if elements exist + elements = [nuclei_key, nuclei_exp_key, shapes_seg_key, shapes_exp_key] + elements_paths = [ + f"labels/{nuclei_key}", + f"labels/{nuclei_exp_key}", + f"shapes/{shapes_seg_key}", + f"shapes/{shapes_exp_key}", + ] + + if mode == "append" and any( + p in sdata.elements_paths_on_disk() for p in elements_paths + ): + logger.debug("Segmentation already exist. Skipping...") + return + elif mode == "overwrite": + logger.debug("Deleting existing segmentation elements") + for el in elements: + if el in sdata: + del sdata[el] + if ( + f"labels/{el}" in sdata.elements_paths_on_disk() + or f"shapes/{el}" in sdata.elements_paths_on_disk() + ): + sdata.delete_element_from_disk(el) + + # Load and process segmentation data + logger.debug(f"Loading segmentation data from {nuclei_file}") + nuclei_dict = np.load(nuclei_file) + nuclei_raw = nuclei_dict[nuclei_key] + nuclei_exp = nuclei_dict[nuclei_exp_key] + logger.debug(f"Nuclei masks shape: {nuclei_raw.shape}") + logger.debug(f"Cell masks shape: {nuclei_exp.shape}") + + # Create progress bar for nuclei processing steps + logger.debug("Converting to Labels2DModel") + sdata[nuclei_key] = Labels2DModel.parse(nuclei_raw, **kwargs) + sdata[nuclei_exp_key] = Labels2DModel.parse(nuclei_exp, **kwargs) + logger.debug("Converting to polygons") + sdata[shapes_seg_key] = to_polygons(sdata[nuclei_key]).reset_index(drop=True) + sdata[shapes_exp_key] = to_polygons(sdata[nuclei_exp_key]).reset_index(drop=True) + logger.debug("Writing elements") + for element in elements: + sdata.write_element(element) + + +def _write_protein_images( + sdata: SpatialData, + protein_dir: Union[str, None], + pattern: str, + mode: str = "append", + **kwargs, +): + """ + Write protein images to SpatialData object. Proteins are stored as channels in a single Image2DModel object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + protein_dir : Union[str, None] + Path to directory containing protein images. + If None, this step will be skipped. + pattern : str + Glob pattern for selecting protein images. + mode : str, optional + Mode for handling existing elements. 
Options: + - "append": Skip if element exists (default) + - "overwrite": Replace if element exists + kwargs : dict + Additional arguments passed to Image2DModel.parse() + """ + if protein_dir is None: + logger.debug("Protein skipped...") + return + + protein_dir = Path(protein_dir) + + # Get list of protein images for this sample + img_list = list(protein_dir.glob(pattern)) + img_list.sort() + + if not img_list: + logger.warning( + f"No protein images found matching pattern '{pattern}' in {protein_dir}" + ) + return + logger.debug(f"Found {len(img_list)} protein images") + + # Check if element exists + if "images/protein" in sdata.elements_paths_on_disk(): + if mode == "append": + logger.debug("Protein images already exist. Skipping...") + return + elif mode == "overwrite": + logger.debug("Deleting existing protein images") + if G4XKeys.PROTEIN_KEY in sdata: + del sdata[G4XKeys.PROTEIN_KEY] + sdata.delete_element_from_disk(G4XKeys.PROTEIN_KEY) + img_list.sort() + + # Get channel names from filenames + channel_names = [img_file.stem.split("_")[0] for img_file in img_list] + + # Load all images at once with dask imread + logger.debug("Loading protein images") + protein_stack = imread(str(protein_dir / pattern)) + logger.debug(f"Images shape: {protein_stack.shape}") + + # Create Image2DModel and write + logger.debug("Converting to Image2DModel") + sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse( + protein_stack, c_coords=channel_names, **kwargs + ) + + logger.debug("Writing protein images") + sdata.write_element(G4XKeys.PROTEIN_KEY) + + +def _write_transcripts( + sdata: SpatialData, + transcripts_dir: Union[str, None], + pattern: str, + coordinates: dict, + feature_key: str, + swap_xy: bool, + mode: str = "append", +): + """ + Write transcripts to SpatialData object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + transcripts_dir : Union[str, None] + Path to directory containing transcript tables. + pattern : str + Glob pattern for selecting transcript tables. + coordinates : dict + Dictionary mapping coordinate column names to standard x,y coordinates + feature_key : str + Column name containing transcript feature identifiers + swap_xy : bool + Whether to swap the x and y coordinates + mode : str, optional + Mode for handling existing element. Options: + - "append": Skip if element exists (default) + - "overwrite": Replace if element exists + + Modifies + -------- + sdata : SpatialData + Adds a "transcripts" PointsModel containing transcript locations and features + """ + if transcripts_dir is None: + logger.debug("Transcripts skipped...") + return + + if f"points/{G4XKeys.TRANSCRIPTS_KEY}" in sdata.elements_paths_on_disk(): + if mode == "append": + logger.debug("Transcripts already exist. 
Skipping...") + return + elif mode == "overwrite": + logger.debug("Deleting existing transcripts") + if G4XKeys.TRANSCRIPTS_KEY in sdata: + del sdata[G4XKeys.TRANSCRIPTS_KEY] + sdata.delete_element_from_disk(G4XKeys.TRANSCRIPTS_KEY) + + transcript_dir = Path(transcripts_dir) + with tqdm(total=3, desc="Processing transcripts", leave=False) as pbar: + pbar.set_description("Loading transcripts") + + if pattern.endswith(".csv") or pattern.endswith(".csv.gz"): + # list files found in transcript_dir + transcript_files = list(transcript_dir.glob(pattern)) + transcript_files.sort() + logger.debug(f"Found {len(transcript_files)} transcript files") + transcripts = dd.read_csv(transcript_files).compute().reset_index(drop=True) + else: + raise ValueError(f"Unsupported file type: {transcript_dir / pattern}") + pbar.update(1) + + if swap_xy: + transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ + [coordinates["y"], coordinates["x"]] + ] + + pbar.set_description("Converting to PointsModel") + sdata[G4XKeys.TRANSCRIPTS_KEY] = PointsModel.parse( + transcripts, + coordinates=coordinates, + feature_key=feature_key, + ) + pbar.update(1) + + pbar.set_description("Writing to disk") + sdata.write_element(G4XKeys.TRANSCRIPTS_KEY) + pbar.update(1) + + +def _write_table( + sdata: SpatialData, + table_path: Union[str, None], + mode: str = "append", +): + """ + Write tables to SpatialData object. + """ + if table_path is None: + logger.debug("Table skipped...") + return + + adata = read_h5ad(table_path) + sdata[G4XKeys.TABLE_KEY] = TableModel.parse(adata) + + logger.debug("Writing table to disk") + sdata.write_element(G4XKeys.TABLE_KEY) + + +def _deep_update(base_dict, update_dict): + """ + Recursively update a dictionary with another dictionary. + """ + for key, value in update_dict.items(): + if ( + isinstance(value, dict) + and key in base_dict + and isinstance(base_dict[key], dict) + ): + _deep_update(base_dict[key], value) + else: + base_dict[key] = value From 8099b686ed451d44dcaff10d0c23b6b24f9aaf4f Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Tue, 18 Feb 2025 21:37:18 +0000 Subject: [PATCH 02/13] Refactor g4x function to support single sample and run directory processing --- src/spatialdata_io/readers/g4x.py | 99 ++++++++++++++++++------------- 1 file changed, 58 insertions(+), 41 deletions(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 78c7f007..82c2946e 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -21,11 +21,11 @@ from spatialdata_io._constants._constants import G4XKeys from spatialdata_io._docs import inject_docs -__all__ = ["g4x_sample", "g4x_run"] +__all__ = ["g4x"] @inject_docs(xx=G4XKeys) -def g4x_run( +def g4x( input_path: Union[str, Path], output_path: Union[str, Path, None] = None, include_he: bool = True, @@ -36,14 +36,13 @@ def g4x_run( mode: str = "append", ): """ - Create SpatialData objects for each sample in a run directory. - - See :func:`g4x_sample` for more details. + Create SpatialData objects for each sample in a run directory or a single sample directory. Parameters ---------- input_path : Union[str, Path] - Path to input directory containing run data. Assumes each subdirectory contains a sample. e.g. `input_path/A01`, `input_path/B01`, etc. + Path to input directory containing run data or a single sample directory. + If a run directory, assumes each subdirectory contains a sample. e.g. `input_path/A01`, `input_path/B01`, etc. 
output_path : Union[str, Path] Path to directory where SpatialData zarr stores will be written. If None, zarr stores will be written to each sample directory found in `input_path`. include_he : bool @@ -62,52 +61,70 @@ def g4x_run( - "overwrite": Replace existing elements Returns ------- - sdatas : list[SpatialData] - List of SpatialData objects + sdatas : Union[SpatialData, list[SpatialData]] + A single SpatialData object if processing a single sample directory, otherwise a list of SpatialData objects. """ if isinstance(input_path, str): input_path = Path(input_path) if isinstance(output_path, str): output_path = Path(output_path) - # Make sure paths match expected format e.g. A01, B01 - sample_input_paths = [] - for p in input_path.iterdir(): - if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name): - sample_input_paths.append(p) - logger.debug(f"Found {len(sample_input_paths)} samples.") - - if output_path is None: - sample_output_paths = [ - input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths + # Determine if input_path is a run directory or a single sample directory + if any( + p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir() + ): + # Run directory with multiple samples + sample_input_paths = [ + p + for p in input_path.iterdir() + if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) ] + logger.debug(f"Found {len(sample_input_paths)} samples.") + + if output_path is None: + sample_output_paths = [ + input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths + ] + else: + sample_output_paths = [ + output_path / f"{p.name}.zarr" for p in sample_input_paths + ] + + sdatas = [] + for sample_input_path, sample_output_path in tqdm( + zip(sample_input_paths, sample_output_paths), + total=len(sample_input_paths), + desc="Processing samples", + ): + sdata = g4x_sample( + input_path=sample_input_path, + output_zarr_path=sample_output_path, + include_he=include_he, + include_segmentation=include_segmentation, + include_protein=include_protein, + include_transcripts=include_transcripts, + include_tables=include_tables, + mode=mode, + ) + sdatas.append(sdata) + return sdatas else: - sample_output_paths = [ - output_path / f"{p.name}.zarr" for p in sample_input_paths - ] + # Single sample directory + logger.debug("Processing single sample directory.") + if output_path is None: + output_path = input_path / f"{input_path.name}.zarr" - kwargs = { - "include_he": include_he, - "include_segmentation": include_segmentation, - "include_protein": include_protein, - "include_transcripts": include_transcripts, - "include_tables": include_tables, - "mode": mode, - } - - sdatas = [] - for sample_input_path, sample_output_path in tqdm( - zip(sample_input_paths, sample_output_paths), - total=len(sample_input_paths), - desc="Processing samples", - ): sdata = g4x_sample( - input_path=sample_input_path, - output_zarr_path=sample_output_path, - **kwargs, + input_path=input_path, + output_zarr_path=output_path, + include_he=include_he, + include_segmentation=include_segmentation, + include_protein=include_protein, + include_transcripts=include_transcripts, + include_tables=include_tables, + mode=mode, ) - sdatas.append(sdata) - return sdatas + return sdata def g4x_sample( From 03fb7af5fc570aacba43a40cdb55964972062f1d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Feb 2025 01:32:30 +0000 Subject: [PATCH 03/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --- src/spatialdata_io/__init__.py | 2 +- src/spatialdata_io/_constants/_constants.py | 22 ++----- src/spatialdata_io/readers/g4x.py | 69 +++++++-------------- 3 files changed, 26 insertions(+), 67 deletions(-) diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index f05ab5b4..14af87cb 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -5,8 +5,8 @@ from spatialdata_io.readers.cosmx import cosmx from spatialdata_io.readers.curio import curio from spatialdata_io.readers.dbit import dbit -from spatialdata_io.readers.generic import generic, geojson, image from spatialdata_io.readers.g4x import g4x +from spatialdata_io.readers.generic import generic, geojson, image from spatialdata_io.readers.macsima import macsima from spatialdata_io.readers.mcmicro import mcmicro from spatialdata_io.readers.merscope import merscope diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 5edb44de..cbbddba7 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -406,39 +406,25 @@ class G4XKeys(str, ModeEnum): # H&E HE_DIR = "h_and_e" HE_PATTERN = "*.jp2" - HE_IMG2DMODEL_KWARGS = { - "dims": ["y", "x", "c"], - "scale_factors": [2, 2, 2], - "chunks": "auto" - } + HE_IMG2DMODEL_KWARGS = {"dims": ["y", "x", "c"], "scale_factors": [2, 2, 2], "chunks": "auto"} # Nuclei NUCLEI_BOUNDARIES_KEY = "nuclei" CELL_BOUNDARIES_KEY = "nuclei_exp" SEGMENTATION_DIR = "segmentation" SEGMENTATION_PATTERN = "segmentation_mask.npz" - SEG_IMG2DMODEL_KWARGS = { - "dims": ["y", "x"], - "chunks": "auto" - } + SEG_IMG2DMODEL_KWARGS = {"dims": ["y", "x"], "chunks": "auto"} # Protein PROTEIN_KEY = "protein" PROTEIN_DIR = "protein" PROTEIN_PATTERN = "*.jp2" - PROTEIN_IMG2DMODEL_KWARGS = { - "dims": ["c", "y", "x"], - "scale_factors": [2, 2, 2], - "chunks": "auto" - } + PROTEIN_IMG2DMODEL_KWARGS = {"dims": ["c", "y", "x"], "scale_factors": [2, 2, 2], "chunks": "auto"} # Transcripts TRANSCRIPTS_DIR = "rna" TRANSCRIPTS_PATTERN = "*transcript_table.csv.gz" - TRANSCRIPTS_COORDS = { - "x": "x_pixel_coordinate", - "y": "y_pixel_coordinate" - } + TRANSCRIPTS_COORDS = {"x": "x_pixel_coordinate", "y": "y_pixel_coordinate"} TRANSCRIPTS_FEATURE_KEY = "gene_name" TRANSCRIPTS_SWAP_XY = True diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 82c2946e..58569ea9 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Union +import dask.dataframe as dd import numpy as np from anndata.io import read_h5ad from dask_image.imread import imread @@ -15,7 +16,6 @@ PointsModel, TableModel, ) -import dask.dataframe as dd from tqdm.auto import tqdm from spatialdata_io._constants._constants import G4XKeys @@ -26,8 +26,8 @@ @inject_docs(xx=G4XKeys) def g4x( - input_path: Union[str, Path], - output_path: Union[str, Path, None] = None, + input_path: str | Path, + output_path: str | Path | None = None, include_he: bool = True, include_segmentation: bool = True, include_protein: bool = True, @@ -70,25 +70,15 @@ def g4x( output_path = Path(output_path) # Determine if input_path is a run directory or a single sample directory - if any( - p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir() - ): + if any(p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir()): # Run directory with multiple samples - 
sample_input_paths = [ - p - for p in input_path.iterdir() - if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) - ] + sample_input_paths = [p for p in input_path.iterdir() if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name)] logger.debug(f"Found {len(sample_input_paths)} samples.") if output_path is None: - sample_output_paths = [ - input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths - ] + sample_output_paths = [input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths] else: - sample_output_paths = [ - output_path / f"{p.name}.zarr" for p in sample_input_paths - ] + sample_output_paths = [output_path / f"{p.name}.zarr" for p in sample_input_paths] sdatas = [] for sample_input_path, sample_output_path in tqdm( @@ -128,8 +118,8 @@ def g4x( def g4x_sample( - input_path: Union[str, Path], - output_zarr_path: Union[str, Path], + input_path: str | Path, + output_zarr_path: str | Path, include_he: bool = True, include_segmentation: bool = True, include_protein: bool = True, @@ -178,9 +168,7 @@ def g4x_sample( output_zarr_path = Path(output_zarr_path) if output_zarr_path.suffix != ".zarr": logger.error(f"Output path must end with '.zarr'. Got {output_zarr_path}") - raise ValueError( - f"Output path must end with '.zarr'. Got {output_zarr_path}" - ) + raise ValueError(f"Output path must end with '.zarr'. Got {output_zarr_path}") if mode not in ["append", "overwrite"]: msg = f"Invalid mode '{mode}'. Must be one of: 'append', 'overwrite'" @@ -269,7 +257,7 @@ def g4x_sample( def _write_he( sdata: SpatialData, - he_dir: Union[str, None], + he_dir: str | None, pattern: str, mode: str = "append", **kwargs, @@ -350,7 +338,7 @@ def _write_he( def _write_segmentation( sdata: SpatialData, - nuclei_dir: Union[str, None], + nuclei_dir: str | None, pattern: str, nuclei_key: str, nuclei_exp_key: str, @@ -413,9 +401,7 @@ def _write_segmentation( f"shapes/{shapes_exp_key}", ] - if mode == "append" and any( - p in sdata.elements_paths_on_disk() for p in elements_paths - ): + if mode == "append" and any(p in sdata.elements_paths_on_disk() for p in elements_paths): logger.debug("Segmentation already exist. 
Skipping...") return elif mode == "overwrite": @@ -423,10 +409,7 @@ def _write_segmentation( for el in elements: if el in sdata: del sdata[el] - if ( - f"labels/{el}" in sdata.elements_paths_on_disk() - or f"shapes/{el}" in sdata.elements_paths_on_disk() - ): + if f"labels/{el}" in sdata.elements_paths_on_disk() or f"shapes/{el}" in sdata.elements_paths_on_disk(): sdata.delete_element_from_disk(el) # Load and process segmentation data @@ -451,7 +434,7 @@ def _write_segmentation( def _write_protein_images( sdata: SpatialData, - protein_dir: Union[str, None], + protein_dir: str | None, pattern: str, mode: str = "append", **kwargs, @@ -486,9 +469,7 @@ def _write_protein_images( img_list.sort() if not img_list: - logger.warning( - f"No protein images found matching pattern '{pattern}' in {protein_dir}" - ) + logger.warning(f"No protein images found matching pattern '{pattern}' in {protein_dir}") return logger.debug(f"Found {len(img_list)} protein images") @@ -514,9 +495,7 @@ def _write_protein_images( # Create Image2DModel and write logger.debug("Converting to Image2DModel") - sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse( - protein_stack, c_coords=channel_names, **kwargs - ) + sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse(protein_stack, c_coords=channel_names, **kwargs) logger.debug("Writing protein images") sdata.write_element(G4XKeys.PROTEIN_KEY) @@ -524,7 +503,7 @@ def _write_protein_images( def _write_transcripts( sdata: SpatialData, - transcripts_dir: Union[str, None], + transcripts_dir: str | None, pattern: str, coordinates: dict, feature_key: str, @@ -587,9 +566,7 @@ def _write_transcripts( pbar.update(1) if swap_xy: - transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ - [coordinates["y"], coordinates["x"]] - ] + transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[[coordinates["y"], coordinates["x"]]] pbar.set_description("Converting to PointsModel") sdata[G4XKeys.TRANSCRIPTS_KEY] = PointsModel.parse( @@ -606,7 +583,7 @@ def _write_transcripts( def _write_table( sdata: SpatialData, - table_path: Union[str, None], + table_path: str | None, mode: str = "append", ): """ @@ -628,11 +605,7 @@ def _deep_update(base_dict, update_dict): Recursively update a dictionary with another dictionary. 
""" for key, value in update_dict.items(): - if ( - isinstance(value, dict) - and key in base_dict - and isinstance(base_dict[key], dict) - ): + if isinstance(value, dict) and key in base_dict and isinstance(base_dict[key], dict): _deep_update(base_dict[key], value) else: base_dict[key] = value From 667f3611aa889d41737b5c2bf77fe35ac29152da Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Mon, 17 Mar 2025 17:51:39 -0700 Subject: [PATCH 04/13] simplify constants to strings --- src/spatialdata_io/_constants/_constants.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index cbbddba7..acf12625 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -401,30 +401,29 @@ class VisiumHDKeys(ModeEnum): FILE_FORMAT = "file_format" -class G4XKeys(str, ModeEnum): +class G4XKeys(ModeEnum): # H&E HE_DIR = "h_and_e" HE_PATTERN = "*.jp2" - HE_IMG2DMODEL_KWARGS = {"dims": ["y", "x", "c"], "scale_factors": [2, 2, 2], "chunks": "auto"} # Nuclei NUCLEI_BOUNDARIES_KEY = "nuclei" CELL_BOUNDARIES_KEY = "nuclei_exp" SEGMENTATION_DIR = "segmentation" SEGMENTATION_PATTERN = "segmentation_mask.npz" - SEG_IMG2DMODEL_KWARGS = {"dims": ["y", "x"], "chunks": "auto"} # Protein PROTEIN_KEY = "protein" PROTEIN_DIR = "protein" PROTEIN_PATTERN = "*.jp2" - PROTEIN_IMG2DMODEL_KWARGS = {"dims": ["c", "y", "x"], "scale_factors": [2, 2, 2], "chunks": "auto"} # Transcripts + TRANSCRIPTS_KEY = "transcripts" TRANSCRIPTS_DIR = "rna" TRANSCRIPTS_PATTERN = "*transcript_table.csv.gz" - TRANSCRIPTS_COORDS = {"x": "x_pixel_coordinate", "y": "y_pixel_coordinate"} + TRANSCRIPTS_COORD_X = "x_pixel_coordinate" + TRANSCRIPTS_COORD_Y = "y_pixel_coordinate" TRANSCRIPTS_FEATURE_KEY = "gene_name" TRANSCRIPTS_SWAP_XY = True From d75db969dd334ee387dedf0bfb54b3fa9d5b717b Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Mon, 17 Mar 2025 17:54:52 -0700 Subject: [PATCH 05/13] fix input paths, link table annotations to cells, simplify image model kwargs, use dask array imread --- src/spatialdata_io/readers/g4x.py | 159 +++++++++++++++++++++++------- 1 file changed, 122 insertions(+), 37 deletions(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 58569ea9..17ee6177 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -7,7 +7,7 @@ import dask.dataframe as dd import numpy as np from anndata.io import read_h5ad -from dask_image.imread import imread +from dask.array.image import imread from spatialdata import SpatialData, to_polygons from spatialdata._logging import logger from spatialdata.models import ( @@ -70,15 +70,25 @@ def g4x( output_path = Path(output_path) # Determine if input_path is a run directory or a single sample directory - if any(p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir()): + if any( + p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir() + ): # Run directory with multiple samples - sample_input_paths = [p for p in input_path.iterdir() if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name)] + sample_input_paths = [ + p + for p in input_path.iterdir() + if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) + ] logger.debug(f"Found {len(sample_input_paths)} samples.") if output_path is None: - sample_output_paths = [input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths] + sample_output_paths = [ + input_path / p.name / 
f"{p.name}.zarr" for p in sample_input_paths + ] else: - sample_output_paths = [output_path / f"{p.name}.zarr" for p in sample_input_paths] + sample_output_paths = [ + output_path / f"{p.name}.zarr" for p in sample_input_paths + ] sdatas = [] for sample_input_path, sample_output_path in tqdm( @@ -168,7 +178,9 @@ def g4x_sample( output_zarr_path = Path(output_zarr_path) if output_zarr_path.suffix != ".zarr": logger.error(f"Output path must end with '.zarr'. Got {output_zarr_path}") - raise ValueError(f"Output path must end with '.zarr'. Got {output_zarr_path}") + raise ValueError( + f"Output path must end with '.zarr'. Got {output_zarr_path}" + ) if mode not in ["append", "overwrite"]: msg = f"Invalid mode '{mode}'. Must be one of: 'append', 'overwrite'" @@ -195,10 +207,9 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_he( sdata, - he_dir=G4XKeys.HE_DIR, + he_dir=input_path / G4XKeys.HE_DIR, pattern=G4XKeys.HE_PATTERN, mode=mode, - **G4XKeys.HE_IMG2DMODEL_KWARGS, ) pbar.update(1) @@ -206,12 +217,11 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_segmentation( sdata, - nuclei_dir=G4XKeys.SEGMENTATION_DIR, + nuclei_dir=input_path / G4XKeys.SEGMENTATION_DIR, pattern=G4XKeys.SEGMENTATION_PATTERN, nuclei_key=G4XKeys.NUCLEI_BOUNDARIES_KEY, nuclei_exp_key=G4XKeys.CELL_BOUNDARIES_KEY, mode=mode, - **G4XKeys.SEG_IMG2DMODEL_KWARGS, ) pbar.update(1) @@ -219,10 +229,9 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_protein_images( sdata, - protein_dir=G4XKeys.PROTEIN_DIR, + protein_dir=input_path / G4XKeys.PROTEIN_DIR, pattern=G4XKeys.PROTEIN_PATTERN, mode=mode, - **G4XKeys.PROTEIN_IMG2DMODEL_KWARGS, ) pbar.update(1) @@ -230,9 +239,12 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_transcripts( sdata, - transcripts_dir=G4XKeys.TRANSCRIPTS_DIR, + transcripts_dir=input_path / G4XKeys.TRANSCRIPTS_DIR, pattern=G4XKeys.TRANSCRIPTS_PATTERN, - coordinates=G4XKeys.TRANSCRIPTS_COORDS, + coordinates={ + "x": G4XKeys.TRANSCRIPTS_COORD_X, + "y": G4XKeys.TRANSCRIPTS_COORD_Y, + }, feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY, swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY, mode=mode, @@ -243,7 +255,7 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_table( sdata, - table_path=G4XKeys.TABLE_PATTERN, + table_path=input_path / G4XKeys.TABLES_DIR / G4XKeys.TABLE_PATTERN, mode=mode, ) pbar.update(1) @@ -278,7 +290,7 @@ def _write_he( - "append": Skip if element exists (default) - "overwrite": Replace if element exists kwargs : dict - Additional arguments passed to Image2DModel.parse() + Keyword arguments for Image2DModel Modifies ------- @@ -321,16 +333,20 @@ def _write_he( # Load and process image logger.debug(f"Loading H&E image from {he_file}") - img = imread(str(he_file)) - if len(img.shape) == 4: - img = img[0] # [0] to remove extra dimension - elif len(img.shape) == 3: - img = img.transpose(1, 2, 0) # move first dimension to last + img = imread(str(he_file)).compute().squeeze() logger.debug(f"H&E image shape: {img.shape}") logger.debug(f"H&E image dtype: {img.dtype}") - + if len(img.shape) == 2: + img = img[np.newaxis, :, :] + elif len(img.shape) == 3: + img = img.transpose(2, 0, 1) # Create Image2DModel and write logger.debug(f"Creating Image2DModel for {img_key}") + kwargs["dims"] = ["c", "y", "x"] if "dims" not in kwargs else kwargs["dims"] + kwargs["scale_factors"] = ( + [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] + ) + kwargs["chunks"] = "auto" if "chunks" not in kwargs else kwargs["chunks"] sdata[img_key] = 
Image2DModel.parse(img, **kwargs) logger.debug(f"Writing Image2DModel for {img_key}") sdata.write_element(img_key) @@ -366,7 +382,7 @@ def _write_segmentation( - "append": Skip if elements exist (default) - "overwrite": Replace if elements exist kwargs : dict - Additional arguments passed to Labels2DModel.parse() + Keyword arguments for Labels2DModel Modifies -------- @@ -389,19 +405,21 @@ def _write_segmentation( return # Process each nuclei file - shapes_seg_key = f"{nuclei_key}_shapes" - shapes_exp_key = f"{nuclei_exp_key}_shapes" + shapes_nuclei_key = f"{nuclei_key}_shapes" + shapes_nuclei_exp_key = f"{nuclei_exp_key}_shapes" # Check if elements exist - elements = [nuclei_key, nuclei_exp_key, shapes_seg_key, shapes_exp_key] + elements = [nuclei_key, nuclei_exp_key, shapes_nuclei_key, shapes_nuclei_exp_key] elements_paths = [ f"labels/{nuclei_key}", f"labels/{nuclei_exp_key}", - f"shapes/{shapes_seg_key}", - f"shapes/{shapes_exp_key}", + f"shapes/{shapes_nuclei_key}", + f"shapes/{shapes_nuclei_exp_key}", ] - if mode == "append" and any(p in sdata.elements_paths_on_disk() for p in elements_paths): + if mode == "append" and any( + p in sdata.elements_paths_on_disk() for p in elements_paths + ): logger.debug("Segmentation already exist. Skipping...") return elif mode == "overwrite": @@ -409,7 +427,10 @@ def _write_segmentation( for el in elements: if el in sdata: del sdata[el] - if f"labels/{el}" in sdata.elements_paths_on_disk() or f"shapes/{el}" in sdata.elements_paths_on_disk(): + if ( + f"labels/{el}" in sdata.elements_paths_on_disk() + or f"shapes/{el}" in sdata.elements_paths_on_disk() + ): sdata.delete_element_from_disk(el) # Load and process segmentation data @@ -425,8 +446,14 @@ def _write_segmentation( sdata[nuclei_key] = Labels2DModel.parse(nuclei_raw, **kwargs) sdata[nuclei_exp_key] = Labels2DModel.parse(nuclei_exp, **kwargs) logger.debug("Converting to polygons") - sdata[shapes_seg_key] = to_polygons(sdata[nuclei_key]).reset_index(drop=True) - sdata[shapes_exp_key] = to_polygons(sdata[nuclei_exp_key]).reset_index(drop=True) + sdata[shapes_nuclei_key] = to_polygons(sdata[nuclei_key]).reset_index(drop=True) + sdata[shapes_nuclei_exp_key] = to_polygons(sdata[nuclei_exp_key]).reset_index( + drop=True + ) + # Set index for shapes + sdata[shapes_nuclei_exp_key] = sdata[shapes_nuclei_exp_key].set_index("label") + sdata[shapes_nuclei_exp_key].index = sdata[shapes_nuclei_exp_key].index.astype(str) + logger.debug("Writing elements") for element in elements: sdata.write_element(element) @@ -456,7 +483,7 @@ def _write_protein_images( - "append": Skip if element exists (default) - "overwrite": Replace if element exists kwargs : dict - Additional arguments passed to Image2DModel.parse() + Keyword arguments for Image2DModel """ if protein_dir is None: logger.debug("Protein skipped...") @@ -469,7 +496,9 @@ def _write_protein_images( img_list.sort() if not img_list: - logger.warning(f"No protein images found matching pattern '{pattern}' in {protein_dir}") + logger.warning( + f"No protein images found matching pattern '{pattern}' in {protein_dir}" + ) return logger.debug(f"Found {len(img_list)} protein images") @@ -493,9 +522,21 @@ def _write_protein_images( protein_stack = imread(str(protein_dir / pattern)) logger.debug(f"Images shape: {protein_stack.shape}") + kwargs["dims"] = ["c", "y", "x"] if "dims" not in kwargs else kwargs["dims"] + kwargs["scale_factors"] = ( + [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] + ) + kwargs["chunks"] = ( + [1, 
protein_stack.shape[-2], protein_stack.shape[-1]] + if "chunks" not in kwargs + else kwargs["chunks"] + ) + # Create Image2DModel and write logger.debug("Converting to Image2DModel") - sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse(protein_stack, c_coords=channel_names, **kwargs) + sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse( + protein_stack, c_coords=channel_names, **kwargs + ) logger.debug("Writing protein images") sdata.write_element(G4XKeys.PROTEIN_KEY) @@ -566,7 +607,9 @@ def _write_transcripts( pbar.update(1) if swap_xy: - transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[[coordinates["y"], coordinates["x"]]] + transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ + [coordinates["y"], coordinates["x"]] + ] pbar.set_description("Converting to PointsModel") sdata[G4XKeys.TRANSCRIPTS_KEY] = PointsModel.parse( @@ -588,13 +631,51 @@ def _write_table( ): """ Write tables to SpatialData object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + table_path : Union[str, None] + Path to the table file. + If None, this step will be skipped. + mode : str, optional + Mode for handling existing elements. Options: + - "append": Skip if element exists (default) + - "overwrite": Replace if element exists + + Modifies + -------- + sdata : SpatialData + Adds a table element to the SpatialData object """ if table_path is None: logger.debug("Table skipped...") return + if f"tables/{G4XKeys.TABLE_KEY}" in sdata.elements_paths_on_disk(): + if mode == "append": + logger.debug("Table already exists. Skipping...") + return + elif mode == "overwrite": + logger.debug("Deleting existing table") + if G4XKeys.TABLE_KEY in sdata: + del sdata[G4XKeys.TABLE_KEY] + sdata.delete_element_from_disk(G4XKeys.TABLE_KEY) + adata = read_h5ad(table_path) + + # Link table annotations to cell shapes + shape_key = f"{G4XKeys.CELL_BOUNDARIES_KEY}_shapes" + adata.obs["region"] = shape_key + adata.obs["label"] = adata.obs["cell_id"].str.split("-").str[1] sdata[G4XKeys.TABLE_KEY] = TableModel.parse(adata) + sdata.set_table_annotates_spatialelement( + G4XKeys.TABLE_KEY, + region=shape_key, + region_key="region", + instance_key="label", + ) logger.debug("Writing table to disk") sdata.write_element(G4XKeys.TABLE_KEY) @@ -605,7 +686,11 @@ def _deep_update(base_dict, update_dict): Recursively update a dictionary with another dictionary. 
""" for key, value in update_dict.items(): - if isinstance(value, dict) and key in base_dict and isinstance(base_dict[key], dict): + if ( + isinstance(value, dict) + and key in base_dict + and isinstance(base_dict[key], dict) + ): _deep_update(base_dict[key], value) else: base_dict[key] = value From be6eca33932c9025ae7fa7667a92f314958e83f6 Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Mon, 17 Mar 2025 18:14:33 -0700 Subject: [PATCH 06/13] swap axes for tx no longer needed --- src/spatialdata_io/_constants/_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index acf12625..021b9d57 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -425,7 +425,7 @@ class G4XKeys(ModeEnum): TRANSCRIPTS_COORD_X = "x_pixel_coordinate" TRANSCRIPTS_COORD_Y = "y_pixel_coordinate" TRANSCRIPTS_FEATURE_KEY = "gene_name" - TRANSCRIPTS_SWAP_XY = True + TRANSCRIPTS_SWAP_XY = False # Tables TABLES_DIR = "single_cell_data" From 3db3877c2b437a3b317cf03922b251a0fb77cb58 Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Wed, 19 Mar 2025 16:18:06 -0700 Subject: [PATCH 07/13] parse bool properly for swap_xy --- src/spatialdata_io/readers/g4x.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 17ee6177..84e189e1 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -246,7 +246,7 @@ def g4x_sample( "y": G4XKeys.TRANSCRIPTS_COORD_Y, }, feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY, - swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY, + swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY == "True", mode=mode, ) pbar.update(1) @@ -606,7 +606,9 @@ def _write_transcripts( raise ValueError(f"Unsupported file type: {transcript_dir / pattern}") pbar.update(1) + logger.debug(f"swap_xy: {swap_xy}, {type(swap_xy)}") if swap_xy: + logger.debug("Swapping x and y coordinates") transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ [coordinates["y"], coordinates["x"]] ] From 8d7a41a202e0023ccb1edba69afc034873f94034 Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Wed, 19 Mar 2025 16:20:34 -0700 Subject: [PATCH 08/13] introduce offset to generated shapes --- src/spatialdata_io/_constants/_constants.py | 1 + src/spatialdata_io/readers/g4x.py | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 021b9d57..c56d3069 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -412,6 +412,7 @@ class G4XKeys(ModeEnum): CELL_BOUNDARIES_KEY = "nuclei_exp" SEGMENTATION_DIR = "segmentation" SEGMENTATION_PATTERN = "segmentation_mask.npz" + OFFSET = "-0.5" # Protein PROTEIN_KEY = "protein" diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 84e189e1..dff96c34 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -445,13 +445,23 @@ def _write_segmentation( logger.debug("Converting to Labels2DModel") sdata[nuclei_key] = Labels2DModel.parse(nuclei_raw, **kwargs) sdata[nuclei_exp_key] = Labels2DModel.parse(nuclei_exp, **kwargs) + + # Convert labels to polygons using "label" as index and translating xy coordinates to (almost) match label pixel coordinates logger.debug("Converting to polygons") - sdata[shapes_nuclei_key] = 
to_polygons(sdata[nuclei_key]).reset_index(drop=True) - sdata[shapes_nuclei_exp_key] = to_polygons(sdata[nuclei_exp_key]).reset_index( - drop=True + offset = float(G4XKeys.OFFSET) + + # Nuclei shapes + sdata[shapes_nuclei_key] = to_polygons(sdata[nuclei_key]).set_index("label") + sdata[shapes_nuclei_key].geometry = sdata[shapes_nuclei_key].translate( + xoff=offset, yoff=offset + ) + sdata[shapes_nuclei_key].index = sdata[shapes_nuclei_key].index.astype(str) + + # Expanded nuclei shapes + sdata[shapes_nuclei_exp_key] = to_polygons(sdata[nuclei_exp_key]).set_index("label") + sdata[shapes_nuclei_exp_key].geometry = sdata[shapes_nuclei_exp_key].translate( + xoff=offset, yoff=offset ) - # Set index for shapes - sdata[shapes_nuclei_exp_key] = sdata[shapes_nuclei_exp_key].set_index("label") sdata[shapes_nuclei_exp_key].index = sdata[shapes_nuclei_exp_key].index.astype(str) logger.debug("Writing elements") From a7a1f639abd58f1b7ba161df34d38478504e9dcb Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Wed, 19 Mar 2025 16:25:49 -0700 Subject: [PATCH 09/13] read 16bit jp2 images with glymur --- pyproject.toml | 1 + src/spatialdata_io/readers/g4x.py | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 81f0ae28..8c92ee6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "readfcs", "tifffile>=2023.8.12", "ome-types", + "glymur", ] [project.optional-dependencies] diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index dff96c34..3d636606 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -6,6 +6,7 @@ import dask.dataframe as dd import numpy as np +import glymur from anndata.io import read_h5ad from dask.array.image import imread from spatialdata import SpatialData, to_polygons @@ -333,7 +334,7 @@ def _write_he( # Load and process image logger.debug(f"Loading H&E image from {he_file}") - img = imread(str(he_file)).compute().squeeze() + img = imread(str(he_file), imread=_glymur_imread).compute().squeeze() logger.debug(f"H&E image shape: {img.shape}") logger.debug(f"H&E image dtype: {img.dtype}") if len(img.shape) == 2: @@ -706,3 +707,10 @@ def _deep_update(base_dict, update_dict): _deep_update(base_dict[key], value) else: base_dict[key] = value + + +def _glymur_imread(img_path: str): + """ + Read a JP2 file using glymur. This is for compatibility with 16-bit JP2 files. + """ + return glymur.Jp2k(img_path)[:] From d5d33d6e416ffe2d3598cafc1aa25450d3f0675b Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Wed, 19 Mar 2025 17:21:45 -0700 Subject: [PATCH 10/13] Add notes on differences in generated SpatialData object in g4x function --- src/spatialdata_io/readers/g4x.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 3d636606..ea5c147e 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -64,6 +64,14 @@ def g4x( ------- sdatas : Union[SpatialData, list[SpatialData]] A single SpatialData object if processing a single sample directory, otherwise a list of SpatialData objects. 
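+
+    A hypothetical call for each case (paths are made up; the return type depends on the directory passed):
+
+    >>> sdata = g4x("/data/run/A01")  # single sample directory -> SpatialData
+    >>> sdatas = g4x("/data/run")  # run directory -> list of SpatialData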
+ + Notes + ----- + There will be several minor differences between the original data and the generated SpatialData object: + + - cell and nuclear polygons are smoothed when vectorized from label data + - cell and nuclear polygons are offset by `-0.5` pixels relative to the rasterized label data + - transcript table counts correspond exactly to cell labels but not necessarily to cell polygons """ if isinstance(input_path, str): input_path = Path(input_path) From 3dc01c987df79d6de84fab0d8c2bf1bb9a185f97 Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Mon, 24 Mar 2025 15:32:11 -0700 Subject: [PATCH 11/13] use .v attributes for accessing keys for python3.10 compat --- src/spatialdata_io/_constants/_constants.py | 1 + src/spatialdata_io/readers/g4x.py | 70 ++++++++++----------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index c56d3069..363e6300 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -402,6 +402,7 @@ class VisiumHDKeys(ModeEnum): class G4XKeys(ModeEnum): + """Keys for G4X formatted dataset.""" # H&E HE_DIR = "h_and_e" diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index ea5c147e..14888674 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -216,8 +216,8 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_he( sdata, - he_dir=input_path / G4XKeys.HE_DIR, - pattern=G4XKeys.HE_PATTERN, + he_dir=input_path / G4XKeys.HE_DIR.v, + pattern=G4XKeys.HE_PATTERN.v, mode=mode, ) pbar.update(1) @@ -226,10 +226,10 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_segmentation( sdata, - nuclei_dir=input_path / G4XKeys.SEGMENTATION_DIR, - pattern=G4XKeys.SEGMENTATION_PATTERN, - nuclei_key=G4XKeys.NUCLEI_BOUNDARIES_KEY, - nuclei_exp_key=G4XKeys.CELL_BOUNDARIES_KEY, + nuclei_dir=input_path / G4XKeys.SEGMENTATION_DIR.v, + pattern=G4XKeys.SEGMENTATION_PATTERN.v, + nuclei_key=G4XKeys.NUCLEI_BOUNDARIES_KEY.v, + nuclei_exp_key=G4XKeys.CELL_BOUNDARIES_KEY.v, mode=mode, ) pbar.update(1) @@ -238,8 +238,8 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_protein_images( sdata, - protein_dir=input_path / G4XKeys.PROTEIN_DIR, - pattern=G4XKeys.PROTEIN_PATTERN, + protein_dir=input_path / G4XKeys.PROTEIN_DIR.v, + pattern=G4XKeys.PROTEIN_PATTERN.v, mode=mode, ) pbar.update(1) @@ -248,14 +248,14 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_transcripts( sdata, - transcripts_dir=input_path / G4XKeys.TRANSCRIPTS_DIR, - pattern=G4XKeys.TRANSCRIPTS_PATTERN, + transcripts_dir=input_path / G4XKeys.TRANSCRIPTS_DIR.v, + pattern=G4XKeys.TRANSCRIPTS_PATTERN.v, coordinates={ - "x": G4XKeys.TRANSCRIPTS_COORD_X, - "y": G4XKeys.TRANSCRIPTS_COORD_Y, + "x": G4XKeys.TRANSCRIPTS_COORD_X.v, + "y": G4XKeys.TRANSCRIPTS_COORD_Y.v, }, - feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY, - swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY == "True", + feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY.v, + swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY.v == "True", mode=mode, ) pbar.update(1) @@ -264,7 +264,7 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_table( sdata, - table_path=input_path / G4XKeys.TABLES_DIR / G4XKeys.TABLE_PATTERN, + table_path=input_path / G4XKeys.TABLES_DIR.v / G4XKeys.TABLE_PATTERN.v, mode=mode, ) pbar.update(1) @@ -457,7 +457,7 @@ def _write_segmentation( # Convert labels to polygons using "label" as index and translating xy coordinates to 
(almost) match label pixel coordinates logger.debug("Converting to polygons") - offset = float(G4XKeys.OFFSET) + offset = float(G4XKeys.OFFSET.v) # Nuclei shapes sdata[shapes_nuclei_key] = to_polygons(sdata[nuclei_key]).set_index("label") @@ -528,9 +528,9 @@ def _write_protein_images( return elif mode == "overwrite": logger.debug("Deleting existing protein images") - if G4XKeys.PROTEIN_KEY in sdata: - del sdata[G4XKeys.PROTEIN_KEY] - sdata.delete_element_from_disk(G4XKeys.PROTEIN_KEY) + if G4XKeys.PROTEIN_KEY.v in sdata: + del sdata[G4XKeys.PROTEIN_KEY.v] + sdata.delete_element_from_disk(G4XKeys.PROTEIN_KEY.v) img_list.sort() # Get channel names from filenames @@ -553,12 +553,12 @@ def _write_protein_images( # Create Image2DModel and write logger.debug("Converting to Image2DModel") - sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse( + sdata[G4XKeys.PROTEIN_KEY.v] = Image2DModel.parse( protein_stack, c_coords=channel_names, **kwargs ) logger.debug("Writing protein images") - sdata.write_element(G4XKeys.PROTEIN_KEY) + sdata.write_element(G4XKeys.PROTEIN_KEY.v) def _write_transcripts( @@ -601,15 +601,15 @@ def _write_transcripts( logger.debug("Transcripts skipped...") return - if f"points/{G4XKeys.TRANSCRIPTS_KEY}" in sdata.elements_paths_on_disk(): + if f"points/{G4XKeys.TRANSCRIPTS_KEY.v}" in sdata.elements_paths_on_disk(): if mode == "append": logger.debug("Transcripts already exist. Skipping...") return elif mode == "overwrite": logger.debug("Deleting existing transcripts") - if G4XKeys.TRANSCRIPTS_KEY in sdata: - del sdata[G4XKeys.TRANSCRIPTS_KEY] - sdata.delete_element_from_disk(G4XKeys.TRANSCRIPTS_KEY) + if G4XKeys.TRANSCRIPTS_KEY.v in sdata: + del sdata[G4XKeys.TRANSCRIPTS_KEY.v] + sdata.delete_element_from_disk(G4XKeys.TRANSCRIPTS_KEY.v) transcript_dir = Path(transcripts_dir) with tqdm(total=3, desc="Processing transcripts", leave=False) as pbar: @@ -633,7 +633,7 @@ def _write_transcripts( ] pbar.set_description("Converting to PointsModel") - sdata[G4XKeys.TRANSCRIPTS_KEY] = PointsModel.parse( + sdata[G4XKeys.TRANSCRIPTS_KEY.v] = PointsModel.parse( transcripts, coordinates=coordinates, feature_key=feature_key, @@ -641,7 +641,7 @@ def _write_transcripts( pbar.update(1) pbar.set_description("Writing to disk") - sdata.write_element(G4XKeys.TRANSCRIPTS_KEY) + sdata.write_element(G4XKeys.TRANSCRIPTS_KEY.v) pbar.update(1) @@ -674,32 +674,32 @@ def _write_table( logger.debug("Table skipped...") return - if f"tables/{G4XKeys.TABLE_KEY}" in sdata.elements_paths_on_disk(): + if f"tables/{G4XKeys.TABLE_KEY.v}" in sdata.elements_paths_on_disk(): if mode == "append": logger.debug("Table already exists. 
Skipping...") return elif mode == "overwrite": logger.debug("Deleting existing table") - if G4XKeys.TABLE_KEY in sdata: - del sdata[G4XKeys.TABLE_KEY] - sdata.delete_element_from_disk(G4XKeys.TABLE_KEY) + if G4XKeys.TABLE_KEY.v in sdata: + del sdata[G4XKeys.TABLE_KEY.v] + sdata.delete_element_from_disk(G4XKeys.TABLE_KEY.v) adata = read_h5ad(table_path) # Link table annotations to cell shapes - shape_key = f"{G4XKeys.CELL_BOUNDARIES_KEY}_shapes" + shape_key = f"{G4XKeys.CELL_BOUNDARIES_KEY.v}_shapes" adata.obs["region"] = shape_key adata.obs["label"] = adata.obs["cell_id"].str.split("-").str[1] - sdata[G4XKeys.TABLE_KEY] = TableModel.parse(adata) + sdata[G4XKeys.TABLE_KEY.v] = TableModel.parse(adata) sdata.set_table_annotates_spatialelement( - G4XKeys.TABLE_KEY, + G4XKeys.TABLE_KEY.v, region=shape_key, region_key="region", instance_key="label", ) logger.debug("Writing table to disk") - sdata.write_element(G4XKeys.TABLE_KEY) + sdata.write_element(G4XKeys.TABLE_KEY.v) def _deep_update(base_dict, update_dict): From b9b186500bbc201e85cd14a238fa501fa7d38df1 Mon Sep 17 00:00:00 2001 From: xyi Date: Thu, 24 Apr 2025 12:36:03 -0700 Subject: [PATCH 12/13] raise max images size and manually set chunk size --- src/spatialdata_io/readers/g4x.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 14888674..9d9b6d94 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -7,6 +7,7 @@ import dask.dataframe as dd import numpy as np import glymur +import PIL from anndata.io import read_h5ad from dask.array.image import imread from spatialdata import SpatialData, to_polygons @@ -22,9 +23,12 @@ from spatialdata_io._constants._constants import G4XKeys from spatialdata_io._docs import inject_docs +PIL.Image.MAX_IMAGE_PIXELS = 500000000 + __all__ = ["g4x"] + @inject_docs(xx=G4XKeys) def g4x( input_path: str | Path, @@ -355,7 +359,7 @@ def _write_he( kwargs["scale_factors"] = ( [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] ) - kwargs["chunks"] = "auto" if "chunks" not in kwargs else kwargs["chunks"] + kwargs["chunks"] = [1, 1024, 1024] if "chunks" not in kwargs else kwargs["chunks"] sdata[img_key] = Image2DModel.parse(img, **kwargs) logger.debug(f"Writing Image2DModel for {img_key}") sdata.write_element(img_key) From f670424f08d73a2a48d00b0ab4f993ff236732a3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Apr 2025 19:36:31 +0000 Subject: [PATCH 13/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/g4x.py | 73 +++++++------------------------ 1 file changed, 17 insertions(+), 56 deletions(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 9d9b6d94..c61759ca 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -2,11 +2,10 @@ import re from pathlib import Path -from typing import Union import dask.dataframe as dd -import numpy as np import glymur +import numpy as np import PIL from anndata.io import read_h5ad from dask.array.image import imread @@ -28,7 +27,6 @@ __all__ = ["g4x"] - @inject_docs(xx=G4XKeys) def g4x( input_path: str | Path, @@ -83,25 +81,15 @@ def g4x( output_path = Path(output_path) # Determine if input_path is a run directory or a single sample directory - if any( - p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) 
for p in input_path.iterdir() - ): + if any(p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir()): # Run directory with multiple samples - sample_input_paths = [ - p - for p in input_path.iterdir() - if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) - ] + sample_input_paths = [p for p in input_path.iterdir() if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name)] logger.debug(f"Found {len(sample_input_paths)} samples.") if output_path is None: - sample_output_paths = [ - input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths - ] + sample_output_paths = [input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths] else: - sample_output_paths = [ - output_path / f"{p.name}.zarr" for p in sample_input_paths - ] + sample_output_paths = [output_path / f"{p.name}.zarr" for p in sample_input_paths] sdatas = [] for sample_input_path, sample_output_path in tqdm( @@ -191,9 +179,7 @@ def g4x_sample( output_zarr_path = Path(output_zarr_path) if output_zarr_path.suffix != ".zarr": logger.error(f"Output path must end with '.zarr'. Got {output_zarr_path}") - raise ValueError( - f"Output path must end with '.zarr'. Got {output_zarr_path}" - ) + raise ValueError(f"Output path must end with '.zarr'. Got {output_zarr_path}") if mode not in ["append", "overwrite"]: msg = f"Invalid mode '{mode}'. Must be one of: 'append', 'overwrite'" @@ -356,9 +342,7 @@ def _write_he( # Create Image2DModel and write logger.debug(f"Creating Image2DModel for {img_key}") kwargs["dims"] = ["c", "y", "x"] if "dims" not in kwargs else kwargs["dims"] - kwargs["scale_factors"] = ( - [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] - ) + kwargs["scale_factors"] = [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] kwargs["chunks"] = [1, 1024, 1024] if "chunks" not in kwargs else kwargs["chunks"] sdata[img_key] = Image2DModel.parse(img, **kwargs) logger.debug(f"Writing Image2DModel for {img_key}") @@ -430,9 +414,7 @@ def _write_segmentation( f"shapes/{shapes_nuclei_exp_key}", ] - if mode == "append" and any( - p in sdata.elements_paths_on_disk() for p in elements_paths - ): + if mode == "append" and any(p in sdata.elements_paths_on_disk() for p in elements_paths): logger.debug("Segmentation already exist. 
Skipping...") return elif mode == "overwrite": @@ -440,10 +422,7 @@ def _write_segmentation( for el in elements: if el in sdata: del sdata[el] - if ( - f"labels/{el}" in sdata.elements_paths_on_disk() - or f"shapes/{el}" in sdata.elements_paths_on_disk() - ): + if f"labels/{el}" in sdata.elements_paths_on_disk() or f"shapes/{el}" in sdata.elements_paths_on_disk(): sdata.delete_element_from_disk(el) # Load and process segmentation data @@ -465,16 +444,12 @@ def _write_segmentation( # Nuclei shapes sdata[shapes_nuclei_key] = to_polygons(sdata[nuclei_key]).set_index("label") - sdata[shapes_nuclei_key].geometry = sdata[shapes_nuclei_key].translate( - xoff=offset, yoff=offset - ) + sdata[shapes_nuclei_key].geometry = sdata[shapes_nuclei_key].translate(xoff=offset, yoff=offset) sdata[shapes_nuclei_key].index = sdata[shapes_nuclei_key].index.astype(str) # Expanded nuclei shapes sdata[shapes_nuclei_exp_key] = to_polygons(sdata[nuclei_exp_key]).set_index("label") - sdata[shapes_nuclei_exp_key].geometry = sdata[shapes_nuclei_exp_key].translate( - xoff=offset, yoff=offset - ) + sdata[shapes_nuclei_exp_key].geometry = sdata[shapes_nuclei_exp_key].translate(xoff=offset, yoff=offset) sdata[shapes_nuclei_exp_key].index = sdata[shapes_nuclei_exp_key].index.astype(str) logger.debug("Writing elements") @@ -519,9 +494,7 @@ def _write_protein_images( img_list.sort() if not img_list: - logger.warning( - f"No protein images found matching pattern '{pattern}' in {protein_dir}" - ) + logger.warning(f"No protein images found matching pattern '{pattern}' in {protein_dir}") return logger.debug(f"Found {len(img_list)} protein images") @@ -546,20 +519,14 @@ def _write_protein_images( logger.debug(f"Images shape: {protein_stack.shape}") kwargs["dims"] = ["c", "y", "x"] if "dims" not in kwargs else kwargs["dims"] - kwargs["scale_factors"] = ( - [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] - ) + kwargs["scale_factors"] = [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] kwargs["chunks"] = ( - [1, protein_stack.shape[-2], protein_stack.shape[-1]] - if "chunks" not in kwargs - else kwargs["chunks"] + [1, protein_stack.shape[-2], protein_stack.shape[-1]] if "chunks" not in kwargs else kwargs["chunks"] ) # Create Image2DModel and write logger.debug("Converting to Image2DModel") - sdata[G4XKeys.PROTEIN_KEY.v] = Image2DModel.parse( - protein_stack, c_coords=channel_names, **kwargs - ) + sdata[G4XKeys.PROTEIN_KEY.v] = Image2DModel.parse(protein_stack, c_coords=channel_names, **kwargs) logger.debug("Writing protein images") sdata.write_element(G4XKeys.PROTEIN_KEY.v) @@ -632,9 +599,7 @@ def _write_transcripts( logger.debug(f"swap_xy: {swap_xy}, {type(swap_xy)}") if swap_xy: logger.debug("Swapping x and y coordinates") - transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ - [coordinates["y"], coordinates["x"]] - ] + transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[[coordinates["y"], coordinates["x"]]] pbar.set_description("Converting to PointsModel") sdata[G4XKeys.TRANSCRIPTS_KEY.v] = PointsModel.parse( @@ -711,11 +676,7 @@ def _deep_update(base_dict, update_dict): Recursively update a dictionary with another dictionary. """ for key, value in update_dict.items(): - if ( - isinstance(value, dict) - and key in base_dict - and isinstance(base_dict[key], dict) - ): + if isinstance(value, dict) and key in base_dict and isinstance(base_dict[key], dict): _deep_update(base_dict[key], value) else: base_dict[key] = value