From 9412eaa1eba66467cd6381ce80b86836a1f2778d Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Tue, 18 Feb 2025 21:14:11 +0000 Subject: [PATCH 01/13] Add G4X reader for samples and full runs --- src/spatialdata_io/__init__.py | 2 + src/spatialdata_io/_constants/_constants.py | 47 ++ src/spatialdata_io/readers/g4x.py | 621 ++++++++++++++++++++ 3 files changed, 670 insertions(+) create mode 100644 src/spatialdata_io/readers/g4x.py diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index 48f784bd..d857e2f8 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -16,8 +16,10 @@ xenium_aligned_image, xenium_explorer_selection, ) +from spatialdata_io.readers.g4x import g4x __all__ = [ + "g4x", "curio", "seqfish", "visium", diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index e4f77d5f..204bbfed 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -392,3 +392,50 @@ class VisiumHDKeys(ModeEnum): MICROSCOPE_COLROW_TO_SPOT_COLROW = ("microscope_colrow_to_spot_colrow",) SPOT_COLROW_TO_MICROSCOPE_COLROW = ("spot_colrow_to_microscope_colrow",) FILE_FORMAT = "file_format" + + +class G4XKeys(str, ModeEnum): + + # H&E + HE_DIR = "h_and_e" + HE_PATTERN = "*.jp2" + HE_IMG2DMODEL_KWARGS = { + "dims": ["y", "x", "c"], + "scale_factors": [2, 2, 2], + "chunks": "auto" + } + + # Nuclei + NUCLEI_BOUNDARIES_KEY = "nuclei" + CELL_BOUNDARIES_KEY = "nuclei_exp" + SEGMENTATION_DIR = "segmentation" + SEGMENTATION_PATTERN = "segmentation_mask.npz" + SEG_IMG2DMODEL_KWARGS = { + "dims": ["y", "x"], + "chunks": "auto" + } + + # Protein + PROTEIN_KEY = "protein" + PROTEIN_DIR = "protein" + PROTEIN_PATTERN = "*.jp2" + PROTEIN_IMG2DMODEL_KWARGS = { + "dims": ["c", "y", "x"], + "scale_factors": [2, 2, 2], + "chunks": "auto" + } + + # Transcripts + TRANSCRIPTS_DIR = "rna" + TRANSCRIPTS_PATTERN = "*transcript_table.csv.gz" + TRANSCRIPTS_COORDS = { + "x": "x_pixel_coordinate", + "y": "y_pixel_coordinate" + } + TRANSCRIPTS_FEATURE_KEY = "gene_name" + TRANSCRIPTS_SWAP_XY = True + + # Tables + TABLES_DIR = "single_cell_data" + TABLE_PATTERN = "feature_matrix.h5" + TABLE_KEY = "table" diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py new file mode 100644 index 00000000..78c7f007 --- /dev/null +++ b/src/spatialdata_io/readers/g4x.py @@ -0,0 +1,621 @@ +from __future__ import annotations + +import re +from pathlib import Path +from typing import Union + +import numpy as np +from anndata.io import read_h5ad +from dask_image.imread import imread +from spatialdata import SpatialData, to_polygons +from spatialdata._logging import logger +from spatialdata.models import ( + Image2DModel, + Labels2DModel, + PointsModel, + TableModel, +) +import dask.dataframe as dd +from tqdm.auto import tqdm + +from spatialdata_io._constants._constants import G4XKeys +from spatialdata_io._docs import inject_docs + +__all__ = ["g4x_sample", "g4x_run"] + + +@inject_docs(xx=G4XKeys) +def g4x_run( + input_path: Union[str, Path], + output_path: Union[str, Path, None] = None, + include_he: bool = True, + include_segmentation: bool = True, + include_protein: bool = True, + include_transcripts: bool = True, + include_tables: bool = True, + mode: str = "append", +): + """ + Create SpatialData objects for each sample in a run directory. + + See :func:`g4x_sample` for more details. 
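+
+    A minimal usage sketch (the paths are illustrative, not part of the dataset):
+
+    >>> sdatas = g4x_run("/data/g4x_run", output_path="/data/zarr_out")
+    >>> sdatas[0]  # SpatialData object for the first sample found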
+ + Parameters + ---------- + input_path : Union[str, Path] + Path to input directory containing run data. Assumes each subdirectory contains a sample. e.g. `input_path/A01`, `input_path/B01`, etc. + output_path : Union[str, Path] + Path to directory where SpatialData zarr stores will be written. If None, zarr stores will be written to each sample directory found in `input_path`. + include_he : bool + Include H&E image if available. + include_segmentation : bool + Include segmentation if available. + include_protein : bool + Include protein images if available. + include_transcripts : bool + Include transcript data if available. + include_tables : bool + Include tables if available. + mode : str + Mode for handling existing elements. Options: + - "append": Skip existing elements (default) + - "overwrite": Replace existing elements + Returns + ------- + sdatas : list[SpatialData] + List of SpatialData objects + """ + if isinstance(input_path, str): + input_path = Path(input_path) + if isinstance(output_path, str): + output_path = Path(output_path) + + # Make sure paths match expected format e.g. A01, B01 + sample_input_paths = [] + for p in input_path.iterdir(): + if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name): + sample_input_paths.append(p) + logger.debug(f"Found {len(sample_input_paths)} samples.") + + if output_path is None: + sample_output_paths = [ + input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths + ] + else: + sample_output_paths = [ + output_path / f"{p.name}.zarr" for p in sample_input_paths + ] + + kwargs = { + "include_he": include_he, + "include_segmentation": include_segmentation, + "include_protein": include_protein, + "include_transcripts": include_transcripts, + "include_tables": include_tables, + "mode": mode, + } + + sdatas = [] + for sample_input_path, sample_output_path in tqdm( + zip(sample_input_paths, sample_output_paths), + total=len(sample_input_paths), + desc="Processing samples", + ): + sdata = g4x_sample( + input_path=sample_input_path, + output_zarr_path=sample_output_path, + **kwargs, + ) + sdatas.append(sdata) + return sdatas + + +def g4x_sample( + input_path: Union[str, Path], + output_zarr_path: Union[str, Path], + include_he: bool = True, + include_segmentation: bool = True, + include_protein: bool = True, + include_transcripts: bool = True, + include_tables: bool = True, + mode: str = "append", +) -> SpatialData: + """ + Create a SpatialData object from a G4X sample dataset. + + This function looks for the following files: + + - ``{xx.HE_DIR!r}/{xx.HE_PATTERN!r}``: H&E images. + - ``{xx.NUCLEI_DIR!r}/{xx.NUCLEI_PATTERN!r}``: Segmentation files. + - ``{xx.PROTEIN_DIR!r}/{xx.PROTEIN_PATTERN!r}``: Protein images. + - ``{xx.TRANSCRIPTS_DIR!r}/{xx.TRANSCRIPTS_PATTERN!r}``: Transcript tables. + - ``{xx.TABLES_DIR!r}/{xx.TABLE_PATTERN!r}``: Table file. + + Parameters + ---------- + input_path : str + Path to input directory containing G4X data + output_path : str + Writes/appends to a SpatialData zarr store at this path + include_he : bool + Include H&E image if available. + include_segmentation : bool + Include segmentation if available. + include_protein : bool + Include protein images if available. + include_transcripts : bool + Include transcript data if available. + include_tables : bool + Include tables if available. 
+ mode : str + Mode for creating SpatialData object ('new' or 'append') + + Returns + ------- + SpatialData + SpatialData object containing requested data elements + """ + if isinstance(input_path, str): + input_path = Path(input_path) + if isinstance(output_zarr_path, str): + output_zarr_path = Path(output_zarr_path) + if output_zarr_path.suffix != ".zarr": + logger.error(f"Output path must end with '.zarr'. Got {output_zarr_path}") + raise ValueError( + f"Output path must end with '.zarr'. Got {output_zarr_path}" + ) + + if mode not in ["append", "overwrite"]: + msg = f"Invalid mode '{mode}'. Must be one of: 'append', 'overwrite'" + logger.error(msg) + raise ValueError(msg) + + if output_zarr_path.exists(): + logger.debug(f"Found existing {output_zarr_path}") + sdata = SpatialData.read(output_zarr_path) + else: + logger.debug(f"Creating new SpatialData object at {output_zarr_path}") + sdata = SpatialData() + sdata.write(output_zarr_path) + + # Create progress bar for main steps + steps = [] + steps.append("H&E") if include_he else None + steps.append("Segmentation") if include_segmentation else None + steps.append("Protein Images") if include_protein else None + steps.append("Transcripts") if include_transcripts else None + steps.append("Tables") if include_tables else None + with tqdm(total=len(steps)) as pbar: + if include_he: + pbar.set_description(steps[pbar.n]) + _write_he( + sdata, + he_dir=G4XKeys.HE_DIR, + pattern=G4XKeys.HE_PATTERN, + mode=mode, + **G4XKeys.HE_IMG2DMODEL_KWARGS, + ) + pbar.update(1) + + if include_segmentation: + pbar.set_description(steps[pbar.n]) + _write_segmentation( + sdata, + nuclei_dir=G4XKeys.SEGMENTATION_DIR, + pattern=G4XKeys.SEGMENTATION_PATTERN, + nuclei_key=G4XKeys.NUCLEI_BOUNDARIES_KEY, + nuclei_exp_key=G4XKeys.CELL_BOUNDARIES_KEY, + mode=mode, + **G4XKeys.SEG_IMG2DMODEL_KWARGS, + ) + pbar.update(1) + + if include_protein: + pbar.set_description(steps[pbar.n]) + _write_protein_images( + sdata, + protein_dir=G4XKeys.PROTEIN_DIR, + pattern=G4XKeys.PROTEIN_PATTERN, + mode=mode, + **G4XKeys.PROTEIN_IMG2DMODEL_KWARGS, + ) + pbar.update(1) + + if include_transcripts: + pbar.set_description(steps[pbar.n]) + _write_transcripts( + sdata, + transcripts_dir=G4XKeys.TRANSCRIPTS_DIR, + pattern=G4XKeys.TRANSCRIPTS_PATTERN, + coordinates=G4XKeys.TRANSCRIPTS_COORDS, + feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY, + swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY, + mode=mode, + ) + pbar.update(1) + + if include_tables: + pbar.set_description(steps[pbar.n]) + _write_table( + sdata, + table_path=G4XKeys.TABLE_PATTERN, + mode=mode, + ) + pbar.update(1) + + logger.debug("Done!") + + # Read back to enable lazy loading + sdata = SpatialData.read(output_zarr_path) + return sdata + + +def _write_he( + sdata: SpatialData, + he_dir: Union[str, None], + pattern: str, + mode: str = "append", + **kwargs, +): + """ + Write H&E images to SpatialData object. Each H&E image is stored as a separate object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + he_dir : Union[str, None] + Path to directory containing H&E images. If None, this step will be skipped. + pattern : str + Glob pattern for selecting H&E images. + mode : str, optional + Mode for handling existing elements. Options: + - "append": Skip if element exists (default) + - "overwrite": Replace if element exists + kwargs : dict + Additional arguments passed to Image2DModel.parse() + + Modifies + ------- + sdata : SpatialData + SpatialData object with H&E images stored in sdata["{img_name}"] e.g. 
"h_and_e" + """ + if he_dir is None: + logger.debug("H&E skipped...") + return + + # Get list of H&E images + he_dir = Path(he_dir) + if he_dir.is_file(): + he_files = [he_dir] + else: + he_files = list(Path(he_dir).glob(pattern)) + if not he_files: + logger.warning(f"No H&E images found in {he_dir}") + return + he_files.sort() + + logger.debug(f"Found {len(he_files)} H&E images") + + # Process each H&E image + for he_file in tqdm(he_files, desc="Processing H&E images", leave=False): + # Extract sample ID from filename (e.g., "C02" from "C02_digital_he.jp2") + logger.debug(f"Processing {he_file}") + img_key = he_file.stem + + # Check if element exists + if f"images/{img_key}" in sdata.elements_paths_on_disk(): + if mode == "append": + logger.debug(f"H&E image '{img_key}' already exists. Skipping...") + continue + elif mode == "overwrite": + logger.debug(f"Deleting existing H&E image '{img_key}'") + if img_key in sdata: + del sdata[img_key] + sdata.delete_element_from_disk(img_key) + + # Load and process image + logger.debug(f"Loading H&E image from {he_file}") + img = imread(str(he_file)) + if len(img.shape) == 4: + img = img[0] # [0] to remove extra dimension + elif len(img.shape) == 3: + img = img.transpose(1, 2, 0) # move first dimension to last + logger.debug(f"H&E image shape: {img.shape}") + logger.debug(f"H&E image dtype: {img.dtype}") + + # Create Image2DModel and write + logger.debug(f"Creating Image2DModel for {img_key}") + sdata[img_key] = Image2DModel.parse(img, **kwargs) + logger.debug(f"Writing Image2DModel for {img_key}") + sdata.write_element(img_key) + + +def _write_segmentation( + sdata: SpatialData, + nuclei_dir: Union[str, None], + pattern: str, + nuclei_key: str, + nuclei_exp_key: str, + mode: str = "append", + **kwargs, +): + """ + Write segmentation labels to SpatialData object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + nuclei_dir : Union[str, None] + Path to directory containing nuclei segmentation files. + If None, this step will be skipped. + pattern : str + Glob pattern for selecting nuclei segmentation files. + nuclei_key : str + Key for nuclei segmentation array in the NPZ file + nuclei_exp_key : str + Key for expanded nuclei segmentation array in the NPZ file + mode : str, optional + Mode for handling existing elements. 
Options: + - "append": Skip if elements exist (default) + - "overwrite": Replace if elements exist + kwargs : dict + Additional arguments passed to Labels2DModel.parse() + + Modifies + -------- + sdata : SpatialData + Adds the following elements: + - {nuclei_key}: Labels2DModel of nuclei segmentation + - {nuclei_exp_key}: Labels2DModel of expanded nuclei segmentation + - {nuclei_key}_shapes: Polygon shapes derived from nuclei segmentation + - {nuclei_exp_key}_shapes: Polygon shapes derived from expanded segmentation + """ + if nuclei_dir is None: + logger.debug("Segmentation skipped...") + return + + # Get list of nuclei files + nuclei_dir = Path(nuclei_dir) + nuclei_file = nuclei_dir / pattern + if not nuclei_file.exists(): + logger.warning(f"No segmentation files matching {pattern} in {nuclei_dir}") + return + + # Process each nuclei file + shapes_seg_key = f"{nuclei_key}_shapes" + shapes_exp_key = f"{nuclei_exp_key}_shapes" + + # Check if elements exist + elements = [nuclei_key, nuclei_exp_key, shapes_seg_key, shapes_exp_key] + elements_paths = [ + f"labels/{nuclei_key}", + f"labels/{nuclei_exp_key}", + f"shapes/{shapes_seg_key}", + f"shapes/{shapes_exp_key}", + ] + + if mode == "append" and any( + p in sdata.elements_paths_on_disk() for p in elements_paths + ): + logger.debug("Segmentation already exist. Skipping...") + return + elif mode == "overwrite": + logger.debug("Deleting existing segmentation elements") + for el in elements: + if el in sdata: + del sdata[el] + if ( + f"labels/{el}" in sdata.elements_paths_on_disk() + or f"shapes/{el}" in sdata.elements_paths_on_disk() + ): + sdata.delete_element_from_disk(el) + + # Load and process segmentation data + logger.debug(f"Loading segmentation data from {nuclei_file}") + nuclei_dict = np.load(nuclei_file) + nuclei_raw = nuclei_dict[nuclei_key] + nuclei_exp = nuclei_dict[nuclei_exp_key] + logger.debug(f"Nuclei masks shape: {nuclei_raw.shape}") + logger.debug(f"Cell masks shape: {nuclei_exp.shape}") + + # Create progress bar for nuclei processing steps + logger.debug("Converting to Labels2DModel") + sdata[nuclei_key] = Labels2DModel.parse(nuclei_raw, **kwargs) + sdata[nuclei_exp_key] = Labels2DModel.parse(nuclei_exp, **kwargs) + logger.debug("Converting to polygons") + sdata[shapes_seg_key] = to_polygons(sdata[nuclei_key]).reset_index(drop=True) + sdata[shapes_exp_key] = to_polygons(sdata[nuclei_exp_key]).reset_index(drop=True) + logger.debug("Writing elements") + for element in elements: + sdata.write_element(element) + + +def _write_protein_images( + sdata: SpatialData, + protein_dir: Union[str, None], + pattern: str, + mode: str = "append", + **kwargs, +): + """ + Write protein images to SpatialData object. Proteins are stored as channels in a single Image2DModel object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + protein_dir : Union[str, None] + Path to directory containing protein images. + If None, this step will be skipped. + pattern : str + Glob pattern for selecting protein images. + mode : str, optional + Mode for handling existing elements. 
Options: + - "append": Skip if element exists (default) + - "overwrite": Replace if element exists + kwargs : dict + Additional arguments passed to Image2DModel.parse() + """ + if protein_dir is None: + logger.debug("Protein skipped...") + return + + protein_dir = Path(protein_dir) + + # Get list of protein images for this sample + img_list = list(protein_dir.glob(pattern)) + img_list.sort() + + if not img_list: + logger.warning( + f"No protein images found matching pattern '{pattern}' in {protein_dir}" + ) + return + logger.debug(f"Found {len(img_list)} protein images") + + # Check if element exists + if "images/protein" in sdata.elements_paths_on_disk(): + if mode == "append": + logger.debug("Protein images already exist. Skipping...") + return + elif mode == "overwrite": + logger.debug("Deleting existing protein images") + if G4XKeys.PROTEIN_KEY in sdata: + del sdata[G4XKeys.PROTEIN_KEY] + sdata.delete_element_from_disk(G4XKeys.PROTEIN_KEY) + img_list.sort() + + # Get channel names from filenames + channel_names = [img_file.stem.split("_")[0] for img_file in img_list] + + # Load all images at once with dask imread + logger.debug("Loading protein images") + protein_stack = imread(str(protein_dir / pattern)) + logger.debug(f"Images shape: {protein_stack.shape}") + + # Create Image2DModel and write + logger.debug("Converting to Image2DModel") + sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse( + protein_stack, c_coords=channel_names, **kwargs + ) + + logger.debug("Writing protein images") + sdata.write_element(G4XKeys.PROTEIN_KEY) + + +def _write_transcripts( + sdata: SpatialData, + transcripts_dir: Union[str, None], + pattern: str, + coordinates: dict, + feature_key: str, + swap_xy: bool, + mode: str = "append", +): + """ + Write transcripts to SpatialData object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + transcripts_dir : Union[str, None] + Path to directory containing transcript tables. + pattern : str + Glob pattern for selecting transcript tables. + coordinates : dict + Dictionary mapping coordinate column names to standard x,y coordinates + feature_key : str + Column name containing transcript feature identifiers + swap_xy : bool + Whether to swap the x and y coordinates + mode : str, optional + Mode for handling existing element. Options: + - "append": Skip if element exists (default) + - "overwrite": Replace if element exists + + Modifies + -------- + sdata : SpatialData + Adds a "transcripts" PointsModel containing transcript locations and features + """ + if transcripts_dir is None: + logger.debug("Transcripts skipped...") + return + + if f"points/{G4XKeys.TRANSCRIPTS_KEY}" in sdata.elements_paths_on_disk(): + if mode == "append": + logger.debug("Transcripts already exist. 
Skipping...") + return + elif mode == "overwrite": + logger.debug("Deleting existing transcripts") + if G4XKeys.TRANSCRIPTS_KEY in sdata: + del sdata[G4XKeys.TRANSCRIPTS_KEY] + sdata.delete_element_from_disk(G4XKeys.TRANSCRIPTS_KEY) + + transcript_dir = Path(transcripts_dir) + with tqdm(total=3, desc="Processing transcripts", leave=False) as pbar: + pbar.set_description("Loading transcripts") + + if pattern.endswith(".csv") or pattern.endswith(".csv.gz"): + # list files found in transcript_dir + transcript_files = list(transcript_dir.glob(pattern)) + transcript_files.sort() + logger.debug(f"Found {len(transcript_files)} transcript files") + transcripts = dd.read_csv(transcript_files).compute().reset_index(drop=True) + else: + raise ValueError(f"Unsupported file type: {transcript_dir / pattern}") + pbar.update(1) + + if swap_xy: + transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ + [coordinates["y"], coordinates["x"]] + ] + + pbar.set_description("Converting to PointsModel") + sdata[G4XKeys.TRANSCRIPTS_KEY] = PointsModel.parse( + transcripts, + coordinates=coordinates, + feature_key=feature_key, + ) + pbar.update(1) + + pbar.set_description("Writing to disk") + sdata.write_element(G4XKeys.TRANSCRIPTS_KEY) + pbar.update(1) + + +def _write_table( + sdata: SpatialData, + table_path: Union[str, None], + mode: str = "append", +): + """ + Write tables to SpatialData object. + """ + if table_path is None: + logger.debug("Table skipped...") + return + + adata = read_h5ad(table_path) + sdata[G4XKeys.TABLE_KEY] = TableModel.parse(adata) + + logger.debug("Writing table to disk") + sdata.write_element(G4XKeys.TABLE_KEY) + + +def _deep_update(base_dict, update_dict): + """ + Recursively update a dictionary with another dictionary. + """ + for key, value in update_dict.items(): + if ( + isinstance(value, dict) + and key in base_dict + and isinstance(base_dict[key], dict) + ): + _deep_update(base_dict[key], value) + else: + base_dict[key] = value From 8099b686ed451d44dcaff10d0c23b6b24f9aaf4f Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Tue, 18 Feb 2025 21:37:18 +0000 Subject: [PATCH 02/13] Refactor g4x function to support single sample and run directory processing --- src/spatialdata_io/readers/g4x.py | 99 ++++++++++++++++++------------- 1 file changed, 58 insertions(+), 41 deletions(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 78c7f007..82c2946e 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -21,11 +21,11 @@ from spatialdata_io._constants._constants import G4XKeys from spatialdata_io._docs import inject_docs -__all__ = ["g4x_sample", "g4x_run"] +__all__ = ["g4x"] @inject_docs(xx=G4XKeys) -def g4x_run( +def g4x( input_path: Union[str, Path], output_path: Union[str, Path, None] = None, include_he: bool = True, @@ -36,14 +36,13 @@ def g4x_run( mode: str = "append", ): """ - Create SpatialData objects for each sample in a run directory. - - See :func:`g4x_sample` for more details. + Create SpatialData objects for each sample in a run directory or a single sample directory. Parameters ---------- input_path : Union[str, Path] - Path to input directory containing run data. Assumes each subdirectory contains a sample. e.g. `input_path/A01`, `input_path/B01`, etc. + Path to input directory containing run data or a single sample directory. + If a run directory, assumes each subdirectory contains a sample. e.g. `input_path/A01`, `input_path/B01`, etc. 
output_path : Union[str, Path] Path to directory where SpatialData zarr stores will be written. If None, zarr stores will be written to each sample directory found in `input_path`. include_he : bool @@ -62,52 +61,70 @@ def g4x_run( - "overwrite": Replace existing elements Returns ------- - sdatas : list[SpatialData] - List of SpatialData objects + sdatas : Union[SpatialData, list[SpatialData]] + A single SpatialData object if processing a single sample directory, otherwise a list of SpatialData objects. """ if isinstance(input_path, str): input_path = Path(input_path) if isinstance(output_path, str): output_path = Path(output_path) - # Make sure paths match expected format e.g. A01, B01 - sample_input_paths = [] - for p in input_path.iterdir(): - if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name): - sample_input_paths.append(p) - logger.debug(f"Found {len(sample_input_paths)} samples.") - - if output_path is None: - sample_output_paths = [ - input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths + # Determine if input_path is a run directory or a single sample directory + if any( + p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir() + ): + # Run directory with multiple samples + sample_input_paths = [ + p + for p in input_path.iterdir() + if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) ] + logger.debug(f"Found {len(sample_input_paths)} samples.") + + if output_path is None: + sample_output_paths = [ + input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths + ] + else: + sample_output_paths = [ + output_path / f"{p.name}.zarr" for p in sample_input_paths + ] + + sdatas = [] + for sample_input_path, sample_output_path in tqdm( + zip(sample_input_paths, sample_output_paths), + total=len(sample_input_paths), + desc="Processing samples", + ): + sdata = g4x_sample( + input_path=sample_input_path, + output_zarr_path=sample_output_path, + include_he=include_he, + include_segmentation=include_segmentation, + include_protein=include_protein, + include_transcripts=include_transcripts, + include_tables=include_tables, + mode=mode, + ) + sdatas.append(sdata) + return sdatas else: - sample_output_paths = [ - output_path / f"{p.name}.zarr" for p in sample_input_paths - ] + # Single sample directory + logger.debug("Processing single sample directory.") + if output_path is None: + output_path = input_path / f"{input_path.name}.zarr" - kwargs = { - "include_he": include_he, - "include_segmentation": include_segmentation, - "include_protein": include_protein, - "include_transcripts": include_transcripts, - "include_tables": include_tables, - "mode": mode, - } - - sdatas = [] - for sample_input_path, sample_output_path in tqdm( - zip(sample_input_paths, sample_output_paths), - total=len(sample_input_paths), - desc="Processing samples", - ): sdata = g4x_sample( - input_path=sample_input_path, - output_zarr_path=sample_output_path, - **kwargs, + input_path=input_path, + output_zarr_path=output_path, + include_he=include_he, + include_segmentation=include_segmentation, + include_protein=include_protein, + include_transcripts=include_transcripts, + include_tables=include_tables, + mode=mode, ) - sdatas.append(sdata) - return sdatas + return sdata def g4x_sample( From 03fb7af5fc570aacba43a40cdb55964972062f1d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Feb 2025 01:32:30 +0000 Subject: [PATCH 03/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --- src/spatialdata_io/__init__.py | 2 +- src/spatialdata_io/_constants/_constants.py | 22 ++----- src/spatialdata_io/readers/g4x.py | 69 +++++++-------------- 3 files changed, 26 insertions(+), 67 deletions(-) diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index f05ab5b4..14af87cb 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -5,8 +5,8 @@ from spatialdata_io.readers.cosmx import cosmx from spatialdata_io.readers.curio import curio from spatialdata_io.readers.dbit import dbit -from spatialdata_io.readers.generic import generic, geojson, image from spatialdata_io.readers.g4x import g4x +from spatialdata_io.readers.generic import generic, geojson, image from spatialdata_io.readers.macsima import macsima from spatialdata_io.readers.mcmicro import mcmicro from spatialdata_io.readers.merscope import merscope diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 5edb44de..cbbddba7 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -406,39 +406,25 @@ class G4XKeys(str, ModeEnum): # H&E HE_DIR = "h_and_e" HE_PATTERN = "*.jp2" - HE_IMG2DMODEL_KWARGS = { - "dims": ["y", "x", "c"], - "scale_factors": [2, 2, 2], - "chunks": "auto" - } + HE_IMG2DMODEL_KWARGS = {"dims": ["y", "x", "c"], "scale_factors": [2, 2, 2], "chunks": "auto"} # Nuclei NUCLEI_BOUNDARIES_KEY = "nuclei" CELL_BOUNDARIES_KEY = "nuclei_exp" SEGMENTATION_DIR = "segmentation" SEGMENTATION_PATTERN = "segmentation_mask.npz" - SEG_IMG2DMODEL_KWARGS = { - "dims": ["y", "x"], - "chunks": "auto" - } + SEG_IMG2DMODEL_KWARGS = {"dims": ["y", "x"], "chunks": "auto"} # Protein PROTEIN_KEY = "protein" PROTEIN_DIR = "protein" PROTEIN_PATTERN = "*.jp2" - PROTEIN_IMG2DMODEL_KWARGS = { - "dims": ["c", "y", "x"], - "scale_factors": [2, 2, 2], - "chunks": "auto" - } + PROTEIN_IMG2DMODEL_KWARGS = {"dims": ["c", "y", "x"], "scale_factors": [2, 2, 2], "chunks": "auto"} # Transcripts TRANSCRIPTS_DIR = "rna" TRANSCRIPTS_PATTERN = "*transcript_table.csv.gz" - TRANSCRIPTS_COORDS = { - "x": "x_pixel_coordinate", - "y": "y_pixel_coordinate" - } + TRANSCRIPTS_COORDS = {"x": "x_pixel_coordinate", "y": "y_pixel_coordinate"} TRANSCRIPTS_FEATURE_KEY = "gene_name" TRANSCRIPTS_SWAP_XY = True diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 82c2946e..58569ea9 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Union +import dask.dataframe as dd import numpy as np from anndata.io import read_h5ad from dask_image.imread import imread @@ -15,7 +16,6 @@ PointsModel, TableModel, ) -import dask.dataframe as dd from tqdm.auto import tqdm from spatialdata_io._constants._constants import G4XKeys @@ -26,8 +26,8 @@ @inject_docs(xx=G4XKeys) def g4x( - input_path: Union[str, Path], - output_path: Union[str, Path, None] = None, + input_path: str | Path, + output_path: str | Path | None = None, include_he: bool = True, include_segmentation: bool = True, include_protein: bool = True, @@ -70,25 +70,15 @@ def g4x( output_path = Path(output_path) # Determine if input_path is a run directory or a single sample directory - if any( - p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir() - ): + if any(p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir()): # Run directory with multiple samples - 
sample_input_paths = [ - p - for p in input_path.iterdir() - if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) - ] + sample_input_paths = [p for p in input_path.iterdir() if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name)] logger.debug(f"Found {len(sample_input_paths)} samples.") if output_path is None: - sample_output_paths = [ - input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths - ] + sample_output_paths = [input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths] else: - sample_output_paths = [ - output_path / f"{p.name}.zarr" for p in sample_input_paths - ] + sample_output_paths = [output_path / f"{p.name}.zarr" for p in sample_input_paths] sdatas = [] for sample_input_path, sample_output_path in tqdm( @@ -128,8 +118,8 @@ def g4x( def g4x_sample( - input_path: Union[str, Path], - output_zarr_path: Union[str, Path], + input_path: str | Path, + output_zarr_path: str | Path, include_he: bool = True, include_segmentation: bool = True, include_protein: bool = True, @@ -178,9 +168,7 @@ def g4x_sample( output_zarr_path = Path(output_zarr_path) if output_zarr_path.suffix != ".zarr": logger.error(f"Output path must end with '.zarr'. Got {output_zarr_path}") - raise ValueError( - f"Output path must end with '.zarr'. Got {output_zarr_path}" - ) + raise ValueError(f"Output path must end with '.zarr'. Got {output_zarr_path}") if mode not in ["append", "overwrite"]: msg = f"Invalid mode '{mode}'. Must be one of: 'append', 'overwrite'" @@ -269,7 +257,7 @@ def g4x_sample( def _write_he( sdata: SpatialData, - he_dir: Union[str, None], + he_dir: str | None, pattern: str, mode: str = "append", **kwargs, @@ -350,7 +338,7 @@ def _write_he( def _write_segmentation( sdata: SpatialData, - nuclei_dir: Union[str, None], + nuclei_dir: str | None, pattern: str, nuclei_key: str, nuclei_exp_key: str, @@ -413,9 +401,7 @@ def _write_segmentation( f"shapes/{shapes_exp_key}", ] - if mode == "append" and any( - p in sdata.elements_paths_on_disk() for p in elements_paths - ): + if mode == "append" and any(p in sdata.elements_paths_on_disk() for p in elements_paths): logger.debug("Segmentation already exist. 
Skipping...") return elif mode == "overwrite": @@ -423,10 +409,7 @@ def _write_segmentation( for el in elements: if el in sdata: del sdata[el] - if ( - f"labels/{el}" in sdata.elements_paths_on_disk() - or f"shapes/{el}" in sdata.elements_paths_on_disk() - ): + if f"labels/{el}" in sdata.elements_paths_on_disk() or f"shapes/{el}" in sdata.elements_paths_on_disk(): sdata.delete_element_from_disk(el) # Load and process segmentation data @@ -451,7 +434,7 @@ def _write_segmentation( def _write_protein_images( sdata: SpatialData, - protein_dir: Union[str, None], + protein_dir: str | None, pattern: str, mode: str = "append", **kwargs, @@ -486,9 +469,7 @@ def _write_protein_images( img_list.sort() if not img_list: - logger.warning( - f"No protein images found matching pattern '{pattern}' in {protein_dir}" - ) + logger.warning(f"No protein images found matching pattern '{pattern}' in {protein_dir}") return logger.debug(f"Found {len(img_list)} protein images") @@ -514,9 +495,7 @@ def _write_protein_images( # Create Image2DModel and write logger.debug("Converting to Image2DModel") - sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse( - protein_stack, c_coords=channel_names, **kwargs - ) + sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse(protein_stack, c_coords=channel_names, **kwargs) logger.debug("Writing protein images") sdata.write_element(G4XKeys.PROTEIN_KEY) @@ -524,7 +503,7 @@ def _write_protein_images( def _write_transcripts( sdata: SpatialData, - transcripts_dir: Union[str, None], + transcripts_dir: str | None, pattern: str, coordinates: dict, feature_key: str, @@ -587,9 +566,7 @@ def _write_transcripts( pbar.update(1) if swap_xy: - transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ - [coordinates["y"], coordinates["x"]] - ] + transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[[coordinates["y"], coordinates["x"]]] pbar.set_description("Converting to PointsModel") sdata[G4XKeys.TRANSCRIPTS_KEY] = PointsModel.parse( @@ -606,7 +583,7 @@ def _write_transcripts( def _write_table( sdata: SpatialData, - table_path: Union[str, None], + table_path: str | None, mode: str = "append", ): """ @@ -628,11 +605,7 @@ def _deep_update(base_dict, update_dict): Recursively update a dictionary with another dictionary. 
""" for key, value in update_dict.items(): - if ( - isinstance(value, dict) - and key in base_dict - and isinstance(base_dict[key], dict) - ): + if isinstance(value, dict) and key in base_dict and isinstance(base_dict[key], dict): _deep_update(base_dict[key], value) else: base_dict[key] = value From 667f3611aa889d41737b5c2bf77fe35ac29152da Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Mon, 17 Mar 2025 17:51:39 -0700 Subject: [PATCH 04/13] simplify constants to strings --- src/spatialdata_io/_constants/_constants.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index cbbddba7..acf12625 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -401,30 +401,29 @@ class VisiumHDKeys(ModeEnum): FILE_FORMAT = "file_format" -class G4XKeys(str, ModeEnum): +class G4XKeys(ModeEnum): # H&E HE_DIR = "h_and_e" HE_PATTERN = "*.jp2" - HE_IMG2DMODEL_KWARGS = {"dims": ["y", "x", "c"], "scale_factors": [2, 2, 2], "chunks": "auto"} # Nuclei NUCLEI_BOUNDARIES_KEY = "nuclei" CELL_BOUNDARIES_KEY = "nuclei_exp" SEGMENTATION_DIR = "segmentation" SEGMENTATION_PATTERN = "segmentation_mask.npz" - SEG_IMG2DMODEL_KWARGS = {"dims": ["y", "x"], "chunks": "auto"} # Protein PROTEIN_KEY = "protein" PROTEIN_DIR = "protein" PROTEIN_PATTERN = "*.jp2" - PROTEIN_IMG2DMODEL_KWARGS = {"dims": ["c", "y", "x"], "scale_factors": [2, 2, 2], "chunks": "auto"} # Transcripts + TRANSCRIPTS_KEY = "transcripts" TRANSCRIPTS_DIR = "rna" TRANSCRIPTS_PATTERN = "*transcript_table.csv.gz" - TRANSCRIPTS_COORDS = {"x": "x_pixel_coordinate", "y": "y_pixel_coordinate"} + TRANSCRIPTS_COORD_X = "x_pixel_coordinate" + TRANSCRIPTS_COORD_Y = "y_pixel_coordinate" TRANSCRIPTS_FEATURE_KEY = "gene_name" TRANSCRIPTS_SWAP_XY = True From d75db969dd334ee387dedf0bfb54b3fa9d5b717b Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Mon, 17 Mar 2025 17:54:52 -0700 Subject: [PATCH 05/13] fix input paths, link table annotations to cells, simplify image model kwargs, use dask array imread --- src/spatialdata_io/readers/g4x.py | 159 +++++++++++++++++++++++------- 1 file changed, 122 insertions(+), 37 deletions(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 58569ea9..17ee6177 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -7,7 +7,7 @@ import dask.dataframe as dd import numpy as np from anndata.io import read_h5ad -from dask_image.imread import imread +from dask.array.image import imread from spatialdata import SpatialData, to_polygons from spatialdata._logging import logger from spatialdata.models import ( @@ -70,15 +70,25 @@ def g4x( output_path = Path(output_path) # Determine if input_path is a run directory or a single sample directory - if any(p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir()): + if any( + p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir() + ): # Run directory with multiple samples - sample_input_paths = [p for p in input_path.iterdir() if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name)] + sample_input_paths = [ + p + for p in input_path.iterdir() + if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) + ] logger.debug(f"Found {len(sample_input_paths)} samples.") if output_path is None: - sample_output_paths = [input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths] + sample_output_paths = [ + input_path / p.name / 
f"{p.name}.zarr" for p in sample_input_paths + ] else: - sample_output_paths = [output_path / f"{p.name}.zarr" for p in sample_input_paths] + sample_output_paths = [ + output_path / f"{p.name}.zarr" for p in sample_input_paths + ] sdatas = [] for sample_input_path, sample_output_path in tqdm( @@ -168,7 +178,9 @@ def g4x_sample( output_zarr_path = Path(output_zarr_path) if output_zarr_path.suffix != ".zarr": logger.error(f"Output path must end with '.zarr'. Got {output_zarr_path}") - raise ValueError(f"Output path must end with '.zarr'. Got {output_zarr_path}") + raise ValueError( + f"Output path must end with '.zarr'. Got {output_zarr_path}" + ) if mode not in ["append", "overwrite"]: msg = f"Invalid mode '{mode}'. Must be one of: 'append', 'overwrite'" @@ -195,10 +207,9 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_he( sdata, - he_dir=G4XKeys.HE_DIR, + he_dir=input_path / G4XKeys.HE_DIR, pattern=G4XKeys.HE_PATTERN, mode=mode, - **G4XKeys.HE_IMG2DMODEL_KWARGS, ) pbar.update(1) @@ -206,12 +217,11 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_segmentation( sdata, - nuclei_dir=G4XKeys.SEGMENTATION_DIR, + nuclei_dir=input_path / G4XKeys.SEGMENTATION_DIR, pattern=G4XKeys.SEGMENTATION_PATTERN, nuclei_key=G4XKeys.NUCLEI_BOUNDARIES_KEY, nuclei_exp_key=G4XKeys.CELL_BOUNDARIES_KEY, mode=mode, - **G4XKeys.SEG_IMG2DMODEL_KWARGS, ) pbar.update(1) @@ -219,10 +229,9 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_protein_images( sdata, - protein_dir=G4XKeys.PROTEIN_DIR, + protein_dir=input_path / G4XKeys.PROTEIN_DIR, pattern=G4XKeys.PROTEIN_PATTERN, mode=mode, - **G4XKeys.PROTEIN_IMG2DMODEL_KWARGS, ) pbar.update(1) @@ -230,9 +239,12 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_transcripts( sdata, - transcripts_dir=G4XKeys.TRANSCRIPTS_DIR, + transcripts_dir=input_path / G4XKeys.TRANSCRIPTS_DIR, pattern=G4XKeys.TRANSCRIPTS_PATTERN, - coordinates=G4XKeys.TRANSCRIPTS_COORDS, + coordinates={ + "x": G4XKeys.TRANSCRIPTS_COORD_X, + "y": G4XKeys.TRANSCRIPTS_COORD_Y, + }, feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY, swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY, mode=mode, @@ -243,7 +255,7 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_table( sdata, - table_path=G4XKeys.TABLE_PATTERN, + table_path=input_path / G4XKeys.TABLES_DIR / G4XKeys.TABLE_PATTERN, mode=mode, ) pbar.update(1) @@ -278,7 +290,7 @@ def _write_he( - "append": Skip if element exists (default) - "overwrite": Replace if element exists kwargs : dict - Additional arguments passed to Image2DModel.parse() + Keyword arguments for Image2DModel Modifies ------- @@ -321,16 +333,20 @@ def _write_he( # Load and process image logger.debug(f"Loading H&E image from {he_file}") - img = imread(str(he_file)) - if len(img.shape) == 4: - img = img[0] # [0] to remove extra dimension - elif len(img.shape) == 3: - img = img.transpose(1, 2, 0) # move first dimension to last + img = imread(str(he_file)).compute().squeeze() logger.debug(f"H&E image shape: {img.shape}") logger.debug(f"H&E image dtype: {img.dtype}") - + if len(img.shape) == 2: + img = img[np.newaxis, :, :] + elif len(img.shape) == 3: + img = img.transpose(2, 0, 1) # Create Image2DModel and write logger.debug(f"Creating Image2DModel for {img_key}") + kwargs["dims"] = ["c", "y", "x"] if "dims" not in kwargs else kwargs["dims"] + kwargs["scale_factors"] = ( + [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] + ) + kwargs["chunks"] = "auto" if "chunks" not in kwargs else kwargs["chunks"] sdata[img_key] = 
Image2DModel.parse(img, **kwargs) logger.debug(f"Writing Image2DModel for {img_key}") sdata.write_element(img_key) @@ -366,7 +382,7 @@ def _write_segmentation( - "append": Skip if elements exist (default) - "overwrite": Replace if elements exist kwargs : dict - Additional arguments passed to Labels2DModel.parse() + Keyword arguments for Labels2DModel Modifies -------- @@ -389,19 +405,21 @@ def _write_segmentation( return # Process each nuclei file - shapes_seg_key = f"{nuclei_key}_shapes" - shapes_exp_key = f"{nuclei_exp_key}_shapes" + shapes_nuclei_key = f"{nuclei_key}_shapes" + shapes_nuclei_exp_key = f"{nuclei_exp_key}_shapes" # Check if elements exist - elements = [nuclei_key, nuclei_exp_key, shapes_seg_key, shapes_exp_key] + elements = [nuclei_key, nuclei_exp_key, shapes_nuclei_key, shapes_nuclei_exp_key] elements_paths = [ f"labels/{nuclei_key}", f"labels/{nuclei_exp_key}", - f"shapes/{shapes_seg_key}", - f"shapes/{shapes_exp_key}", + f"shapes/{shapes_nuclei_key}", + f"shapes/{shapes_nuclei_exp_key}", ] - if mode == "append" and any(p in sdata.elements_paths_on_disk() for p in elements_paths): + if mode == "append" and any( + p in sdata.elements_paths_on_disk() for p in elements_paths + ): logger.debug("Segmentation already exist. Skipping...") return elif mode == "overwrite": @@ -409,7 +427,10 @@ def _write_segmentation( for el in elements: if el in sdata: del sdata[el] - if f"labels/{el}" in sdata.elements_paths_on_disk() or f"shapes/{el}" in sdata.elements_paths_on_disk(): + if ( + f"labels/{el}" in sdata.elements_paths_on_disk() + or f"shapes/{el}" in sdata.elements_paths_on_disk() + ): sdata.delete_element_from_disk(el) # Load and process segmentation data @@ -425,8 +446,14 @@ def _write_segmentation( sdata[nuclei_key] = Labels2DModel.parse(nuclei_raw, **kwargs) sdata[nuclei_exp_key] = Labels2DModel.parse(nuclei_exp, **kwargs) logger.debug("Converting to polygons") - sdata[shapes_seg_key] = to_polygons(sdata[nuclei_key]).reset_index(drop=True) - sdata[shapes_exp_key] = to_polygons(sdata[nuclei_exp_key]).reset_index(drop=True) + sdata[shapes_nuclei_key] = to_polygons(sdata[nuclei_key]).reset_index(drop=True) + sdata[shapes_nuclei_exp_key] = to_polygons(sdata[nuclei_exp_key]).reset_index( + drop=True + ) + # Set index for shapes + sdata[shapes_nuclei_exp_key] = sdata[shapes_nuclei_exp_key].set_index("label") + sdata[shapes_nuclei_exp_key].index = sdata[shapes_nuclei_exp_key].index.astype(str) + logger.debug("Writing elements") for element in elements: sdata.write_element(element) @@ -456,7 +483,7 @@ def _write_protein_images( - "append": Skip if element exists (default) - "overwrite": Replace if element exists kwargs : dict - Additional arguments passed to Image2DModel.parse() + Keyword arguments for Image2DModel """ if protein_dir is None: logger.debug("Protein skipped...") @@ -469,7 +496,9 @@ def _write_protein_images( img_list.sort() if not img_list: - logger.warning(f"No protein images found matching pattern '{pattern}' in {protein_dir}") + logger.warning( + f"No protein images found matching pattern '{pattern}' in {protein_dir}" + ) return logger.debug(f"Found {len(img_list)} protein images") @@ -493,9 +522,21 @@ def _write_protein_images( protein_stack = imread(str(protein_dir / pattern)) logger.debug(f"Images shape: {protein_stack.shape}") + kwargs["dims"] = ["c", "y", "x"] if "dims" not in kwargs else kwargs["dims"] + kwargs["scale_factors"] = ( + [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] + ) + kwargs["chunks"] = ( + [1, 
protein_stack.shape[-2], protein_stack.shape[-1]] + if "chunks" not in kwargs + else kwargs["chunks"] + ) + # Create Image2DModel and write logger.debug("Converting to Image2DModel") - sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse(protein_stack, c_coords=channel_names, **kwargs) + sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse( + protein_stack, c_coords=channel_names, **kwargs + ) logger.debug("Writing protein images") sdata.write_element(G4XKeys.PROTEIN_KEY) @@ -566,7 +607,9 @@ def _write_transcripts( pbar.update(1) if swap_xy: - transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[[coordinates["y"], coordinates["x"]]] + transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ + [coordinates["y"], coordinates["x"]] + ] pbar.set_description("Converting to PointsModel") sdata[G4XKeys.TRANSCRIPTS_KEY] = PointsModel.parse( @@ -588,13 +631,51 @@ def _write_table( ): """ Write tables to SpatialData object. + + Parameters + ---------- + sdata : SpatialData + SpatialData object to write to + table_path : Union[str, None] + Path to the table file. + If None, this step will be skipped. + mode : str, optional + Mode for handling existing elements. Options: + - "append": Skip if element exists (default) + - "overwrite": Replace if element exists + + Modifies + -------- + sdata : SpatialData + Adds a table element to the SpatialData object """ if table_path is None: logger.debug("Table skipped...") return + if f"tables/{G4XKeys.TABLE_KEY}" in sdata.elements_paths_on_disk(): + if mode == "append": + logger.debug("Table already exists. Skipping...") + return + elif mode == "overwrite": + logger.debug("Deleting existing table") + if G4XKeys.TABLE_KEY in sdata: + del sdata[G4XKeys.TABLE_KEY] + sdata.delete_element_from_disk(G4XKeys.TABLE_KEY) + adata = read_h5ad(table_path) + + # Link table annotations to cell shapes + shape_key = f"{G4XKeys.CELL_BOUNDARIES_KEY}_shapes" + adata.obs["region"] = shape_key + adata.obs["label"] = adata.obs["cell_id"].str.split("-").str[1] sdata[G4XKeys.TABLE_KEY] = TableModel.parse(adata) + sdata.set_table_annotates_spatialelement( + G4XKeys.TABLE_KEY, + region=shape_key, + region_key="region", + instance_key="label", + ) logger.debug("Writing table to disk") sdata.write_element(G4XKeys.TABLE_KEY) @@ -605,7 +686,11 @@ def _deep_update(base_dict, update_dict): Recursively update a dictionary with another dictionary. 
""" for key, value in update_dict.items(): - if isinstance(value, dict) and key in base_dict and isinstance(base_dict[key], dict): + if ( + isinstance(value, dict) + and key in base_dict + and isinstance(base_dict[key], dict) + ): _deep_update(base_dict[key], value) else: base_dict[key] = value From be6eca33932c9025ae7fa7667a92f314958e83f6 Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Mon, 17 Mar 2025 18:14:33 -0700 Subject: [PATCH 06/13] swap axes for tx no longer needed --- src/spatialdata_io/_constants/_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index acf12625..021b9d57 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -425,7 +425,7 @@ class G4XKeys(ModeEnum): TRANSCRIPTS_COORD_X = "x_pixel_coordinate" TRANSCRIPTS_COORD_Y = "y_pixel_coordinate" TRANSCRIPTS_FEATURE_KEY = "gene_name" - TRANSCRIPTS_SWAP_XY = True + TRANSCRIPTS_SWAP_XY = False # Tables TABLES_DIR = "single_cell_data" From 3db3877c2b437a3b317cf03922b251a0fb77cb58 Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Wed, 19 Mar 2025 16:18:06 -0700 Subject: [PATCH 07/13] parse bool properly for swap_xy --- src/spatialdata_io/readers/g4x.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 17ee6177..84e189e1 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -246,7 +246,7 @@ def g4x_sample( "y": G4XKeys.TRANSCRIPTS_COORD_Y, }, feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY, - swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY, + swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY == "True", mode=mode, ) pbar.update(1) @@ -606,7 +606,9 @@ def _write_transcripts( raise ValueError(f"Unsupported file type: {transcript_dir / pattern}") pbar.update(1) + logger.debug(f"swap_xy: {swap_xy}, {type(swap_xy)}") if swap_xy: + logger.debug("Swapping x and y coordinates") transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ [coordinates["y"], coordinates["x"]] ] From 8d7a41a202e0023ccb1edba69afc034873f94034 Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Wed, 19 Mar 2025 16:20:34 -0700 Subject: [PATCH 08/13] introduce offset to generated shapes --- src/spatialdata_io/_constants/_constants.py | 1 + src/spatialdata_io/readers/g4x.py | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 021b9d57..c56d3069 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -412,6 +412,7 @@ class G4XKeys(ModeEnum): CELL_BOUNDARIES_KEY = "nuclei_exp" SEGMENTATION_DIR = "segmentation" SEGMENTATION_PATTERN = "segmentation_mask.npz" + OFFSET = "-0.5" # Protein PROTEIN_KEY = "protein" diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 84e189e1..dff96c34 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -445,13 +445,23 @@ def _write_segmentation( logger.debug("Converting to Labels2DModel") sdata[nuclei_key] = Labels2DModel.parse(nuclei_raw, **kwargs) sdata[nuclei_exp_key] = Labels2DModel.parse(nuclei_exp, **kwargs) + + # Convert labels to polygons using "label" as index and translating xy coordinates to (almost) match label pixel coordinates logger.debug("Converting to polygons") - sdata[shapes_nuclei_key] = 
to_polygons(sdata[nuclei_key]).reset_index(drop=True) - sdata[shapes_nuclei_exp_key] = to_polygons(sdata[nuclei_exp_key]).reset_index( - drop=True + offset = float(G4XKeys.OFFSET) + + # Nuclei shapes + sdata[shapes_nuclei_key] = to_polygons(sdata[nuclei_key]).set_index("label") + sdata[shapes_nuclei_key].geometry = sdata[shapes_nuclei_key].translate( + xoff=offset, yoff=offset + ) + sdata[shapes_nuclei_key].index = sdata[shapes_nuclei_key].index.astype(str) + + # Expanded nuclei shapes + sdata[shapes_nuclei_exp_key] = to_polygons(sdata[nuclei_exp_key]).set_index("label") + sdata[shapes_nuclei_exp_key].geometry = sdata[shapes_nuclei_exp_key].translate( + xoff=offset, yoff=offset ) - # Set index for shapes - sdata[shapes_nuclei_exp_key] = sdata[shapes_nuclei_exp_key].set_index("label") sdata[shapes_nuclei_exp_key].index = sdata[shapes_nuclei_exp_key].index.astype(str) logger.debug("Writing elements") From a7a1f639abd58f1b7ba161df34d38478504e9dcb Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Wed, 19 Mar 2025 16:25:49 -0700 Subject: [PATCH 09/13] read 16bit jp2 images with glymur --- pyproject.toml | 1 + src/spatialdata_io/readers/g4x.py | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 81f0ae28..8c92ee6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "readfcs", "tifffile>=2023.8.12", "ome-types", + "glymur", ] [project.optional-dependencies] diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index dff96c34..3d636606 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -6,6 +6,7 @@ import dask.dataframe as dd import numpy as np +import glymur from anndata.io import read_h5ad from dask.array.image import imread from spatialdata import SpatialData, to_polygons @@ -333,7 +334,7 @@ def _write_he( # Load and process image logger.debug(f"Loading H&E image from {he_file}") - img = imread(str(he_file)).compute().squeeze() + img = imread(str(he_file), imread=_glymur_imread).compute().squeeze() logger.debug(f"H&E image shape: {img.shape}") logger.debug(f"H&E image dtype: {img.dtype}") if len(img.shape) == 2: @@ -706,3 +707,10 @@ def _deep_update(base_dict, update_dict): _deep_update(base_dict[key], value) else: base_dict[key] = value + + +def _glymur_imread(img_path: str): + """ + Read a JP2 file using glymur. This is for compatibility with 16-bit JP2 files. + """ + return glymur.Jp2k(img_path)[:] From d5d33d6e416ffe2d3598cafc1aa25450d3f0675b Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Wed, 19 Mar 2025 17:21:45 -0700 Subject: [PATCH 10/13] Add notes on differences in generated SpatialData object in g4x function --- src/spatialdata_io/readers/g4x.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 3d636606..ea5c147e 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -64,6 +64,14 @@ def g4x( ------- sdatas : Union[SpatialData, list[SpatialData]] A single SpatialData object if processing a single sample directory, otherwise a list of SpatialData objects. 
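+
+    A hypothetical call for each case (paths are made up; the return type depends on the directory passed):
+
+    >>> sdata = g4x("/data/run/A01")  # single sample directory -> SpatialData
+    >>> sdatas = g4x("/data/run")  # run directory -> list of SpatialData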
+ + Notes + ----- + There will be several minor differences between the original data and the generated SpatialData object: + + - cell and nuclear polygons are smoothed when vectorized from label data + - cell and nuclear polygons are offset by `-0.5` pixels relative to the rasterized label data + - transcript table counts correspond exactly to cell labels but not necessarily to cell polygons """ if isinstance(input_path, str): input_path = Path(input_path) From 3dc01c987df79d6de84fab0d8c2bf1bb9a185f97 Mon Sep 17 00:00:00 2001 From: Clarence Mah Date: Mon, 24 Mar 2025 15:32:11 -0700 Subject: [PATCH 11/13] use .v attributes for accessing keys for python3.10 compat --- src/spatialdata_io/_constants/_constants.py | 1 + src/spatialdata_io/readers/g4x.py | 70 ++++++++++----------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index c56d3069..363e6300 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -402,6 +402,7 @@ class VisiumHDKeys(ModeEnum): class G4XKeys(ModeEnum): + """Keys for G4X formatted dataset.""" # H&E HE_DIR = "h_and_e" diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index ea5c147e..14888674 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -216,8 +216,8 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_he( sdata, - he_dir=input_path / G4XKeys.HE_DIR, - pattern=G4XKeys.HE_PATTERN, + he_dir=input_path / G4XKeys.HE_DIR.v, + pattern=G4XKeys.HE_PATTERN.v, mode=mode, ) pbar.update(1) @@ -226,10 +226,10 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_segmentation( sdata, - nuclei_dir=input_path / G4XKeys.SEGMENTATION_DIR, - pattern=G4XKeys.SEGMENTATION_PATTERN, - nuclei_key=G4XKeys.NUCLEI_BOUNDARIES_KEY, - nuclei_exp_key=G4XKeys.CELL_BOUNDARIES_KEY, + nuclei_dir=input_path / G4XKeys.SEGMENTATION_DIR.v, + pattern=G4XKeys.SEGMENTATION_PATTERN.v, + nuclei_key=G4XKeys.NUCLEI_BOUNDARIES_KEY.v, + nuclei_exp_key=G4XKeys.CELL_BOUNDARIES_KEY.v, mode=mode, ) pbar.update(1) @@ -238,8 +238,8 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_protein_images( sdata, - protein_dir=input_path / G4XKeys.PROTEIN_DIR, - pattern=G4XKeys.PROTEIN_PATTERN, + protein_dir=input_path / G4XKeys.PROTEIN_DIR.v, + pattern=G4XKeys.PROTEIN_PATTERN.v, mode=mode, ) pbar.update(1) @@ -248,14 +248,14 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_transcripts( sdata, - transcripts_dir=input_path / G4XKeys.TRANSCRIPTS_DIR, - pattern=G4XKeys.TRANSCRIPTS_PATTERN, + transcripts_dir=input_path / G4XKeys.TRANSCRIPTS_DIR.v, + pattern=G4XKeys.TRANSCRIPTS_PATTERN.v, coordinates={ - "x": G4XKeys.TRANSCRIPTS_COORD_X, - "y": G4XKeys.TRANSCRIPTS_COORD_Y, + "x": G4XKeys.TRANSCRIPTS_COORD_X.v, + "y": G4XKeys.TRANSCRIPTS_COORD_Y.v, }, - feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY, - swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY == "True", + feature_key=G4XKeys.TRANSCRIPTS_FEATURE_KEY.v, + swap_xy=G4XKeys.TRANSCRIPTS_SWAP_XY.v == "True", mode=mode, ) pbar.update(1) @@ -264,7 +264,7 @@ def g4x_sample( pbar.set_description(steps[pbar.n]) _write_table( sdata, - table_path=input_path / G4XKeys.TABLES_DIR / G4XKeys.TABLE_PATTERN, + table_path=input_path / G4XKeys.TABLES_DIR.v / G4XKeys.TABLE_PATTERN.v, mode=mode, ) pbar.update(1) @@ -457,7 +457,7 @@ def _write_segmentation( # Convert labels to polygons using "label" as index and translating xy coordinates to 
(almost) match label pixel coordinates logger.debug("Converting to polygons") - offset = float(G4XKeys.OFFSET) + offset = float(G4XKeys.OFFSET.v) # Nuclei shapes sdata[shapes_nuclei_key] = to_polygons(sdata[nuclei_key]).set_index("label") @@ -528,9 +528,9 @@ def _write_protein_images( return elif mode == "overwrite": logger.debug("Deleting existing protein images") - if G4XKeys.PROTEIN_KEY in sdata: - del sdata[G4XKeys.PROTEIN_KEY] - sdata.delete_element_from_disk(G4XKeys.PROTEIN_KEY) + if G4XKeys.PROTEIN_KEY.v in sdata: + del sdata[G4XKeys.PROTEIN_KEY.v] + sdata.delete_element_from_disk(G4XKeys.PROTEIN_KEY.v) img_list.sort() # Get channel names from filenames @@ -553,12 +553,12 @@ def _write_protein_images( # Create Image2DModel and write logger.debug("Converting to Image2DModel") - sdata[G4XKeys.PROTEIN_KEY] = Image2DModel.parse( + sdata[G4XKeys.PROTEIN_KEY.v] = Image2DModel.parse( protein_stack, c_coords=channel_names, **kwargs ) logger.debug("Writing protein images") - sdata.write_element(G4XKeys.PROTEIN_KEY) + sdata.write_element(G4XKeys.PROTEIN_KEY.v) def _write_transcripts( @@ -601,15 +601,15 @@ def _write_transcripts( logger.debug("Transcripts skipped...") return - if f"points/{G4XKeys.TRANSCRIPTS_KEY}" in sdata.elements_paths_on_disk(): + if f"points/{G4XKeys.TRANSCRIPTS_KEY.v}" in sdata.elements_paths_on_disk(): if mode == "append": logger.debug("Transcripts already exist. Skipping...") return elif mode == "overwrite": logger.debug("Deleting existing transcripts") - if G4XKeys.TRANSCRIPTS_KEY in sdata: - del sdata[G4XKeys.TRANSCRIPTS_KEY] - sdata.delete_element_from_disk(G4XKeys.TRANSCRIPTS_KEY) + if G4XKeys.TRANSCRIPTS_KEY.v in sdata: + del sdata[G4XKeys.TRANSCRIPTS_KEY.v] + sdata.delete_element_from_disk(G4XKeys.TRANSCRIPTS_KEY.v) transcript_dir = Path(transcripts_dir) with tqdm(total=3, desc="Processing transcripts", leave=False) as pbar: @@ -633,7 +633,7 @@ def _write_transcripts( ] pbar.set_description("Converting to PointsModel") - sdata[G4XKeys.TRANSCRIPTS_KEY] = PointsModel.parse( + sdata[G4XKeys.TRANSCRIPTS_KEY.v] = PointsModel.parse( transcripts, coordinates=coordinates, feature_key=feature_key, @@ -641,7 +641,7 @@ def _write_transcripts( pbar.update(1) pbar.set_description("Writing to disk") - sdata.write_element(G4XKeys.TRANSCRIPTS_KEY) + sdata.write_element(G4XKeys.TRANSCRIPTS_KEY.v) pbar.update(1) @@ -674,32 +674,32 @@ def _write_table( logger.debug("Table skipped...") return - if f"tables/{G4XKeys.TABLE_KEY}" in sdata.elements_paths_on_disk(): + if f"tables/{G4XKeys.TABLE_KEY.v}" in sdata.elements_paths_on_disk(): if mode == "append": logger.debug("Table already exists. 
Skipping...") return elif mode == "overwrite": logger.debug("Deleting existing table") - if G4XKeys.TABLE_KEY in sdata: - del sdata[G4XKeys.TABLE_KEY] - sdata.delete_element_from_disk(G4XKeys.TABLE_KEY) + if G4XKeys.TABLE_KEY.v in sdata: + del sdata[G4XKeys.TABLE_KEY.v] + sdata.delete_element_from_disk(G4XKeys.TABLE_KEY.v) adata = read_h5ad(table_path) # Link table annotations to cell shapes - shape_key = f"{G4XKeys.CELL_BOUNDARIES_KEY}_shapes" + shape_key = f"{G4XKeys.CELL_BOUNDARIES_KEY.v}_shapes" adata.obs["region"] = shape_key adata.obs["label"] = adata.obs["cell_id"].str.split("-").str[1] - sdata[G4XKeys.TABLE_KEY] = TableModel.parse(adata) + sdata[G4XKeys.TABLE_KEY.v] = TableModel.parse(adata) sdata.set_table_annotates_spatialelement( - G4XKeys.TABLE_KEY, + G4XKeys.TABLE_KEY.v, region=shape_key, region_key="region", instance_key="label", ) logger.debug("Writing table to disk") - sdata.write_element(G4XKeys.TABLE_KEY) + sdata.write_element(G4XKeys.TABLE_KEY.v) def _deep_update(base_dict, update_dict): From b9b186500bbc201e85cd14a238fa501fa7d38df1 Mon Sep 17 00:00:00 2001 From: xyi Date: Thu, 24 Apr 2025 12:36:03 -0700 Subject: [PATCH 12/13] raise max images size and manually set chunk size --- src/spatialdata_io/readers/g4x.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 14888674..9d9b6d94 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -7,6 +7,7 @@ import dask.dataframe as dd import numpy as np import glymur +import PIL from anndata.io import read_h5ad from dask.array.image import imread from spatialdata import SpatialData, to_polygons @@ -22,9 +23,12 @@ from spatialdata_io._constants._constants import G4XKeys from spatialdata_io._docs import inject_docs +PIL.Image.MAX_IMAGE_PIXELS = 500000000 + __all__ = ["g4x"] + @inject_docs(xx=G4XKeys) def g4x( input_path: str | Path, @@ -355,7 +359,7 @@ def _write_he( kwargs["scale_factors"] = ( [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] ) - kwargs["chunks"] = "auto" if "chunks" not in kwargs else kwargs["chunks"] + kwargs["chunks"] = [1, 1024, 1024] if "chunks" not in kwargs else kwargs["chunks"] sdata[img_key] = Image2DModel.parse(img, **kwargs) logger.debug(f"Writing Image2DModel for {img_key}") sdata.write_element(img_key) From f670424f08d73a2a48d00b0ab4f993ff236732a3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Apr 2025 19:36:31 +0000 Subject: [PATCH 13/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/g4x.py | 73 +++++++------------------------ 1 file changed, 17 insertions(+), 56 deletions(-) diff --git a/src/spatialdata_io/readers/g4x.py b/src/spatialdata_io/readers/g4x.py index 9d9b6d94..c61759ca 100644 --- a/src/spatialdata_io/readers/g4x.py +++ b/src/spatialdata_io/readers/g4x.py @@ -2,11 +2,10 @@ import re from pathlib import Path -from typing import Union import dask.dataframe as dd -import numpy as np import glymur +import numpy as np import PIL from anndata.io import read_h5ad from dask.array.image import imread @@ -28,7 +27,6 @@ __all__ = ["g4x"] - @inject_docs(xx=G4XKeys) def g4x( input_path: str | Path, @@ -83,25 +81,15 @@ def g4x( output_path = Path(output_path) # Determine if input_path is a run directory or a single sample directory - if any( - p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) 
for p in input_path.iterdir() - ): + if any(p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) for p in input_path.iterdir()): # Run directory with multiple samples - sample_input_paths = [ - p - for p in input_path.iterdir() - if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name) - ] + sample_input_paths = [p for p in input_path.iterdir() if p.is_dir() and re.match(r"[A-Z][0-9]{2}", p.name)] logger.debug(f"Found {len(sample_input_paths)} samples.") if output_path is None: - sample_output_paths = [ - input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths - ] + sample_output_paths = [input_path / p.name / f"{p.name}.zarr" for p in sample_input_paths] else: - sample_output_paths = [ - output_path / f"{p.name}.zarr" for p in sample_input_paths - ] + sample_output_paths = [output_path / f"{p.name}.zarr" for p in sample_input_paths] sdatas = [] for sample_input_path, sample_output_path in tqdm( @@ -191,9 +179,7 @@ def g4x_sample( output_zarr_path = Path(output_zarr_path) if output_zarr_path.suffix != ".zarr": logger.error(f"Output path must end with '.zarr'. Got {output_zarr_path}") - raise ValueError( - f"Output path must end with '.zarr'. Got {output_zarr_path}" - ) + raise ValueError(f"Output path must end with '.zarr'. Got {output_zarr_path}") if mode not in ["append", "overwrite"]: msg = f"Invalid mode '{mode}'. Must be one of: 'append', 'overwrite'" @@ -356,9 +342,7 @@ def _write_he( # Create Image2DModel and write logger.debug(f"Creating Image2DModel for {img_key}") kwargs["dims"] = ["c", "y", "x"] if "dims" not in kwargs else kwargs["dims"] - kwargs["scale_factors"] = ( - [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] - ) + kwargs["scale_factors"] = [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] kwargs["chunks"] = [1, 1024, 1024] if "chunks" not in kwargs else kwargs["chunks"] sdata[img_key] = Image2DModel.parse(img, **kwargs) logger.debug(f"Writing Image2DModel for {img_key}") @@ -430,9 +414,7 @@ def _write_segmentation( f"shapes/{shapes_nuclei_exp_key}", ] - if mode == "append" and any( - p in sdata.elements_paths_on_disk() for p in elements_paths - ): + if mode == "append" and any(p in sdata.elements_paths_on_disk() for p in elements_paths): logger.debug("Segmentation already exist. 
Skipping...") return elif mode == "overwrite": @@ -440,10 +422,7 @@ def _write_segmentation( for el in elements: if el in sdata: del sdata[el] - if ( - f"labels/{el}" in sdata.elements_paths_on_disk() - or f"shapes/{el}" in sdata.elements_paths_on_disk() - ): + if f"labels/{el}" in sdata.elements_paths_on_disk() or f"shapes/{el}" in sdata.elements_paths_on_disk(): sdata.delete_element_from_disk(el) # Load and process segmentation data @@ -465,16 +444,12 @@ def _write_segmentation( # Nuclei shapes sdata[shapes_nuclei_key] = to_polygons(sdata[nuclei_key]).set_index("label") - sdata[shapes_nuclei_key].geometry = sdata[shapes_nuclei_key].translate( - xoff=offset, yoff=offset - ) + sdata[shapes_nuclei_key].geometry = sdata[shapes_nuclei_key].translate(xoff=offset, yoff=offset) sdata[shapes_nuclei_key].index = sdata[shapes_nuclei_key].index.astype(str) # Expanded nuclei shapes sdata[shapes_nuclei_exp_key] = to_polygons(sdata[nuclei_exp_key]).set_index("label") - sdata[shapes_nuclei_exp_key].geometry = sdata[shapes_nuclei_exp_key].translate( - xoff=offset, yoff=offset - ) + sdata[shapes_nuclei_exp_key].geometry = sdata[shapes_nuclei_exp_key].translate(xoff=offset, yoff=offset) sdata[shapes_nuclei_exp_key].index = sdata[shapes_nuclei_exp_key].index.astype(str) logger.debug("Writing elements") @@ -519,9 +494,7 @@ def _write_protein_images( img_list.sort() if not img_list: - logger.warning( - f"No protein images found matching pattern '{pattern}' in {protein_dir}" - ) + logger.warning(f"No protein images found matching pattern '{pattern}' in {protein_dir}") return logger.debug(f"Found {len(img_list)} protein images") @@ -546,20 +519,14 @@ def _write_protein_images( logger.debug(f"Images shape: {protein_stack.shape}") kwargs["dims"] = ["c", "y", "x"] if "dims" not in kwargs else kwargs["dims"] - kwargs["scale_factors"] = ( - [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] - ) + kwargs["scale_factors"] = [2, 2, 2] if "scale_factors" not in kwargs else kwargs["scale_factors"] kwargs["chunks"] = ( - [1, protein_stack.shape[-2], protein_stack.shape[-1]] - if "chunks" not in kwargs - else kwargs["chunks"] + [1, protein_stack.shape[-2], protein_stack.shape[-1]] if "chunks" not in kwargs else kwargs["chunks"] ) # Create Image2DModel and write logger.debug("Converting to Image2DModel") - sdata[G4XKeys.PROTEIN_KEY.v] = Image2DModel.parse( - protein_stack, c_coords=channel_names, **kwargs - ) + sdata[G4XKeys.PROTEIN_KEY.v] = Image2DModel.parse(protein_stack, c_coords=channel_names, **kwargs) logger.debug("Writing protein images") sdata.write_element(G4XKeys.PROTEIN_KEY.v) @@ -632,9 +599,7 @@ def _write_transcripts( logger.debug(f"swap_xy: {swap_xy}, {type(swap_xy)}") if swap_xy: logger.debug("Swapping x and y coordinates") - transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[ - [coordinates["y"], coordinates["x"]] - ] + transcripts[[coordinates["x"], coordinates["y"]]] = transcripts[[coordinates["y"], coordinates["x"]]] pbar.set_description("Converting to PointsModel") sdata[G4XKeys.TRANSCRIPTS_KEY.v] = PointsModel.parse( @@ -711,11 +676,7 @@ def _deep_update(base_dict, update_dict): Recursively update a dictionary with another dictionary. """ for key, value in update_dict.items(): - if ( - isinstance(value, dict) - and key in base_dict - and isinstance(base_dict[key], dict) - ): + if isinstance(value, dict) and key in base_dict and isinstance(base_dict[key], dict): _deep_update(base_dict[key], value) else: base_dict[key] = value