From 0a54d7750134372da70db8569fd0f978625be287 Mon Sep 17 00:00:00 2001 From: julienmortier Date: Mon, 27 Oct 2025 09:51:38 +0100 Subject: [PATCH 1/5] added region_annotations --- src/harpy/shape/__init__.pyi | 6 +- src/harpy/shape/_filters.py | 349 ++++++++++++++ src/harpy/shape/_shape.py | 119 ++++- src/harpy/table/__init__.pyi | 77 ++-- src/harpy/table/_region_annotations.py | 612 +++++++++++++++++++++++++ src/harpy/table/_table.py | 234 +++++++++- 6 files changed, 1344 insertions(+), 53 deletions(-) create mode 100755 src/harpy/shape/_filters.py mode change 100644 => 100755 src/harpy/table/__init__.pyi create mode 100755 src/harpy/table/_region_annotations.py diff --git a/src/harpy/shape/__init__.pyi b/src/harpy/shape/__init__.pyi index 99ffd363..fbf27a9f 100644 --- a/src/harpy/shape/__init__.pyi +++ b/src/harpy/shape/__init__.pyi @@ -1,5 +1,6 @@ from ._cell_expansion import create_voronoi_boundaries -from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, vectorize +from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, vectorize, prep_region_annotations +from ._filters import filter_by_morphology, filter_by_shapes __all__ = [ "add_shapes_layer", @@ -7,4 +8,7 @@ __all__ = [ "filter_shapes_layer", "intersect_rectangles", "vectorize", + "prep_region_annotations", + "filter_by_morphology", + "filter_by_shapes", ] diff --git a/src/harpy/shape/_filters.py b/src/harpy/shape/_filters.py new file mode 100755 index 00000000..f82dadcf --- /dev/null +++ b/src/harpy/shape/_filters.py @@ -0,0 +1,349 @@ +from typing import Optional, Dict, Union, List +from spatialdata import SpatialData +import geopandas as gpd +import pandas as pd +import numpy as np +from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString +from harpy.shape._shape import add_shapes_layer + +def filter_by_morphology( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + filters: Optional[Dict[str, tuple]] = None, + keep_unsupported: bool = False, + calculate_all_features: bool = False, + calculate_all_features_grouped: bool = False, + shape_names_column: str = "name", + pixel_size_um: float = 1.0, + overwrite: bool = True, +): + """ + Filter polygons in a SpatialData shapes layer based on morphological features. + It is recommended to run `hp.sh.prep_region_annotations` first when filtering region annotations that + may contain multipolygons, unnamed annotations, etc. + + Parameters + ---------- + sdata + SpatialData object containing the input shapes layer. + shapes_layer + Name of the shapes layer in `sdata.shapes` containing polygons. + output_shapes_layer + Name of the shapes layer to store the filtered polygons. + filters + Dictionary specifying filtering thresholds. + Each entry should be of the form: + `{ "feature_name": (min_value, max_value) }` + where `None` can be used to skip a bound, e.g.: + `{ "area": (50, 200), "convexity": (None, 0.8) }` + + Supported features: + - `"area"`: Area of the polygon (px²). + → Use to filter out very small or very large polygons. + - `"perimeter"`: Perimeter length (px). + → Use to detect irregular boundaries or fragmented shapes. + - `"circularity"`: 4π * area / perimeter². + → 1 for a perfect circle; lower = irregular shape. + - `"compactness"`: perimeter² / area. + → Shape compactness measure. + - `"convex_area"`: Area of convex hull (px²). + → Useful with solidity and convexity. + - `"solidity"`: area / convex_area. + → Low values mean concave or fragmented shapes. + - `"convexity"`: convex_perimeter / perimeter. + → 1 for perfectly convex shapes. Lower for rough or spiky boundaries. + - `"centroid_dif"`: Distance between polygon and convex hull centroids normalized by the polygon area. + → Captures off-centered concavity or asymmetry. + - `"major_axis_length"`: Length of the longest side of the minimum rotated bounding box (px). + → Use to filter very long or very short shapes. + - `"minor_axis_length"`: Length of the shortest side of the minimum rotated bounding box (px). + → Use to filter very wide or very skinny shapes. + - `"major_minor_axis_ratio"`: major_axis_length / minor_axis_length. + → High values indicate elongated polygons; ~1 for round shapes. + + You can use grouped filters by suffixing features with `-grouped`. + Grouped filters merge polygons sharing the same name in `shape_names_column` before computing morphological features and + can be used interchangibly with regular filters. + + keep_unsupported + Only Polygon and MultiPolygon are supported. Set keep_unsupported to True to skip any unssupported geometries types (e.g. Point), but + keep them in the output_shapes_layer. Set to False to remove them from output_shapes_layer. + calculate_all_features + If True, computes all supported morphological features regardless of which ones are used in `filters` and saves them to `output_shapes_layer`. + If False, only computes and saves morphological features needed for `filters`. + calculate_all_features_grouped + If True, computes all morphological features for merged group geometries (by {shape_names_column}), regardless + of which ones are used in grouped filters. + shape_names_column + Column name in shapes layer containing geometry names. Required when using grouped filters. + + pixel_size_um + Scale factor to convert geometric measurements from pixel units to microns. + - Applied to: + * "area" and "convex_area" → scaled by (pixel_size_um)² + * "perimeter", "major_axis_length", and "minor_axis_length" → scaled by (pixel_size_um) + - Dimensionless ratios (e.g., "circularity", "compactness", "solidity", "convexity", "major_minor_axis_ratio") + are unaffected by scaling but are computed using the scaled geometric quantities. + Defaults to 1.0 (no scaling, i.e. units remain in pixels). + Note that this affects the min/max values that need to be specified in `filters`. + overwrite + Whether to overwrite an existing shapes layer. + + Returns + ------- + SpatialData object with updated shapes layer containing only the filtered polygons. + """ + + # Get filters and split into individual and grouped filters + filters = filters or {} + + grouped_filters = {k.replace("-grouped", ""): v for k, v in filters.items() if k.endswith("-grouped")} + individual_filters = {k: v for k, v in filters.items() if not k.endswith("-grouped")} + + required_individual = set(individual_filters.keys()) + required_grouped = set(grouped_filters.keys()) + + if calculate_all_features: + required_individual.update([ + "area", "perimeter", "convex_area", + "circularity", "compactness", "solidity", + "convexity", "centroid_dif", + "major_axis_length", "minor_axis_length", "major_minor_axis_ratio" + ]) + print("Calculating all supported morphological features (per polygon).") + + if calculate_all_features_grouped: + required_grouped.update([ + "area", "perimeter", "convex_area", + "circularity", "compactness", "solidity", + "convexity", "centroid_dif", + "major_axis_length", "minor_axis_length", "major_minor_axis_ratio" + ]) + print("Calculating all supported morphological features (grouped).") + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Filter out geometries that are not Polygon or MultiPolygon + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) + + unssuported = len(gdf) - supported_mask.sum() + if unssuported > 0: + print(f"Found {unssuported} non-polygon geometries in {shapes_layer}.") + if unssuported == len(gdf): + print("No supported geometries (Polygon, MultiPolygon) found. Exiting...") + return sdata + + if keep_unsupported: + gdf_skipped = gdf[~supported_mask].copy() + else: + gdf_skipped = gpd.GeoDataFrame(columns=gdf.columns, geometry=[]) + + gdf = gdf[supported_mask].copy() + + # Dissolve polygons by name + grouped_gdf = None + if grouped_filters or calculate_all_features_grouped: + if shape_names_column not in gdf.columns: + raise ValueError("Grouped filters require a 'name' column in the shapes layer.") + + grouped_gdf = gdf.dissolve(by=shape_names_column, as_index=False, aggfunc="first").copy() + grouped_gdf["geometry"] = gdf.groupby(shape_names_column).geometry.apply(lambda x: x.unary_union).values + print(f"Polygons merged by {shape_names_column}, resulting in {len(grouped_gdf)} groups.") + + # Compute metrics + def _compute_morphological_features(gdf, required, pixel_size_um, suffix): + if any(filter in required for filter in ["area", "circularity", "compactness", "solidity", "centroid_dif"]): + gdf[f"area{suffix}"] = gdf.geometry.area * pixel_size_um**2 + + if any(filter in required for filter in ["perimeter", "circularity", "compactness", "convexity"]): + gdf[f"perimeter{suffix}"] = gdf.geometry.length * pixel_size_um + + if any(filter in required for filter in ["convex_area", "solidity"]): + gdf[f"convex_area{suffix}"] = gdf.geometry.convex_hull.area * pixel_size_um**2 + + if "circularity" in required: + gdf[f"circularity{suffix}"] = 4 * np.pi * gdf[f"area{suffix}"] / (gdf[f"perimeter{suffix}"] ** 2) + + if "compactness" in required: + gdf[f"compactness{suffix}"] = (gdf[f"perimeter{suffix}"] ** 2) / gdf[f"area{suffix}"] + + if "solidity" in required: + gdf[f"solidity{suffix}"] = gdf[f"area{suffix}"] / gdf[f"convex_area{suffix}"] + + if "centroid_dif" in required: + gdf[f"centroid_x{suffix}"] = gdf.geometry.centroid.x + gdf[f"centroid_y{suffix}"] = gdf.geometry.centroid.y + hull_centroids = gdf.geometry.convex_hull.centroid + gdf[f"centroid_dif{suffix}"] = np.sqrt( + (gdf[f"centroid_x{suffix}"] - hull_centroids.x) ** 2 + + (gdf[f"centroid_y{suffix}"] - hull_centroids.y) ** 2 + ) / np.sqrt(gdf[f"area{suffix}"]) + + if "convexity" in required: + gdf[f"convexity{suffix}"] = (gdf.geometry.convex_hull.length * pixel_size_um) / gdf[f"perimeter{suffix}"] + + # Get bounding box lengths from minimum rotated rectangle + if any(filter in required for filter in ["major_axis_length", "minor_axis_length", "major_minor_axis_ratio"]): + def _rotated_rect_axes(geom): + try: + rect = geom.minimum_rotated_rectangle + x, y = rect.exterior.coords.xy + # Compute edge lengths + edges = np.sqrt(np.diff(x)**2 + np.diff(y)**2) + edges = np.sort(edges[:-1]) # drop duplicate closing edge + minor, major = edges[0], edges[1] + ratio = major / minor if minor > 0 else np.nan + return major, minor, ratio + except Exception: + return np.nan, np.nan, np.nan + + results = np.array([_rotated_rect_axes(g) for g in gdf.geometry]) + major_axis_length, minor_axis_length, major_minor_axis_ratio = results.T + if any(filter in required for filter in ["major_axis_length", "major_minor_axis_ratio"]): + gdf[f"major_axis_length{suffix}"] = major_axis_length * pixel_size_um + if any(filter in required for filter in ["minor_axis_length", "major_minor_axis_ratio"]): + gdf[f"minor_axis_length{suffix}"] = minor_axis_length * pixel_size_um + if "major_minor_axis_ratio" in required: + gdf[f"major_minor_axis_ratio{suffix}"] = major_minor_axis_ratio + + return gdf + + gdf = _compute_morphological_features(gdf, required_individual, pixel_size_um, "") + grouped_gdf = _compute_morphological_features(grouped_gdf, required_grouped, pixel_size_um, "-grouped") + + # Merge grouped metrics back into main gdf + if grouped_gdf is not None and not grouped_gdf.empty: + grouped_metrics = [col for col in grouped_gdf.columns if col.endswith("-grouped")] + gdf = gdf.merge( + grouped_gdf[[shape_names_column] + grouped_metrics], + on=shape_names_column, + how="left" + ) + + # Apply filters + print(f"Applying morphological filter(s) on {len(gdf)} polygons...") + mask = np.ones(len(gdf), dtype=bool) + if filters: + for feature, (min_val, max_val) in filters.items(): + print(f"\nFiltering by '{feature}': {min_val} ≤ value ≤ {max_val}") + if feature not in gdf.columns: + raise KeyError(f"Feature '{feature}' was not computed. Check your spelling or supported feature list.") + + # Apply lower bound + if min_val is not None: + to_remove = (gdf[feature] < min_val) & mask + removed_low = to_remove.sum() + mask &= gdf[feature] >= min_val + print(f" - Removed {removed_low} polygons with {feature} ≤ {min_val}") + + # Apply upper bound + if max_val is not None: + to_remove = (gdf[feature] > max_val) & mask + removed_high = to_remove.sum() + mask &= gdf[feature] <= max_val + print(f" - Removed {removed_high} polygons with {feature} ≥ {max_val}") + + remaining = mask.sum() + print(f" → Remaining after filtering '{feature}': {remaining} polygons") + + + filtered_gdf = gdf[mask] + print(f"\nKept {len(filtered_gdf)} / {len(gdf)} polygons after morphological filters.") + + # Add filtered shapes layer + input_gdf = pd.concat([filtered_gdf, gdf_skipped], ignore_index=False) + sdata = add_shapes_layer( + sdata, + input=input_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata + +def filter_by_shapes( + sdata: SpatialData, + target_shapes_layer: str, + mask_shapes_layer: str, + output_shapes_layer: str, + shape_names_column: Optional[str] = None, + shape_names: Optional[Union[str, List[str]]] = None, + keep_intersecting: bool = True, + overwrite: bool = False, +): + """ + Filter polygons in a target shapes layer (typically containg segmention boundaries) based on intersection with polygons in a mask layer + (typically containing region annotations). + + Parameters + ---------- + sdata + SpatialData object containing the target and mask shapes layers. + target_shapes_layer + Name of shapes layer whose polygons will be filtered. + mask_shapes_layer + Name of shapes layer used as mask for filtering. + output_shapes_layer + Name of the output shapes layer to store filtered polygons. + shape_names_column + Optional column in mask_shapes_layer to select specific polygons. + shape_names + Name or list of names of polygons in mask_shapes_layer to use for filtering. Ignored if shape_names_column is None. + keep_intersecting + If True, keeps polygons that intersect the mask. + If False, removes polygons that intersect the mask. + overwrite + If True, overwrites the output shapes layer if it exists. + + Returns + ------- + SpatialData object with filtered shapes layer. + """ + + # Copy target layer + target_gdf = sdata.shapes[target_shapes_layer].copy() + + # Copy mask layer + mask_gdf = sdata.shapes[mask_shapes_layer].copy() + + # Optionally select subset of mask polygons + if shape_names_column is not None and shape_names is not None: + if shape_names_column not in mask_gdf.columns: + raise ValueError(f"Column '{shape_names_column}' not found in mask layer '{mask_shapes_layer}'.") + if isinstance(shape_names, str): + shape_names = [shape_names] + mask_gdf = mask_gdf[mask_gdf[shape_names_column].isin(shape_names)].copy() + if mask_gdf.empty: + raise ValueError(f"No geometries found in '{mask_shapes_layer}' matching {shape_names}.") + + # Build mask union + mask_union = mask_gdf.unary_union + + # Compute intersection boolean + target_gdf["intersects_mask"] = target_gdf.geometry.intersects(mask_union) + + # Filter depending on mode + if keep_intersecting: + filtered_gdf = target_gdf[target_gdf["intersects_mask"]].copy() + else: + filtered_gdf = target_gdf[~target_gdf["intersects_mask"]].copy() + + filtered_gdf.drop(columns=["intersects_mask"], inplace=True) + removed = len(target_gdf) - len(filtered_gdf) + + if keep_intersecting: + print(f"Kept {len(filtered_gdf)} / {len(target_gdf)} geometries intersecting '{mask_shapes_layer}' (removed {removed}).") + else: + print(f"Removed {removed} / {len(target_gdf)} geometries intersecting '{mask_shapes_layer}' (kept {len(filtered_gdf)}).") + + # Add to SpatialData + sdata = add_shapes_layer( + sdata, + input=filtered_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata diff --git a/src/harpy/shape/_shape.py b/src/harpy/shape/_shape.py index c5473db8..c7f29f7f 100644 --- a/src/harpy/shape/_shape.py +++ b/src/harpy/shape/_shape.py @@ -2,7 +2,7 @@ from dask.array import Array from geopandas import GeoDataFrame -from shapely.geometry import GeometryCollection, MultiPolygon, Polygon +from shapely.geometry import GeometryCollection, MultiPolygon, Polygon, Point, MultiPoint, LineString, MultiLineString from spatialdata import SpatialData from spatialdata.models._utils import MappingToCoordinateSystem_t from spatialdata.transformations import get_transformation @@ -10,6 +10,7 @@ from harpy.image._image import _get_spatial_element from harpy.shape._manager import ShapesLayerManager +import numpy as np def vectorize( sdata, @@ -179,3 +180,119 @@ def intersect_rectangles(rect1: list[int | float], rect2: list[int | float]) -> return [x_min, x_max, y_min, y_max] else: return None + + +def prep_region_annotations( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + shape_names_column: str = "name", + unnamed: str = "unnamed", + unique_shape_names_column: str = "name-unique", + erosion: float = 0.5, + overwrite: bool = False, +): + """ + Prepares region annotations in a shapes layer for `hp.sh.filter_by_morphology`, `hp.tb.assign_cells_to_shapes` and `hp.tb.compute_distance_to_shapes`. + Operations performed: + - Ensures a shape name column exists and fills missing names. + - Converts Points with a `radius` column into circular polygons. Points without a `radius` column will be preserved as Points. + - Slightly erodes polygons to avoid shared edges. + - Explodes multipolygons into separate single polygons. + - Generates unique names for shapes with duplicate base names. + + Parameters + ---------- + sdata + The SpatialData object containing the input shapes layer. + shapes_layer + The shapes layer in `sdata.shapes` to use as input. + output_shapes_layer + The output shapes layer in `sdata.tables` to which the updated shapes layer will be written. + shape_names_column + Column name in shapes layer containing geometry names. If not present, new names will be generated. + unnamed + Name to be assigned to any unnamed geometries in `shape_names_column`. Defaults to 'unnamed'. + unique_shape_names_column + Column name in which unique names will be created for single polygons by appending a counter to the original name in `shape_names_column` for polygons with the same name. Note + that multipolygons will be split in individual polygons and each will get a unique name based on the original name of the multipolygon. Unique names will be stored in + `{shape_names_column}-unique` in the updated shapes layer. + erosion + Number of pixels to erode polygons by. This can avoid problems with overlapping edges of geometries when calculating distances. Default is 0.5 (i.e. erosion by 0.5 pixels). + overwrite + If True, overwrites the `output_shapes_layer` if it already exists in `sdata`. + + Returns + ------- + Modified `sdata` object with updated updated shapes layer. + """ + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + print(f"Found {len(gdf)} geometries in {shapes_layer}.") + + # Ensure shape_names_column exist + if shape_names_column not in gdf.columns: + gdf[shape_names_column] = unnamed + + gdf[shape_names_column] = gdf[shape_names_column].fillna("").astype(str) + unnamed_mask = gdf[shape_names_column] == "" + for i, idx in enumerate(gdf[unnamed_mask].index): + gdf.at[idx, shape_names_column] = unnamed + + # Convert Points with a radius column to circular polygons + def _point_to_circle(geom, radius=None): + if isinstance(geom, Point) and radius is not None: + return geom.buffer(radius, resolution=16) + return geom + + if "radius" in gdf.columns: + mask_points_with_radius = gdf.geometry.geom_type.eq("Point") & gdf["radius"].notna() + n_converted = mask_points_with_radius.sum() + + if n_converted > 0: + print(f"Converting {n_converted} Point geometries with 'radius' to circular polygons.") + gdf.loc[mask_points_with_radius, "geometry"] = gdf.loc[mask_points_with_radius].apply( + lambda row: _point_to_circle(row.geometry, getattr(row, "radius", None)), + axis=1 + ) + + # Slightly erode all polygons (this avoids any shared borders between polygons) + gdf["geometry"] = gdf.geometry.apply( + lambda geom: geom.buffer( + -erosion, + join_style=2, + resolution=16) + if geom.geom_type in ["Polygon", "MultiPolygon"] else geom + ) + polygon_mask = ~gdf.geometry.is_empty + removed = len(gdf) - polygon_mask.sum() + gdf = gdf[polygon_mask] # drop any polygons that collapsed to empty + if removed > 0: + print(f"Removed {removed} polygons that collapsed to empty after erosion.") + + # Explode multipolygons into single polygons (this allows us to treat multipolygons as unique polygons) + n_multipolygons = gdf.geometry.apply(lambda g: g.geom_type == "MultiPolygon").sum() + gdf = gdf.explode(index_parts=False).reset_index(drop=True) + n_after = len(gdf) + print(f"Split {n_multipolygons} multipolygons into individual polygons. Total number of geometries after splitting multipolgons is {n_after}.") + + # Create unique names + sizes = gdf.groupby(shape_names_column)[shape_names_column].transform("size") + counter = gdf.groupby(shape_names_column).cumcount() + 1 + gdf[unique_shape_names_column] = np.where( + sizes.eq(1), + gdf[shape_names_column], + gdf[shape_names_column] + counter.astype(str) + ) + + # Add filtered shapes layer + sdata = add_shapes_layer( + sdata, + input=gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata + diff --git a/src/harpy/table/__init__.pyi b/src/harpy/table/__init__.pyi old mode 100644 new mode 100755 index bd9b5837..a060960c --- a/src/harpy/table/__init__.pyi +++ b/src/harpy/table/__init__.pyi @@ -1,36 +1,41 @@ -from ._allocation import allocate, bin_counts -from ._allocation_intensity import allocate_intensity -from ._annotation import cluster_cleanliness, score_genes, score_genes_iter -from ._clustering import kmeans, leiden -from ._enrichment import nhood_enrichment -from ._preprocess import preprocess_proteomics, preprocess_transcriptomics -from ._regionprops import add_regionprop_features -from ._table import add_table_layer, correct_marker_genes, filter_on_size -from .cell_clustering._clustering import flowsom -from .cell_clustering._preprocess import cell_clustering_preprocess -from .cell_clustering._weighted_channel_expression import weighted_channel_expression -from .pixel_clustering._cluster_intensity import cluster_intensity -from .pixel_clustering._neighbors import spatial_pixel_neighbors - -__all__ = [ - "add_table_layer", - "correct_marker_genes", - "filter_on_size", - "flowsom", - "weighted_channel_expression", - "cell_clustering_preprocess", - "cluster_intensity", - "spatial_pixel_neighbors", - "kmeans", - "leiden", - "nhood_enrichment", - "preprocess_proteomics", - "preprocess_transcriptomics", - "add_regionprop_features", - "cluster_cleanliness", - "score_genes", - "score_genes_iter", - "allocate", - "bin_counts", - "allocate_intensity", -] +from ._allocation import allocate, bin_counts +from ._allocation_intensity import allocate_intensity +from ._annotation import cluster_cleanliness, score_genes, score_genes_iter +from ._clustering import kmeans, leiden +from ._enrichment import nhood_enrichment +from ._preprocess import preprocess_proteomics, preprocess_transcriptomics +from ._regionprops import add_regionprop_features +from ._table import add_table_layer, correct_marker_genes, filter_on_size, filter_numerical, filter_categorical +from .cell_clustering._clustering import flowsom +from .cell_clustering._preprocess import cell_clustering_preprocess +from .cell_clustering._weighted_channel_expression import weighted_channel_expression +from .pixel_clustering._cluster_intensity import cluster_intensity +from .pixel_clustering._neighbors import spatial_pixel_neighbors +from ._region_annotations import assign_cells_to_shapes, compute_distance_to_shapes + +__all__ = [ + "add_table_layer", + "correct_marker_genes", + "filter_on_size", + "filter_numerical", + "filter_categorical", + "flowsom", + "weighted_channel_expression", + "cell_clustering_preprocess", + "cluster_intensity", + "spatial_pixel_neighbors", + "kmeans", + "leiden", + "nhood_enrichment", + "preprocess_proteomics", + "preprocess_transcriptomics", + "add_regionprop_features", + "cluster_cleanliness", + "score_genes", + "score_genes_iter", + "allocate", + "bin_counts", + "allocate_intensity", + "assign_cells_to_shapes", + "compute_distance_to_shapes", +] diff --git a/src/harpy/table/_region_annotations.py b/src/harpy/table/_region_annotations.py new file mode 100755 index 00000000..b463de11 --- /dev/null +++ b/src/harpy/table/_region_annotations.py @@ -0,0 +1,612 @@ +from spatialdata import SpatialData +import geopandas as gpd +import pandas as pd +from typing import Literal +from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString +from harpy.table._table import add_table_layer +from harpy.utils._keys import _REGION_KEY, _INSTANCE_KEY + +def assign_cells_to_shapes( + sdata: SpatialData, + shapes_layer: str, + table_layer: str, + output_table_layer: str, + shape_names_column: str = "name", + unique_shape_names_column: str = "name-unique", + output_column: str = None, + mode: Literal["original_names", "unique_names", "both"] = "original_names", + create_column_per_shape: bool = False, + overlap_tolerance: float = 0.1, + spatial_key: str = 'spatial', + xy_columns: tuple = None, + overwrite: bool = False, +): + """ + Assign cells to polygons in a shapes layer and update the sdata table layer. It is recommended to run `hp.sh.prep_region_annotations` first. + + Parameters + ---------- + sdata + The SpatialData object containing the input table layer and shapes layer. + shapes_layer + The shapes layer in `sdata.shapes` to use as input. + table_layer + The table layer in `sdata.tables` to use as input. + output_table_layer + The output table layer in `sdata.tables` to which the updated table layer will be written. + shape_names_column + Column name in shapes layer containing geometry names. + unique_shape_names_column + Column name in shapes layer containing unique geometry names. + output_column + Name of the output column in `sdata.tables[table_layer].obs` to store the shape name of the geometries a cell was found in (if `create_column_per_shape` is False). + For the `unique_names` mode, the output will be stored in `{output_column}-unique` in `sdata.tables[table_layer].obs` (if `create_column_per_shape` is False). + If create_column_per_shape is True and output_column is not None, then column names will be in the format `{output_colum}-{shape_name}`. + If create_column_per_shape is True and output_column is None, then column names will be in the format `{shape_name}`. + mode + When set to `original_names`, original polygon names from `shape_names_column` will be used. + When set to `unique_names`, unique polygon names from `unique_shape_names_column` will be used. Use `both`, to run both modes at the same time. + create_column_per_shape + If True, create one column (named according to the shape names) per shape indicating whether a cell is located inside it. + overlap_tolerance + Tolerance for detecting overlapping polygons (area units of geometry CRS). + spatial_key + Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. + xy_columns + Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + overwrite + If True, overwrites the `output_table_layer` and/or `output_shapes_layer` if it already exists in `sdata`. + + Notes + ------- + - Only `Polygon` and `MultiPolygon` geometries are supported. Non-polygon geometries (e.g., `Point`, `LineString`) are skipped. + + Returns + ------- + Modified `sdata` object with updated table layer. + """ + + if not output_column and not create_column_per_shape: + raise ValueError("Specify `output_column` or set `create_column_per_shape=True`.") + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Create copy of table layer + adata = sdata.tables[table_layer].copy() + + # Filter out geometries that are not Polygon or MultiPolygon + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) + skipped = len(gdf) - supported_mask.sum() + if skipped > 0: + print(f"Skipped {skipped} non-polygon geometries in {shapes_layer}.") + if skipped == len(gdf): + print("No supported geometries (Polygon, MultiPolygon) found.") + return sdata + + gdf = gdf[supported_mask].copy().reset_index(drop=True) + + # Check for overlapping polygons + total_area = gdf.geometry.area.sum() + union_area = gdf.geometry.unary_union.area + if total_area - union_area > overlap_tolerance and not create_column_per_shape: + raise ValueError(f"Overlapping polygons detected in {shapes_layer}. Correct polygons or use create_column_per_shape.") + elif total_area - union_area > 0 and not create_column_per_shape: + print(f"Overlaps detected (Δ={total_area - union_area:.3f}), below tolerance threshold {overlap_tolerance}.") + + # Get cell coordinates + if xy_columns is not None: + x_col, y_col = xy_columns + coords = adata.obs[[x_col, y_col]].to_numpy() + else: + if spatial_key not in adata.obsm: + raise KeyError( + f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + coords = adata.obsm[spatial_key] + if coords.shape[1] != 2: + raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") + + # Function to assign cell to a polygon + def _assign_region(x, y, gdf, column, sindex): + point = Point(x, y) + + # Quick bounding box search to prefilter the cells + candidate_idx = list(sindex.intersection(point.bounds)) + if not candidate_idx: + return None + + # Slower point‑in‑polygon search + candidate_gdf = gdf.iloc[candidate_idx] + match = candidate_gdf[candidate_gdf.contains(point)] + + if match.empty: + return None + + return match[column].iloc[0] + + # Assign cells to polygons + if create_column_per_shape: + # Collect all relevant name columns based on mode + name_columns = [] + if mode in ("original_names", "both"): + name_columns.append(shape_names_column) + if mode in ("unique_names", "both"): + name_columns.append(unique_shape_names_column) + + # Collect all unique names across both columns + name_to_column = {} + for name_column in name_columns: + for name in gdf[name_column].dropna().astype(str).unique(): + if name not in name_to_column: + name_to_column[name] = name_column + + # Create column per shape name + for shape_name, name_column in name_to_column.items(): + col_name = f"{output_column}-{shape_name}" if output_column else shape_name + + gdf_shape = gdf[gdf[name_column] == shape_name] + if gdf_shape.empty: + continue + + sidx_shape = gdf_shape.sindex # Build spatial index + assigned = [ + _assign_region(x, y, gdf_shape, name_column, sidx_shape) for x, y in coords + ] + adata.obs[col_name] = assigned + + else: + # One output column per mode + run_assign_dict = {} + if mode in ("original_names", "both"): + run_assign_dict[output_column] = shape_names_column + if mode in ("unique_names", "both"): + run_assign_dict[f"{output_column}-unique"] = unique_shape_names_column + + for output_column_name, name_column in run_assign_dict.items(): + # Assign one shape name per cell (to be avoided with overlapping regions) + sidx = gdf.sindex # Build spatial index + assigned = [ + _assign_region(x, y, gdf, name_column, sidx) + for x, y in coords + ] + adata.obs[output_column_name] = assigned + + # Add table layer + sdata = add_table_layer( + sdata, + adata = adata, + output_layer = output_table_layer, + region = adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite = overwrite, + ) + + return sdata + +def compute_distance_to_shapes( + sdata: SpatialData, + shapes_layer: str, + table_layer: str, + output_table_layer: str, + shape_names_column: str = "name", + unique_shape_names_column: str = "name-unique", + output_name: str = None, + pixel_size_um: float = 1.0, + modes: list[Literal[ + "nearest_edge", "nearest_edge_grouped", "all_edges", + "nearest_outer_edge", "nearest_outer_edge_grouped", "all_outer_edges", + "nearest_inner_edge", "nearest_inner_edge_grouped", "nearest_inner_edge_grouped_unique", "all_inner_edges", + "nearest_centroid", "nearest_centroid_grouped", "all_centroids", + "nearest_point", "nearest_point_grouped", "all_points" + ]] = ["all_edges"], + spatial_key: str = 'spatial', + xy_columns: tuple = None, + overwrite: bool = False, +): + """ + Compute distances from cells to polygons in a shapes layer and update the sdata table layer. + It is recommended to run `hp.sh.prep_region_annotations` first. + + Parameters + ---------- + sdata + The SpatialData object containing the input table layer and shapes layer. + shapes_layer + The shapes layer in `sdata.shapes` to use as input. + table_layer + The table layer in `sdata.tables` to use as input. + output_table_layer + The output table layer in `sdata.tables` to which the updated table layer will be written. + shape_names_column + Column name in shapes layer containing geometry names. + unique_shape_names_column + Column name in shapes layer containing unique geometry names. + output_name + Prefix for new distance columns in `.obs`. If None, no prefix is added. + pixel_size_um + Scale factor to convert distances to microns. Defaults to 1 (i.e. distances are in pixels). + modes + Which distance features to calculate. Options include: + + Edge distances (For Polygon and MultiPolygon geometries) + - `"nearest_edge"`: Distance to the nearest edge (outer or inner) of any polygon. + Creates `{output_name}-distance_to_nearest_edge` and `{output_name}-name_of_nearest_edge` columns. + - `"nearest_edge_grouped"`: For each group of shapes with the same name, compute the distance to the edge that is nearest. + Creates `{output_name}-distance_to_nearest_edge_of_` and `{output_name}-name_of_nearest_edge_of_` columns. + - `"all_edges"`: For each individual polygon, compute the distance to its edge. + Creates `{output_name}-distance_to_edge_of_` column. + + Outer edge distances (For Polygon and MultiPolygon geometries) + - `"nearest_outer_edge"`: Distance to the nearest outer edge of any polygon. + Creates `{output_name}-distance_to_nearest_outer_edge` and `{output_name}-name_of_nearest_outer_edge` columns. + - `"nearest_outer_edge_grouped"`: For each group of shapes with the same name, compute the distance to the outer edge that is nearest. + Creates `{output_name}-distance_to_nearest_outer_edge_of_` and `{output_name}-name_of_nearest_outer_edge_of_` columns. + - `"all_outer_edges"`: For each individual polygon, compute the distance to its outer edge. + Creates `{output_name}-distance_to_outer_edge_of_` column. + + Inner edge (hole) distances (For Polygon and MultiPolygon geometries) + - `"nearest_inner_edge"`: Distance to the nearest interior edge (“hole”) of any polygon. + Creates `{output_name}-distance_to_nearest_inner_edge` and `{output_name}-name_of_nearest_inner_edge` columns. + - `"nearest_inner_edge_grouped"`: For each group of shapes with the same name, compute the distance to the nearest inner edge of that group. + Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. + - `"nearest_inner_edge_grouped_unique"`: For each individual polygon, compute the distance to the nearest inner edge of that polygon. + Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. + - `"all_inner_edges"`: For all holes, compute the distance to each inner edge. + Creates `{output_name}-distance_to_inner_edge_of_-hole` column. + + Centroid distances (For Polygon and MultiPolygon geometries) + - `"nearest_centroid"`: Distance to the nearest centroid of any polygon. + Creates `{output_name}-distance_to_nearest_centroid` and `{output_name}-name_of_nearest_centroid` columns. + - `"nearest_centroid_grouped"`: For each group of shapes with the same name, compute the distance to the centroid that is nearest. + Creates `{output_name}-distance_to_nearest_centroid_of_` and `{output_name}-name_of_nearest_centroid_of_` columns. + - `"all_centroids"`: For each individual polygon, compute the distance to its centroid. + Creates `{output_name}-distance_to_centroid_of_` column. + + Point distances (For Point geometries) + - `"nearest_point"`: Distance to the nearest point. + Creates `{output_name}-distance_to_nearest_point` and `{output_name}-name_of_nearest_point` columns. + - `"nearest_point_grouped"`: For each group of points with the same name, compute the nearest distance. + Creates `{output_name}-distance_to_nearest_point_of_` and `{output_name}-name_of_nearest_point_of_` columns. + - `"all_points"`: For each individual Point, compute the distance to point coordinates. + Creates `{output_name}-distance_to_point_` column. + + spatial_key + Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. + xy_columns + Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + overwrite + If True, overwrites the `output_table_layer` if it already exists in `sdata`. + + Notes + ------- + - Only `Polygon`, `MultiPolygon` and `Point` geometries are supported. Other geometries (e.g., `LineString`, `MultiPoint`) are skipped. + + Returns + ------- + Modified `sdata` object with updated table layer. + """ + + if output_name is not None: + output_name = f"{output_name}-" + elif output_name is None: + output_name = "" + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Create copy of table layer + adata = sdata.tables[table_layer].copy() + + # Filter out geometries that are not Polygon, MultiPolygon or Point + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon, Point))) + skipped = len(gdf) - supported_mask.sum() + if skipped > 0: + print(f"Skipped {skipped} geometries in {shapes_layer} that are not Polygon, MultiPolygon or Point.") + + gdf = gdf[supported_mask].copy().reset_index(drop=True) + + # Separate gdf by type + gdf_polygons = gdf[gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])].copy() + gdf_points = gdf[gdf.geometry.geom_type == "Point"].copy() + + has_polygons = not gdf_polygons.empty + has_points = not gdf_points.empty + + if not has_polygons and not has_points: + print("No supported geometries (Polygon, MultiPolygon, Point) found.") + return sdata + + # Get cell coordinates + if xy_columns is not None: + x_col, y_col = xy_columns + coords = adata.obs[[x_col, y_col]].to_numpy() + else: + if spatial_key not in adata.obsm: + raise KeyError( + f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + coords = adata.obsm[spatial_key] + if coords.shape[1] != 2: + raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") + + # Build point GeoDataFrame + pts = gpd.GeoSeries( + [Point(x, y) for x, y in coords], + crs=gdf.crs + ) + + pts_gdf = gpd.GeoDataFrame({"geometry": pts}, crs=gdf.crs).set_index(adata.obs.index) + + # Polygon and MultiPolygon + if has_polygons: + # Extract exterior and interior boundaries + def _extract_exterior_lines(geom): + if isinstance(geom, Polygon): + return [geom.exterior] + elif isinstance(geom, MultiPolygon): + return [poly.exterior for poly in geom.geoms] + return [] + + def _extract_interior_lines(geom): + if isinstance(geom, Polygon): + return list(geom.interiors) + elif isinstance(geom, MultiPolygon): + lines = [] + for poly in geom.geoms: + lines.extend(poly.interiors) + return lines + return [] + + names = gdf_polygons[unique_shape_names_column].tolist() + exteriors = [MultiLineString(_extract_exterior_lines(geom)) for geom in gdf_polygons.geometry] + interiors = [MultiLineString(_extract_interior_lines(geom)) for geom in gdf_polygons.geometry] + + ext_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=exteriors, crs=gdf.crs) + int_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=interiors, crs=gdf.crs) + + # Edges (outer and inner) + if "nearest_edge" in modes: + print(f"Calculating 'nearest_edge' distances'") + all_edges_gdf = gpd.GeoDataFrame( + pd.concat([ext_gdf, int_gdf], ignore_index=True), + crs=gdf.crs + ) + joined = gpd.sjoin_nearest( + pts_gdf, + all_edges_gdf, + how="left", distance_col="dist" + ) + adata.obs[f"{output_name}distance_to_nearest_edge"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_edge"] = joined[unique_shape_names_column] + + if "nearest_edge_grouped" in modes: + print(f"Calculating 'nearest_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + edges = [] + labels = [] + for idx, row in group.iterrows(): + edge_lines = _extract_exterior_lines(row.geometry) + _extract_interior_lines(row.geometry) + edges.extend(edge_lines) + labels.extend([row[unique_shape_names_column]] * len(edge_lines)) + + edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_edge_of_{name}"] = joined[unique_shape_names_column] + + if "all_edges" in modes: + print(f"Calculating 'all_edges' distances'") + for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_edge_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.boundary).to_numpy() * pixel_size_um + + # Outer edges + if "nearest_outer_edge" in modes: + print(f"Calculating 'nearest_outer_edge' distances'") + joined = gpd.sjoin_nearest( + pts_gdf, + ext_gdf, + how="left", distance_col="dist" + ) + adata.obs[f"{output_name}distance_to_nearest_outer_edge"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_outer_edge"] = joined[unique_shape_names_column] + + if "nearest_outer_edge_grouped" in modes: + print(f"Calculating 'nearest_outer_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + edges = [] + labels = [] + for idx, row in group.iterrows(): + edge_lines = _extract_exterior_lines(row.geometry) + edges.extend(edge_lines) + labels.extend([row[unique_shape_names_column]] * len(edge_lines)) + + edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_outer_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_outer_edge_of_{name}"] = joined[unique_shape_names_column] + + if "all_outer_edges" in modes: + print(f"Calculating 'all_outer_edges' distances'") + for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_outer_edge_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.exterior).to_numpy() * pixel_size_um + + # Inner edges + hole_geoms = [] + hole_names = [] + + for _, row in gdf_polygons.reset_index(drop=True).iterrows(): + base = row[unique_shape_names_column] + geom = row.geometry + + holes =_extract_interior_lines(geom) + + if len(holes) == 0: + continue + + if len(holes) == 1: + hole = holes[0] + hole_geoms.append(LineString(hole.coords)) + hole_names.append(base) + + else: + for i, hole in enumerate(holes, start=1): + hole_geoms.append(LineString(hole.coords)) + hole_names.append(f"{base}-hole{i}") + + holes_gdf = gpd.GeoDataFrame( + {unique_shape_names_column: hole_names}, + geometry=hole_geoms, + crs=gdf.crs + ) + + if "nearest_inner_edge" in modes and not holes_gdf.empty: + print(f"Calculating 'nearest_inner_edge' distances'") + joined = gpd.sjoin_nearest( + pts_gdf, + holes_gdf, + how="left", + distance_col="dist" + ) + adata.obs[f"{output_name}distance_to_nearest_inner_edge"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_inner_edge"] = joined[unique_shape_names_column] + + if "nearest_inner_edge_grouped" in modes: + print(f"Calculating 'nearest_inner_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + hole_lines = [] + hole_labels = [] + + for idx, row in group.iterrows(): + base = row[unique_shape_names_column] + for i, hole in enumerate(_extract_interior_lines(row.geometry), start=1): + hole_lines.append(LineString(hole.coords)) + hole_labels.append(f"{base}-hole{i}") + + if not hole_lines: + continue + + hole_gdf = gpd.GeoDataFrame({ + "geometry": hole_lines, + "hole_name": hole_labels + }, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{name}"] = joined["hole_name"] + + if "nearest_inner_edge_grouped_unique" in modes: + print(f"Calculating 'nearest_inner_edge_grouped_unique' distances'") + for idx, row in gdf_polygons.iterrows(): + base = row[unique_shape_names_column] + holes = _extract_interior_lines(row.geometry) + + if not holes: + continue + + hole_geoms = [LineString(hole.coords) for hole in holes] + hole_names = [f"{base}-hole{i+1}" for i in range(len(holes))] + + hole_gdf = gpd.GeoDataFrame({ + "geometry": hole_geoms, + "hole_name": hole_names + }, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{base}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{base}"] = joined["hole_name"] + + if "all_inner_edges" in modes and not holes_gdf.empty: + print(f"Calculating 'all_inner_edges' distances'") + for _, hole in holes_gdf.iterrows(): + hole_name = hole[unique_shape_names_column] + col = f"{output_name}distance_to_inner_edge_of_{hole_name}" + adata.obs[col] = pts.distance(hole.geometry).to_numpy() * pixel_size_um + + # Centroids + centroids_df = gdf_polygons.reset_index(drop=True)[[unique_shape_names_column]].copy() + centroids_df["geometry"] = gdf_polygons.geometry.centroid.reset_index(drop=True) + + centroids_gdf = gpd.GeoDataFrame( + centroids_df, + geometry="geometry", + crs=gdf.crs + ) + + if "nearest_centroid" in modes: + print(f"Calculating 'nearest_centroid' distances'") + joined = gpd.sjoin_nearest( + pts_gdf, centroids_gdf, + how="left", + distance_col="dist" + ) + adata.obs.loc[joined.index, f"{output_name}distance_to_nearest_centroid"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs.loc[joined.index, f"{output_name}name_of_nearest_centroid"] = ( + joined[unique_shape_names_column].to_numpy() + ) + + if "nearest_centroid_grouped" in modes: + print(f"Calculating 'nearest_centroid_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + centroids = group.geometry.centroid + labels = group[unique_shape_names_column].tolist() + + centroid_gdf = gpd.GeoDataFrame({ + "geometry": centroids, + unique_shape_names_column: labels + }, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, centroid_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_centroid_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_centroid_of_{name}"] = joined[unique_shape_names_column] + + if "all_centroids" in modes: + print(f"Calculating 'all_centroids' distances'") + for idx, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_centroid_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.centroid).to_numpy() * pixel_size_um + + # Point + if has_points: + if "nearest_point" in modes: + print(f"Calculating 'nearest_point' distances'") + joined = gpd.sjoin_nearest( + pts_gdf, gdf_points, how="left", distance_col="dist" + ) + adata.obs[f"{output_name}distance_to_nearest_point"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_point"] = joined[f"{shape_names_column}-unique"] + + if "nearest_point_grouped" in modes: + print(f"Calculating 'nearest_point_grouped' distances'") + for name, group in gdf_points.groupby(shape_names_column): + joined = gpd.sjoin_nearest(pts_gdf, group, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_point_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_point_of_{name}"] = joined[f"{shape_names_column}-unique"] + + if "all_points" in modes: + print(f"Calculating 'all_points' distances'") + for _, row in gdf_points.iterrows(): + adata.obs[f"{output_name}distance_to_point_{row[unique_shape_names_column]}"] = pts.distance(row.geometry).to_numpy() * pixel_size_um + + + # Add table layer + sdata = add_table_layer( + sdata, + adata = adata, + output_layer = output_table_layer, + region = adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite = overwrite, + ) + + return sdata \ No newline at end of file diff --git a/src/harpy/table/_table.py b/src/harpy/table/_table.py index bc467b02..afa90d5d 100644 --- a/src/harpy/table/_table.py +++ b/src/harpy/table/_table.py @@ -3,9 +3,11 @@ from collections.abc import Iterable import numpy as np +import pandas as pd from anndata import AnnData from spatialdata import SpatialData from spatialdata.models import TableModel +from typing import Optional, Union, Sequence from harpy.shape._shape import filter_shapes_layer from harpy.table._manager import TableLayerManager @@ -223,15 +225,17 @@ def filter_on_size( labels_layer: list[str], table_layer: str, output_layer: str, - min_size: int = 100, - max_size: int = 100000, - update_shapes_layers: bool = True, cellsize_key=_CELLSIZE_KEY, + min_size: float | int | None = None, + max_size: float | int | None = None, + update_shapes_layers: bool = True, + prefix_filtered_shapes_layer: str = "filtered_size", overwrite: bool = False, ) -> SpatialData: """Returns the updated SpatialData object. All cells with a size outside of the min and max size range are removed using the `cellsize_key` in `.obs`. Run e.g. `harpy.tb.preprocess_transcriptomics` or `harpy.tb.preprocess_proteomics` to obtain cell sizes. + Cells are kept if min_size ≤ cellsize_key ≤ max_size. Parameters ---------- @@ -246,16 +250,83 @@ def filter_on_size( The table layer in `sdata`. output_layer The output table layer in `sdata`. + cellsize_key + Column in `sdata.tables[table_layer].obs` containing cell sizes. min_size - minimum size in pixels. + minimum size in pixels. If None, this value is not used for filtering. max_size - maximum size in pixels. + maximum size in pixels. If None, this value is not used for filtering. update_shapes_layers Whether to filter the shapes layers associated with `labels_layer`. If set to `True`, cells that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. - Filtered shapes will be added to `sdata` with prefix 'filtered_size'. - cellsize_key - Column in `sdata.tables[table_layer].obs` containing cell sizes. + Filtered shapes will be added to `sdata` with prefix `prefix_filtered_shapes_layer`. + prefix_filtered_shapes_layer + prefix to use for filtered shapes layer if update_shapes_layers is True. Defaults to 'filtered_size'. + overwrite + If True, overwrites the `output_layer` if it already exists in `sdata`. + + Returns + ------- + The updated SpatialData object. + """ + + sdata = filter_numerical( + sdata = sdata, + labels_layer = labels_layer, + table_layer = table_layer, + output_layer = output_layer, + numerical_column = cellsize_key, + min_value = min_size, + max_value = max_size, + update_shapes_layers = update_shapes_layers, + prefix_filtered_shapes_layer = prefix_filtered_shapes_layer, + overwrite = overwrite, + ) + + return sdata + + +def filter_numerical( + sdata: SpatialData, + labels_layer: list[str], + table_layer: str, + output_layer: str, + numerical_column: str, + min_value: float | int | None = None, + max_value: float | int | None = None, + update_shapes_layers: bool = True, + prefix_filtered_shapes_layer: str = "filtered", + overwrite: bool = False, +) -> SpatialData: + """Returns the updated SpatialData object. + + All cells with a size outside of the min and max size range are removed using the `numerical_column` in `.obs` (cells are kept if min_value ≤ numerical_column ≤ max_value). + + Parameters + ---------- + sdata + The SpatialData object. + labels_layer + The labels layer(s) of `sdata` used to select the cells via the _REGION_KEY in `sdata.tables[table_layer].obs`. + Note that if `output_layer` is equal to `table_layer` and overwrite is True, + cells in `sdata.tables[table_layer]` linked to other `labels_layer` (via the _REGION_KEY), will be removed from `sdata.tables[table_layer]` + (also from the backing zarr store if it is backed). + table_layer + The table layer in `sdata`. + output_layer + The output table layer in `sdata`. + numerical_column + Name of numerical column in `sdata.tables[table_layer].obs` to use for filtering with `min_value` and `max_value`. + min_value + minimum value of `numerical_column`. If None, this value is not used for filtering. + max_value + maximum value of `numerical_column`. If None, this value is not used for filtering. + update_shapes_layers + Whether to filter the shapes layers associated with `labels_layer`. + If set to `True`, cells that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. + Filtered shapes will be added to `sdata` with prefix `prefix_filtered_shapes_layer`. + prefix_filtered_shapes_layer + prefix to use for filtered shapes layer if update_shapes_layers is True. Defaults to 'filtered'. overwrite If True, overwrites the `output_layer` if it already exists in `sdata`. @@ -264,13 +335,141 @@ def filter_on_size( The updated SpatialData object. """ process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) - adata = process_table_instance._get_adata() + adata = process_table_instance._get_adata().copy() + start = adata.shape[0] + + if numerical_column not in adata.obs.columns: + raise ValueError( + f"Column '{numerical_column}' not found in '{table_layer}.obs'. " + ) + + if not np.issubdtype(adata.obs[numerical_column].dtype, np.number): + raise ValueError( + f"Column '{numerical_column}' must be numeric, but dtype is {adata.obs[numerical_column].dtype}." + ) + + # Filter cells based on min and max values + mask = pd.Series(True, index=adata.obs.index) + + if min_value is not None: + below = (adata.obs[numerical_column] < min_value).sum() + log.info(f"Removed {below} cells below {min_value}.") + mask &= adata.obs[numerical_column] >= min_value + if max_value is not None: + above = (adata.obs[numerical_column] > max_value).sum() + log.info(f"Removed {above} cells above {max_value}.") + mask &= adata.obs[numerical_column] <= max_value + + adata = adata[mask, :].copy() + + sdata = add_table_layer( + sdata, + adata=adata, + output_layer=output_layer, + region=process_table_instance.labels_layer, + overwrite=overwrite, + ) + + if update_shapes_layers: + for _labels_layer in process_table_instance.labels_layer: + sdata = filter_shapes_layer( + sdata, + table_layer=output_layer, + labels_layer=_labels_layer, + prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, + ) + + filtered = start - adata.shape[0] + log.info( + f"Removed {filtered} / {start} cells based on '{numerical_column}' (min={min_value}, max={max_value}) and kept {adata.shape[0]}." + ) + + return sdata + + +def filter_categorical( + sdata: SpatialData, + labels_layer: list[str], + table_layer: str, + output_layer: str, + categorical_column: str, + include_values: Optional[Union[str, Sequence[str]]] = None, + exclude_values: Optional[Union[str, Sequence[str]]] = None, + update_shapes_layers: bool = True, + prefix_filtered_shapes_layer: str = "filtered", + overwrite: bool = False, +) -> SpatialData: + """Filter cells based on categorical values in `.obs`. + + Removes or keeps cells based on specific values in a categorical column of + `sdata.tables[table_layer].obs`. + + Parameters + ---------- + sdata + The SpatialData object. + labels_layer + The labels layer(s) of `sdata` used to select the cells via the _REGION_KEY in `sdata.tables[table_layer].obs`. + Note that if `output_layer` is equal to `table_layer` and overwrite is True, + cells in `sdata.tables[table_layer]` linked to other `labels_layer` (via the _REGION_KEY), will be removed from `sdata.tables[table_layer]` + (also from the backing zarr store if it is backed). + table_layer + The table layer in `sdata`. + output_layer + The output table layer in `sdata`. + categorical_column + Name of the categorical column in `.obs` to use for filtering. + include_values + Value(s) to keep. Only cells whose `categorical_column` matches one of these + values will be kept. Mutually exclusive with `exclude_values`. + exclude_values + Value(s) to remove. Cells whose `categorical_column` matches one of these + values will be removed. Mutually exclusive with `include_values`. + update_shapes_layers + Whether to filter the shapes layers associated with `labels_layer`. + If set to `True`, cells that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. + Filtered shapes will be added to `sdata` with prefix `prefix_filtered_shapes_layer`. + prefix_filtered_shapes_layer + prefix to use for filtered shapes layer if update_shapes_layers is True. Defaults to 'filtered'. + overwrite + If True, overwrites the `output_layer` if it already exists in `sdata`. + + Returns + ------- + The updated SpatialData object. + """ + if include_values is not None and exclude_values is not None: + raise ValueError("Specify only one of 'include_values' or 'exclude_values'.") + + process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) + adata = process_table_instance._get_adata().copy() start = adata.shape[0] + + if categorical_column not in adata.obs.columns: + raise ValueError(f"Column '{categorical_column}' not found in '{table_layer}.obs'.") + + # Ensure include/exclude are lists + if isinstance(include_values, str): + include_values = [include_values] + if isinstance(exclude_values, str): + exclude_values = [exclude_values] + + # Filter + mask = pd.Series(True, index=adata.obs.index) - # Filter cells based on size and distance - # need to do the copy because we pop the spatialdata_attrs in add_table_layer, otherwise it would not be updated inplace - adata = adata[adata.obs[cellsize_key] < max_size, :].copy() - adata = adata[adata.obs[cellsize_key] > min_size, :].copy() + if include_values is not None: + kept = adata.obs[categorical_column].isin(include_values).sum() + removed = (~adata.obs[categorical_column].isin(include_values)).sum() + log.info(f"Found {kept} cells in {include_values} to keep.") + mask &= adata.obs[categorical_column].isin(include_values) + + elif exclude_values is not None: + removed = adata.obs[categorical_column].isin(exclude_values).sum() + kept = (~adata.obs[categorical_column].isin(exclude_values)).sum() + log.info(f"Found {removed} cells in {exclude_values} to remove.") + mask &= ~adata.obs[categorical_column].isin(exclude_values) + + adata = adata[mask, :].copy() sdata = add_table_layer( sdata, @@ -286,11 +485,16 @@ def filter_on_size( sdata, table_layer=output_layer, labels_layer=_labels_layer, - prefix_filtered_shapes_layer="filtered_size", + prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, ) filtered = start - adata.shape[0] - log.info(f"{filtered} cells were filtered out based on size.") + log.info( + f"Removed {filtered} / {start} cells based on '{categorical_column}' " + f"({'included' if include_values is not None else 'excluded'}: " + f"{include_values if include_values is not None else exclude_values}) " + f"and kept {adata.shape[0]}." + ) return sdata From 7d7b79dbac2e49215f7ec7f25c695dd0fd0443e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 10:39:05 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/harpy/shape/__init__.pyi | 2 +- src/harpy/shape/_filters.py | 716 +++++++------- src/harpy/shape/_shape.py | 61 +- src/harpy/table/__init__.pyi | 82 +- src/harpy/table/_region_annotations.py | 1207 ++++++++++++------------ src/harpy/table/_table.py | 46 +- 6 files changed, 1053 insertions(+), 1061 deletions(-) diff --git a/src/harpy/shape/__init__.pyi b/src/harpy/shape/__init__.pyi index fbf27a9f..7546a934 100644 --- a/src/harpy/shape/__init__.pyi +++ b/src/harpy/shape/__init__.pyi @@ -1,6 +1,6 @@ from ._cell_expansion import create_voronoi_boundaries -from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, vectorize, prep_region_annotations from ._filters import filter_by_morphology, filter_by_shapes +from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, prep_region_annotations, vectorize __all__ = [ "add_shapes_layer", diff --git a/src/harpy/shape/_filters.py b/src/harpy/shape/_filters.py index f82dadcf..f7f8a5c6 100755 --- a/src/harpy/shape/_filters.py +++ b/src/harpy/shape/_filters.py @@ -1,349 +1,367 @@ -from typing import Optional, Dict, Union, List -from spatialdata import SpatialData -import geopandas as gpd -import pandas as pd -import numpy as np -from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString -from harpy.shape._shape import add_shapes_layer - -def filter_by_morphology( - sdata: SpatialData, - shapes_layer: str, - output_shapes_layer: str, - filters: Optional[Dict[str, tuple]] = None, - keep_unsupported: bool = False, - calculate_all_features: bool = False, - calculate_all_features_grouped: bool = False, - shape_names_column: str = "name", - pixel_size_um: float = 1.0, - overwrite: bool = True, -): - """ - Filter polygons in a SpatialData shapes layer based on morphological features. - It is recommended to run `hp.sh.prep_region_annotations` first when filtering region annotations that - may contain multipolygons, unnamed annotations, etc. - - Parameters - ---------- - sdata - SpatialData object containing the input shapes layer. - shapes_layer - Name of the shapes layer in `sdata.shapes` containing polygons. - output_shapes_layer - Name of the shapes layer to store the filtered polygons. - filters - Dictionary specifying filtering thresholds. - Each entry should be of the form: - `{ "feature_name": (min_value, max_value) }` - where `None` can be used to skip a bound, e.g.: - `{ "area": (50, 200), "convexity": (None, 0.8) }` - - Supported features: - - `"area"`: Area of the polygon (px²). - → Use to filter out very small or very large polygons. - - `"perimeter"`: Perimeter length (px). - → Use to detect irregular boundaries or fragmented shapes. - - `"circularity"`: 4π * area / perimeter². - → 1 for a perfect circle; lower = irregular shape. - - `"compactness"`: perimeter² / area. - → Shape compactness measure. - - `"convex_area"`: Area of convex hull (px²). - → Useful with solidity and convexity. - - `"solidity"`: area / convex_area. - → Low values mean concave or fragmented shapes. - - `"convexity"`: convex_perimeter / perimeter. - → 1 for perfectly convex shapes. Lower for rough or spiky boundaries. - - `"centroid_dif"`: Distance between polygon and convex hull centroids normalized by the polygon area. - → Captures off-centered concavity or asymmetry. - - `"major_axis_length"`: Length of the longest side of the minimum rotated bounding box (px). - → Use to filter very long or very short shapes. - - `"minor_axis_length"`: Length of the shortest side of the minimum rotated bounding box (px). - → Use to filter very wide or very skinny shapes. - - `"major_minor_axis_ratio"`: major_axis_length / minor_axis_length. - → High values indicate elongated polygons; ~1 for round shapes. - - You can use grouped filters by suffixing features with `-grouped`. - Grouped filters merge polygons sharing the same name in `shape_names_column` before computing morphological features and - can be used interchangibly with regular filters. - - keep_unsupported - Only Polygon and MultiPolygon are supported. Set keep_unsupported to True to skip any unssupported geometries types (e.g. Point), but - keep them in the output_shapes_layer. Set to False to remove them from output_shapes_layer. - calculate_all_features - If True, computes all supported morphological features regardless of which ones are used in `filters` and saves them to `output_shapes_layer`. - If False, only computes and saves morphological features needed for `filters`. - calculate_all_features_grouped - If True, computes all morphological features for merged group geometries (by {shape_names_column}), regardless - of which ones are used in grouped filters. - shape_names_column - Column name in shapes layer containing geometry names. Required when using grouped filters. - - pixel_size_um - Scale factor to convert geometric measurements from pixel units to microns. - - Applied to: - * "area" and "convex_area" → scaled by (pixel_size_um)² - * "perimeter", "major_axis_length", and "minor_axis_length" → scaled by (pixel_size_um) - - Dimensionless ratios (e.g., "circularity", "compactness", "solidity", "convexity", "major_minor_axis_ratio") - are unaffected by scaling but are computed using the scaled geometric quantities. - Defaults to 1.0 (no scaling, i.e. units remain in pixels). - Note that this affects the min/max values that need to be specified in `filters`. - overwrite - Whether to overwrite an existing shapes layer. - - Returns - ------- - SpatialData object with updated shapes layer containing only the filtered polygons. - """ - - # Get filters and split into individual and grouped filters - filters = filters or {} - - grouped_filters = {k.replace("-grouped", ""): v for k, v in filters.items() if k.endswith("-grouped")} - individual_filters = {k: v for k, v in filters.items() if not k.endswith("-grouped")} - - required_individual = set(individual_filters.keys()) - required_grouped = set(grouped_filters.keys()) - - if calculate_all_features: - required_individual.update([ - "area", "perimeter", "convex_area", - "circularity", "compactness", "solidity", - "convexity", "centroid_dif", - "major_axis_length", "minor_axis_length", "major_minor_axis_ratio" - ]) - print("Calculating all supported morphological features (per polygon).") - - if calculate_all_features_grouped: - required_grouped.update([ - "area", "perimeter", "convex_area", - "circularity", "compactness", "solidity", - "convexity", "centroid_dif", - "major_axis_length", "minor_axis_length", "major_minor_axis_ratio" - ]) - print("Calculating all supported morphological features (grouped).") - - # Create copy of shapes layer - gdf = sdata.shapes[shapes_layer].copy() - - # Filter out geometries that are not Polygon or MultiPolygon - supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) - - unssuported = len(gdf) - supported_mask.sum() - if unssuported > 0: - print(f"Found {unssuported} non-polygon geometries in {shapes_layer}.") - if unssuported == len(gdf): - print("No supported geometries (Polygon, MultiPolygon) found. Exiting...") - return sdata - - if keep_unsupported: - gdf_skipped = gdf[~supported_mask].copy() - else: - gdf_skipped = gpd.GeoDataFrame(columns=gdf.columns, geometry=[]) - - gdf = gdf[supported_mask].copy() - - # Dissolve polygons by name - grouped_gdf = None - if grouped_filters or calculate_all_features_grouped: - if shape_names_column not in gdf.columns: - raise ValueError("Grouped filters require a 'name' column in the shapes layer.") - - grouped_gdf = gdf.dissolve(by=shape_names_column, as_index=False, aggfunc="first").copy() - grouped_gdf["geometry"] = gdf.groupby(shape_names_column).geometry.apply(lambda x: x.unary_union).values - print(f"Polygons merged by {shape_names_column}, resulting in {len(grouped_gdf)} groups.") - - # Compute metrics - def _compute_morphological_features(gdf, required, pixel_size_um, suffix): - if any(filter in required for filter in ["area", "circularity", "compactness", "solidity", "centroid_dif"]): - gdf[f"area{suffix}"] = gdf.geometry.area * pixel_size_um**2 - - if any(filter in required for filter in ["perimeter", "circularity", "compactness", "convexity"]): - gdf[f"perimeter{suffix}"] = gdf.geometry.length * pixel_size_um - - if any(filter in required for filter in ["convex_area", "solidity"]): - gdf[f"convex_area{suffix}"] = gdf.geometry.convex_hull.area * pixel_size_um**2 - - if "circularity" in required: - gdf[f"circularity{suffix}"] = 4 * np.pi * gdf[f"area{suffix}"] / (gdf[f"perimeter{suffix}"] ** 2) - - if "compactness" in required: - gdf[f"compactness{suffix}"] = (gdf[f"perimeter{suffix}"] ** 2) / gdf[f"area{suffix}"] - - if "solidity" in required: - gdf[f"solidity{suffix}"] = gdf[f"area{suffix}"] / gdf[f"convex_area{suffix}"] - - if "centroid_dif" in required: - gdf[f"centroid_x{suffix}"] = gdf.geometry.centroid.x - gdf[f"centroid_y{suffix}"] = gdf.geometry.centroid.y - hull_centroids = gdf.geometry.convex_hull.centroid - gdf[f"centroid_dif{suffix}"] = np.sqrt( - (gdf[f"centroid_x{suffix}"] - hull_centroids.x) ** 2 + - (gdf[f"centroid_y{suffix}"] - hull_centroids.y) ** 2 - ) / np.sqrt(gdf[f"area{suffix}"]) - - if "convexity" in required: - gdf[f"convexity{suffix}"] = (gdf.geometry.convex_hull.length * pixel_size_um) / gdf[f"perimeter{suffix}"] - - # Get bounding box lengths from minimum rotated rectangle - if any(filter in required for filter in ["major_axis_length", "minor_axis_length", "major_minor_axis_ratio"]): - def _rotated_rect_axes(geom): - try: - rect = geom.minimum_rotated_rectangle - x, y = rect.exterior.coords.xy - # Compute edge lengths - edges = np.sqrt(np.diff(x)**2 + np.diff(y)**2) - edges = np.sort(edges[:-1]) # drop duplicate closing edge - minor, major = edges[0], edges[1] - ratio = major / minor if minor > 0 else np.nan - return major, minor, ratio - except Exception: - return np.nan, np.nan, np.nan - - results = np.array([_rotated_rect_axes(g) for g in gdf.geometry]) - major_axis_length, minor_axis_length, major_minor_axis_ratio = results.T - if any(filter in required for filter in ["major_axis_length", "major_minor_axis_ratio"]): - gdf[f"major_axis_length{suffix}"] = major_axis_length * pixel_size_um - if any(filter in required for filter in ["minor_axis_length", "major_minor_axis_ratio"]): - gdf[f"minor_axis_length{suffix}"] = minor_axis_length * pixel_size_um - if "major_minor_axis_ratio" in required: - gdf[f"major_minor_axis_ratio{suffix}"] = major_minor_axis_ratio - - return gdf - - gdf = _compute_morphological_features(gdf, required_individual, pixel_size_um, "") - grouped_gdf = _compute_morphological_features(grouped_gdf, required_grouped, pixel_size_um, "-grouped") - - # Merge grouped metrics back into main gdf - if grouped_gdf is not None and not grouped_gdf.empty: - grouped_metrics = [col for col in grouped_gdf.columns if col.endswith("-grouped")] - gdf = gdf.merge( - grouped_gdf[[shape_names_column] + grouped_metrics], - on=shape_names_column, - how="left" - ) - - # Apply filters - print(f"Applying morphological filter(s) on {len(gdf)} polygons...") - mask = np.ones(len(gdf), dtype=bool) - if filters: - for feature, (min_val, max_val) in filters.items(): - print(f"\nFiltering by '{feature}': {min_val} ≤ value ≤ {max_val}") - if feature not in gdf.columns: - raise KeyError(f"Feature '{feature}' was not computed. Check your spelling or supported feature list.") - - # Apply lower bound - if min_val is not None: - to_remove = (gdf[feature] < min_val) & mask - removed_low = to_remove.sum() - mask &= gdf[feature] >= min_val - print(f" - Removed {removed_low} polygons with {feature} ≤ {min_val}") - - # Apply upper bound - if max_val is not None: - to_remove = (gdf[feature] > max_val) & mask - removed_high = to_remove.sum() - mask &= gdf[feature] <= max_val - print(f" - Removed {removed_high} polygons with {feature} ≥ {max_val}") - - remaining = mask.sum() - print(f" → Remaining after filtering '{feature}': {remaining} polygons") - - - filtered_gdf = gdf[mask] - print(f"\nKept {len(filtered_gdf)} / {len(gdf)} polygons after morphological filters.") - - # Add filtered shapes layer - input_gdf = pd.concat([filtered_gdf, gdf_skipped], ignore_index=False) - sdata = add_shapes_layer( - sdata, - input=input_gdf, - output_layer=output_shapes_layer, - overwrite=overwrite, - ) - - return sdata - -def filter_by_shapes( - sdata: SpatialData, - target_shapes_layer: str, - mask_shapes_layer: str, - output_shapes_layer: str, - shape_names_column: Optional[str] = None, - shape_names: Optional[Union[str, List[str]]] = None, - keep_intersecting: bool = True, - overwrite: bool = False, -): - """ - Filter polygons in a target shapes layer (typically containg segmention boundaries) based on intersection with polygons in a mask layer - (typically containing region annotations). - - Parameters - ---------- - sdata - SpatialData object containing the target and mask shapes layers. - target_shapes_layer - Name of shapes layer whose polygons will be filtered. - mask_shapes_layer - Name of shapes layer used as mask for filtering. - output_shapes_layer - Name of the output shapes layer to store filtered polygons. - shape_names_column - Optional column in mask_shapes_layer to select specific polygons. - shape_names - Name or list of names of polygons in mask_shapes_layer to use for filtering. Ignored if shape_names_column is None. - keep_intersecting - If True, keeps polygons that intersect the mask. - If False, removes polygons that intersect the mask. - overwrite - If True, overwrites the output shapes layer if it exists. - - Returns - ------- - SpatialData object with filtered shapes layer. - """ - - # Copy target layer - target_gdf = sdata.shapes[target_shapes_layer].copy() - - # Copy mask layer - mask_gdf = sdata.shapes[mask_shapes_layer].copy() - - # Optionally select subset of mask polygons - if shape_names_column is not None and shape_names is not None: - if shape_names_column not in mask_gdf.columns: - raise ValueError(f"Column '{shape_names_column}' not found in mask layer '{mask_shapes_layer}'.") - if isinstance(shape_names, str): - shape_names = [shape_names] - mask_gdf = mask_gdf[mask_gdf[shape_names_column].isin(shape_names)].copy() - if mask_gdf.empty: - raise ValueError(f"No geometries found in '{mask_shapes_layer}' matching {shape_names}.") - - # Build mask union - mask_union = mask_gdf.unary_union - - # Compute intersection boolean - target_gdf["intersects_mask"] = target_gdf.geometry.intersects(mask_union) - - # Filter depending on mode - if keep_intersecting: - filtered_gdf = target_gdf[target_gdf["intersects_mask"]].copy() - else: - filtered_gdf = target_gdf[~target_gdf["intersects_mask"]].copy() - - filtered_gdf.drop(columns=["intersects_mask"], inplace=True) - removed = len(target_gdf) - len(filtered_gdf) - - if keep_intersecting: - print(f"Kept {len(filtered_gdf)} / {len(target_gdf)} geometries intersecting '{mask_shapes_layer}' (removed {removed}).") - else: - print(f"Removed {removed} / {len(target_gdf)} geometries intersecting '{mask_shapes_layer}' (kept {len(filtered_gdf)}).") - - # Add to SpatialData - sdata = add_shapes_layer( - sdata, - input=filtered_gdf, - output_layer=output_shapes_layer, - overwrite=overwrite, - ) - - return sdata +import geopandas as gpd +import numpy as np +import pandas as pd +from shapely.geometry import MultiPolygon, Polygon +from spatialdata import SpatialData + +from harpy.shape._shape import add_shapes_layer + + +def filter_by_morphology( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + filters: dict[str, tuple] | None = None, + keep_unsupported: bool = False, + calculate_all_features: bool = False, + calculate_all_features_grouped: bool = False, + shape_names_column: str = "name", + pixel_size_um: float = 1.0, + overwrite: bool = True, +): + """ + Filter polygons in a SpatialData shapes layer based on morphological features. + It is recommended to run `hp.sh.prep_region_annotations` first when filtering region annotations that + may contain multipolygons, unnamed annotations, etc. + + Parameters + ---------- + sdata + SpatialData object containing the input shapes layer. + shapes_layer + Name of the shapes layer in `sdata.shapes` containing polygons. + output_shapes_layer + Name of the shapes layer to store the filtered polygons. + filters + Dictionary specifying filtering thresholds. + Each entry should be of the form: + `{ "feature_name": (min_value, max_value) }` + where `None` can be used to skip a bound, e.g.: + `{ "area": (50, 200), "convexity": (None, 0.8) }` + + Supported features: + - `"area"`: Area of the polygon (px²). + → Use to filter out very small or very large polygons. + - `"perimeter"`: Perimeter length (px). + → Use to detect irregular boundaries or fragmented shapes. + - `"circularity"`: 4π * area / perimeter². + → 1 for a perfect circle; lower = irregular shape. + - `"compactness"`: perimeter² / area. + → Shape compactness measure. + - `"convex_area"`: Area of convex hull (px²). + → Useful with solidity and convexity. + - `"solidity"`: area / convex_area. + → Low values mean concave or fragmented shapes. + - `"convexity"`: convex_perimeter / perimeter. + → 1 for perfectly convex shapes. Lower for rough or spiky boundaries. + - `"centroid_dif"`: Distance between polygon and convex hull centroids normalized by the polygon area. + → Captures off-centered concavity or asymmetry. + - `"major_axis_length"`: Length of the longest side of the minimum rotated bounding box (px). + → Use to filter very long or very short shapes. + - `"minor_axis_length"`: Length of the shortest side of the minimum rotated bounding box (px). + → Use to filter very wide or very skinny shapes. + - `"major_minor_axis_ratio"`: major_axis_length / minor_axis_length. + → High values indicate elongated polygons; ~1 for round shapes. + + You can use grouped filters by suffixing features with `-grouped`. + Grouped filters merge polygons sharing the same name in `shape_names_column` before computing morphological features and + can be used interchangibly with regular filters. + + keep_unsupported + Only Polygon and MultiPolygon are supported. Set keep_unsupported to True to skip any unssupported geometries types (e.g. Point), but + keep them in the output_shapes_layer. Set to False to remove them from output_shapes_layer. + calculate_all_features + If True, computes all supported morphological features regardless of which ones are used in `filters` and saves them to `output_shapes_layer`. + If False, only computes and saves morphological features needed for `filters`. + calculate_all_features_grouped + If True, computes all morphological features for merged group geometries (by {shape_names_column}), regardless + of which ones are used in grouped filters. + shape_names_column + Column name in shapes layer containing geometry names. Required when using grouped filters. + + pixel_size_um + Scale factor to convert geometric measurements from pixel units to microns. + - Applied to: + * "area" and "convex_area" → scaled by (pixel_size_um)² + * "perimeter", "major_axis_length", and "minor_axis_length" → scaled by (pixel_size_um) + - Dimensionless ratios (e.g., "circularity", "compactness", "solidity", "convexity", "major_minor_axis_ratio") + are unaffected by scaling but are computed using the scaled geometric quantities. + Defaults to 1.0 (no scaling, i.e. units remain in pixels). + Note that this affects the min/max values that need to be specified in `filters`. + overwrite + Whether to overwrite an existing shapes layer. + + Returns + ------- + SpatialData object with updated shapes layer containing only the filtered polygons. + """ + # Get filters and split into individual and grouped filters + filters = filters or {} + + grouped_filters = {k.replace("-grouped", ""): v for k, v in filters.items() if k.endswith("-grouped")} + individual_filters = {k: v for k, v in filters.items() if not k.endswith("-grouped")} + + required_individual = set(individual_filters.keys()) + required_grouped = set(grouped_filters.keys()) + + if calculate_all_features: + required_individual.update( + [ + "area", + "perimeter", + "convex_area", + "circularity", + "compactness", + "solidity", + "convexity", + "centroid_dif", + "major_axis_length", + "minor_axis_length", + "major_minor_axis_ratio", + ] + ) + print("Calculating all supported morphological features (per polygon).") + + if calculate_all_features_grouped: + required_grouped.update( + [ + "area", + "perimeter", + "convex_area", + "circularity", + "compactness", + "solidity", + "convexity", + "centroid_dif", + "major_axis_length", + "minor_axis_length", + "major_minor_axis_ratio", + ] + ) + print("Calculating all supported morphological features (grouped).") + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Filter out geometries that are not Polygon or MultiPolygon + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) + + unssuported = len(gdf) - supported_mask.sum() + if unssuported > 0: + print(f"Found {unssuported} non-polygon geometries in {shapes_layer}.") + if unssuported == len(gdf): + print("No supported geometries (Polygon, MultiPolygon) found. Exiting...") + return sdata + + if keep_unsupported: + gdf_skipped = gdf[~supported_mask].copy() + else: + gdf_skipped = gpd.GeoDataFrame(columns=gdf.columns, geometry=[]) + + gdf = gdf[supported_mask].copy() + + # Dissolve polygons by name + grouped_gdf = None + if grouped_filters or calculate_all_features_grouped: + if shape_names_column not in gdf.columns: + raise ValueError("Grouped filters require a 'name' column in the shapes layer.") + + grouped_gdf = gdf.dissolve(by=shape_names_column, as_index=False, aggfunc="first").copy() + grouped_gdf["geometry"] = gdf.groupby(shape_names_column).geometry.apply(lambda x: x.unary_union).values + print(f"Polygons merged by {shape_names_column}, resulting in {len(grouped_gdf)} groups.") + + # Compute metrics + def _compute_morphological_features(gdf, required, pixel_size_um, suffix): + if any(filter in required for filter in ["area", "circularity", "compactness", "solidity", "centroid_dif"]): + gdf[f"area{suffix}"] = gdf.geometry.area * pixel_size_um**2 + + if any(filter in required for filter in ["perimeter", "circularity", "compactness", "convexity"]): + gdf[f"perimeter{suffix}"] = gdf.geometry.length * pixel_size_um + + if any(filter in required for filter in ["convex_area", "solidity"]): + gdf[f"convex_area{suffix}"] = gdf.geometry.convex_hull.area * pixel_size_um**2 + + if "circularity" in required: + gdf[f"circularity{suffix}"] = 4 * np.pi * gdf[f"area{suffix}"] / (gdf[f"perimeter{suffix}"] ** 2) + + if "compactness" in required: + gdf[f"compactness{suffix}"] = (gdf[f"perimeter{suffix}"] ** 2) / gdf[f"area{suffix}"] + + if "solidity" in required: + gdf[f"solidity{suffix}"] = gdf[f"area{suffix}"] / gdf[f"convex_area{suffix}"] + + if "centroid_dif" in required: + gdf[f"centroid_x{suffix}"] = gdf.geometry.centroid.x + gdf[f"centroid_y{suffix}"] = gdf.geometry.centroid.y + hull_centroids = gdf.geometry.convex_hull.centroid + gdf[f"centroid_dif{suffix}"] = np.sqrt( + (gdf[f"centroid_x{suffix}"] - hull_centroids.x) ** 2 + + (gdf[f"centroid_y{suffix}"] - hull_centroids.y) ** 2 + ) / np.sqrt(gdf[f"area{suffix}"]) + + if "convexity" in required: + gdf[f"convexity{suffix}"] = (gdf.geometry.convex_hull.length * pixel_size_um) / gdf[f"perimeter{suffix}"] + + # Get bounding box lengths from minimum rotated rectangle + if any(filter in required for filter in ["major_axis_length", "minor_axis_length", "major_minor_axis_ratio"]): + + def _rotated_rect_axes(geom): + try: + rect = geom.minimum_rotated_rectangle + x, y = rect.exterior.coords.xy + # Compute edge lengths + edges = np.sqrt(np.diff(x) ** 2 + np.diff(y) ** 2) + edges = np.sort(edges[:-1]) # drop duplicate closing edge + minor, major = edges[0], edges[1] + ratio = major / minor if minor > 0 else np.nan + return major, minor, ratio + except Exception: + return np.nan, np.nan, np.nan + + results = np.array([_rotated_rect_axes(g) for g in gdf.geometry]) + major_axis_length, minor_axis_length, major_minor_axis_ratio = results.T + if any(filter in required for filter in ["major_axis_length", "major_minor_axis_ratio"]): + gdf[f"major_axis_length{suffix}"] = major_axis_length * pixel_size_um + if any(filter in required for filter in ["minor_axis_length", "major_minor_axis_ratio"]): + gdf[f"minor_axis_length{suffix}"] = minor_axis_length * pixel_size_um + if "major_minor_axis_ratio" in required: + gdf[f"major_minor_axis_ratio{suffix}"] = major_minor_axis_ratio + + return gdf + + gdf = _compute_morphological_features(gdf, required_individual, pixel_size_um, "") + grouped_gdf = _compute_morphological_features(grouped_gdf, required_grouped, pixel_size_um, "-grouped") + + # Merge grouped metrics back into main gdf + if grouped_gdf is not None and not grouped_gdf.empty: + grouped_metrics = [col for col in grouped_gdf.columns if col.endswith("-grouped")] + gdf = gdf.merge(grouped_gdf[[shape_names_column] + grouped_metrics], on=shape_names_column, how="left") + + # Apply filters + print(f"Applying morphological filter(s) on {len(gdf)} polygons...") + mask = np.ones(len(gdf), dtype=bool) + if filters: + for feature, (min_val, max_val) in filters.items(): + print(f"\nFiltering by '{feature}': {min_val} ≤ value ≤ {max_val}") + if feature not in gdf.columns: + raise KeyError(f"Feature '{feature}' was not computed. Check your spelling or supported feature list.") + + # Apply lower bound + if min_val is not None: + to_remove = (gdf[feature] < min_val) & mask + removed_low = to_remove.sum() + mask &= gdf[feature] >= min_val + print(f" - Removed {removed_low} polygons with {feature} ≤ {min_val}") + + # Apply upper bound + if max_val is not None: + to_remove = (gdf[feature] > max_val) & mask + removed_high = to_remove.sum() + mask &= gdf[feature] <= max_val + print(f" - Removed {removed_high} polygons with {feature} ≥ {max_val}") + + remaining = mask.sum() + print(f" → Remaining after filtering '{feature}': {remaining} polygons") + + filtered_gdf = gdf[mask] + print(f"\nKept {len(filtered_gdf)} / {len(gdf)} polygons after morphological filters.") + + # Add filtered shapes layer + input_gdf = pd.concat([filtered_gdf, gdf_skipped], ignore_index=False) + sdata = add_shapes_layer( + sdata, + input=input_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata + + +def filter_by_shapes( + sdata: SpatialData, + target_shapes_layer: str, + mask_shapes_layer: str, + output_shapes_layer: str, + shape_names_column: str | None = None, + shape_names: str | list[str] | None = None, + keep_intersecting: bool = True, + overwrite: bool = False, +): + """ + Filter polygons in a target shapes layer (typically containg segmention boundaries) based on intersection with polygons in a mask layer + (typically containing region annotations). + + Parameters + ---------- + sdata + SpatialData object containing the target and mask shapes layers. + target_shapes_layer + Name of shapes layer whose polygons will be filtered. + mask_shapes_layer + Name of shapes layer used as mask for filtering. + output_shapes_layer + Name of the output shapes layer to store filtered polygons. + shape_names_column + Optional column in mask_shapes_layer to select specific polygons. + shape_names + Name or list of names of polygons in mask_shapes_layer to use for filtering. Ignored if shape_names_column is None. + keep_intersecting + If True, keeps polygons that intersect the mask. + If False, removes polygons that intersect the mask. + overwrite + If True, overwrites the output shapes layer if it exists. + + Returns + ------- + SpatialData object with filtered shapes layer. + """ + # Copy target layer + target_gdf = sdata.shapes[target_shapes_layer].copy() + + # Copy mask layer + mask_gdf = sdata.shapes[mask_shapes_layer].copy() + + # Optionally select subset of mask polygons + if shape_names_column is not None and shape_names is not None: + if shape_names_column not in mask_gdf.columns: + raise ValueError(f"Column '{shape_names_column}' not found in mask layer '{mask_shapes_layer}'.") + if isinstance(shape_names, str): + shape_names = [shape_names] + mask_gdf = mask_gdf[mask_gdf[shape_names_column].isin(shape_names)].copy() + if mask_gdf.empty: + raise ValueError(f"No geometries found in '{mask_shapes_layer}' matching {shape_names}.") + + # Build mask union + mask_union = mask_gdf.unary_union + + # Compute intersection boolean + target_gdf["intersects_mask"] = target_gdf.geometry.intersects(mask_union) + + # Filter depending on mode + if keep_intersecting: + filtered_gdf = target_gdf[target_gdf["intersects_mask"]].copy() + else: + filtered_gdf = target_gdf[~target_gdf["intersects_mask"]].copy() + + filtered_gdf.drop(columns=["intersects_mask"], inplace=True) + removed = len(target_gdf) - len(filtered_gdf) + + if keep_intersecting: + print( + f"Kept {len(filtered_gdf)} / {len(target_gdf)} geometries intersecting '{mask_shapes_layer}' (removed {removed})." + ) + else: + print( + f"Removed {removed} / {len(target_gdf)} geometries intersecting '{mask_shapes_layer}' (kept {len(filtered_gdf)})." + ) + + # Add to SpatialData + sdata = add_shapes_layer( + sdata, + input=filtered_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata diff --git a/src/harpy/shape/_shape.py b/src/harpy/shape/_shape.py index c7f29f7f..f758110e 100644 --- a/src/harpy/shape/_shape.py +++ b/src/harpy/shape/_shape.py @@ -1,8 +1,9 @@ from __future__ import annotations +import numpy as np from dask.array import Array from geopandas import GeoDataFrame -from shapely.geometry import GeometryCollection, MultiPolygon, Polygon, Point, MultiPoint, LineString, MultiLineString +from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon from spatialdata import SpatialData from spatialdata.models._utils import MappingToCoordinateSystem_t from spatialdata.transformations import get_transformation @@ -10,7 +11,6 @@ from harpy.image._image import _get_spatial_element from harpy.shape._manager import ShapesLayerManager -import numpy as np def vectorize( sdata, @@ -180,8 +180,8 @@ def intersect_rectangles(rect1: list[int | float], rect2: list[int | float]) -> return [x_min, x_max, y_min, y_max] else: return None - - + + def prep_region_annotations( sdata: SpatialData, shapes_layer: str, @@ -214,38 +214,37 @@ def prep_region_annotations( unnamed Name to be assigned to any unnamed geometries in `shape_names_column`. Defaults to 'unnamed'. unique_shape_names_column - Column name in which unique names will be created for single polygons by appending a counter to the original name in `shape_names_column` for polygons with the same name. Note - that multipolygons will be split in individual polygons and each will get a unique name based on the original name of the multipolygon. Unique names will be stored in + Column name in which unique names will be created for single polygons by appending a counter to the original name in `shape_names_column` for polygons with the same name. Note + that multipolygons will be split in individual polygons and each will get a unique name based on the original name of the multipolygon. Unique names will be stored in `{shape_names_column}-unique` in the updated shapes layer. erosion - Number of pixels to erode polygons by. This can avoid problems with overlapping edges of geometries when calculating distances. Default is 0.5 (i.e. erosion by 0.5 pixels). + Number of pixels to erode polygons by. This can avoid problems with overlapping edges of geometries when calculating distances. Default is 0.5 (i.e. erosion by 0.5 pixels). overwrite If True, overwrites the `output_shapes_layer` if it already exists in `sdata`. - + Returns ------- Modified `sdata` object with updated updated shapes layer. """ - # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() print(f"Found {len(gdf)} geometries in {shapes_layer}.") - + # Ensure shape_names_column exist if shape_names_column not in gdf.columns: gdf[shape_names_column] = unnamed - + gdf[shape_names_column] = gdf[shape_names_column].fillna("").astype(str) unnamed_mask = gdf[shape_names_column] == "" - for i, idx in enumerate(gdf[unnamed_mask].index): + for _i, idx in enumerate(gdf[unnamed_mask].index): gdf.at[idx, shape_names_column] = unnamed - + # Convert Points with a radius column to circular polygons def _point_to_circle(geom, radius=None): if isinstance(geom, Point) and radius is not None: return geom.buffer(radius, resolution=16) return geom - + if "radius" in gdf.columns: mask_points_with_radius = gdf.geometry.geom_type.eq("Point") & gdf["radius"].notna() n_converted = mask_points_with_radius.sum() @@ -253,37 +252,34 @@ def _point_to_circle(geom, radius=None): if n_converted > 0: print(f"Converting {n_converted} Point geometries with 'radius' to circular polygons.") gdf.loc[mask_points_with_radius, "geometry"] = gdf.loc[mask_points_with_radius].apply( - lambda row: _point_to_circle(row.geometry, getattr(row, "radius", None)), - axis=1 + lambda row: _point_to_circle(row.geometry, getattr(row, "radius", None)), axis=1 ) - + # Slightly erode all polygons (this avoids any shared borders between polygons) gdf["geometry"] = gdf.geometry.apply( - lambda geom: geom.buffer( - -erosion, - join_style=2, - resolution=16) - if geom.geom_type in ["Polygon", "MultiPolygon"] else geom + lambda geom: geom.buffer(-erosion, join_style=2, resolution=16) + if geom.geom_type in ["Polygon", "MultiPolygon"] + else geom ) - polygon_mask = ~gdf.geometry.is_empty - removed = len(gdf) - polygon_mask.sum() - gdf = gdf[polygon_mask] # drop any polygons that collapsed to empty - if removed > 0: + polygon_mask = ~gdf.geometry.is_empty + removed = len(gdf) - polygon_mask.sum() + gdf = gdf[polygon_mask] # drop any polygons that collapsed to empty + if removed > 0: print(f"Removed {removed} polygons that collapsed to empty after erosion.") - + # Explode multipolygons into single polygons (this allows us to treat multipolygons as unique polygons) n_multipolygons = gdf.geometry.apply(lambda g: g.geom_type == "MultiPolygon").sum() gdf = gdf.explode(index_parts=False).reset_index(drop=True) n_after = len(gdf) - print(f"Split {n_multipolygons} multipolygons into individual polygons. Total number of geometries after splitting multipolgons is {n_after}.") - + print( + f"Split {n_multipolygons} multipolygons into individual polygons. Total number of geometries after splitting multipolgons is {n_after}." + ) + # Create unique names sizes = gdf.groupby(shape_names_column)[shape_names_column].transform("size") counter = gdf.groupby(shape_names_column).cumcount() + 1 gdf[unique_shape_names_column] = np.where( - sizes.eq(1), - gdf[shape_names_column], - gdf[shape_names_column] + counter.astype(str) + sizes.eq(1), gdf[shape_names_column], gdf[shape_names_column] + counter.astype(str) ) # Add filtered shapes layer @@ -295,4 +291,3 @@ def _point_to_circle(geom, radius=None): ) return sdata - diff --git a/src/harpy/table/__init__.pyi b/src/harpy/table/__init__.pyi index a060960c..a254ba3a 100755 --- a/src/harpy/table/__init__.pyi +++ b/src/harpy/table/__init__.pyi @@ -1,41 +1,41 @@ -from ._allocation import allocate, bin_counts -from ._allocation_intensity import allocate_intensity -from ._annotation import cluster_cleanliness, score_genes, score_genes_iter -from ._clustering import kmeans, leiden -from ._enrichment import nhood_enrichment -from ._preprocess import preprocess_proteomics, preprocess_transcriptomics -from ._regionprops import add_regionprop_features -from ._table import add_table_layer, correct_marker_genes, filter_on_size, filter_numerical, filter_categorical -from .cell_clustering._clustering import flowsom -from .cell_clustering._preprocess import cell_clustering_preprocess -from .cell_clustering._weighted_channel_expression import weighted_channel_expression -from .pixel_clustering._cluster_intensity import cluster_intensity -from .pixel_clustering._neighbors import spatial_pixel_neighbors -from ._region_annotations import assign_cells_to_shapes, compute_distance_to_shapes - -__all__ = [ - "add_table_layer", - "correct_marker_genes", - "filter_on_size", - "filter_numerical", - "filter_categorical", - "flowsom", - "weighted_channel_expression", - "cell_clustering_preprocess", - "cluster_intensity", - "spatial_pixel_neighbors", - "kmeans", - "leiden", - "nhood_enrichment", - "preprocess_proteomics", - "preprocess_transcriptomics", - "add_regionprop_features", - "cluster_cleanliness", - "score_genes", - "score_genes_iter", - "allocate", - "bin_counts", - "allocate_intensity", - "assign_cells_to_shapes", - "compute_distance_to_shapes", -] +from ._allocation import allocate, bin_counts +from ._allocation_intensity import allocate_intensity +from ._annotation import cluster_cleanliness, score_genes, score_genes_iter +from ._clustering import kmeans, leiden +from ._enrichment import nhood_enrichment +from ._preprocess import preprocess_proteomics, preprocess_transcriptomics +from ._region_annotations import assign_cells_to_shapes, compute_distance_to_shapes +from ._regionprops import add_regionprop_features +from ._table import add_table_layer, correct_marker_genes, filter_categorical, filter_numerical, filter_on_size +from .cell_clustering._clustering import flowsom +from .cell_clustering._preprocess import cell_clustering_preprocess +from .cell_clustering._weighted_channel_expression import weighted_channel_expression +from .pixel_clustering._cluster_intensity import cluster_intensity +from .pixel_clustering._neighbors import spatial_pixel_neighbors + +__all__ = [ + "add_table_layer", + "correct_marker_genes", + "filter_on_size", + "filter_numerical", + "filter_categorical", + "flowsom", + "weighted_channel_expression", + "cell_clustering_preprocess", + "cluster_intensity", + "spatial_pixel_neighbors", + "kmeans", + "leiden", + "nhood_enrichment", + "preprocess_proteomics", + "preprocess_transcriptomics", + "add_regionprop_features", + "cluster_cleanliness", + "score_genes", + "score_genes_iter", + "allocate", + "bin_counts", + "allocate_intensity", + "assign_cells_to_shapes", + "compute_distance_to_shapes", +] diff --git a/src/harpy/table/_region_annotations.py b/src/harpy/table/_region_annotations.py index b463de11..23eb70c3 100755 --- a/src/harpy/table/_region_annotations.py +++ b/src/harpy/table/_region_annotations.py @@ -1,612 +1,595 @@ -from spatialdata import SpatialData -import geopandas as gpd -import pandas as pd -from typing import Literal -from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString -from harpy.table._table import add_table_layer -from harpy.utils._keys import _REGION_KEY, _INSTANCE_KEY - -def assign_cells_to_shapes( - sdata: SpatialData, - shapes_layer: str, - table_layer: str, - output_table_layer: str, - shape_names_column: str = "name", - unique_shape_names_column: str = "name-unique", - output_column: str = None, - mode: Literal["original_names", "unique_names", "both"] = "original_names", - create_column_per_shape: bool = False, - overlap_tolerance: float = 0.1, - spatial_key: str = 'spatial', - xy_columns: tuple = None, - overwrite: bool = False, -): - """ - Assign cells to polygons in a shapes layer and update the sdata table layer. It is recommended to run `hp.sh.prep_region_annotations` first. - - Parameters - ---------- - sdata - The SpatialData object containing the input table layer and shapes layer. - shapes_layer - The shapes layer in `sdata.shapes` to use as input. - table_layer - The table layer in `sdata.tables` to use as input. - output_table_layer - The output table layer in `sdata.tables` to which the updated table layer will be written. - shape_names_column - Column name in shapes layer containing geometry names. - unique_shape_names_column - Column name in shapes layer containing unique geometry names. - output_column - Name of the output column in `sdata.tables[table_layer].obs` to store the shape name of the geometries a cell was found in (if `create_column_per_shape` is False). - For the `unique_names` mode, the output will be stored in `{output_column}-unique` in `sdata.tables[table_layer].obs` (if `create_column_per_shape` is False). - If create_column_per_shape is True and output_column is not None, then column names will be in the format `{output_colum}-{shape_name}`. - If create_column_per_shape is True and output_column is None, then column names will be in the format `{shape_name}`. - mode - When set to `original_names`, original polygon names from `shape_names_column` will be used. - When set to `unique_names`, unique polygon names from `unique_shape_names_column` will be used. Use `both`, to run both modes at the same time. - create_column_per_shape - If True, create one column (named according to the shape names) per shape indicating whether a cell is located inside it. - overlap_tolerance - Tolerance for detecting overlapping polygons (area units of geometry CRS). - spatial_key - Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. - xy_columns - Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. - If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. - overwrite - If True, overwrites the `output_table_layer` and/or `output_shapes_layer` if it already exists in `sdata`. - - Notes - ------- - - Only `Polygon` and `MultiPolygon` geometries are supported. Non-polygon geometries (e.g., `Point`, `LineString`) are skipped. - - Returns - ------- - Modified `sdata` object with updated table layer. - """ - - if not output_column and not create_column_per_shape: - raise ValueError("Specify `output_column` or set `create_column_per_shape=True`.") - - # Create copy of shapes layer - gdf = sdata.shapes[shapes_layer].copy() - - # Create copy of table layer - adata = sdata.tables[table_layer].copy() - - # Filter out geometries that are not Polygon or MultiPolygon - supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) - skipped = len(gdf) - supported_mask.sum() - if skipped > 0: - print(f"Skipped {skipped} non-polygon geometries in {shapes_layer}.") - if skipped == len(gdf): - print("No supported geometries (Polygon, MultiPolygon) found.") - return sdata - - gdf = gdf[supported_mask].copy().reset_index(drop=True) - - # Check for overlapping polygons - total_area = gdf.geometry.area.sum() - union_area = gdf.geometry.unary_union.area - if total_area - union_area > overlap_tolerance and not create_column_per_shape: - raise ValueError(f"Overlapping polygons detected in {shapes_layer}. Correct polygons or use create_column_per_shape.") - elif total_area - union_area > 0 and not create_column_per_shape: - print(f"Overlaps detected (Δ={total_area - union_area:.3f}), below tolerance threshold {overlap_tolerance}.") - - # Get cell coordinates - if xy_columns is not None: - x_col, y_col = xy_columns - coords = adata.obs[[x_col, y_col]].to_numpy() - else: - if spatial_key not in adata.obsm: - raise KeyError( - f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") - coords = adata.obsm[spatial_key] - if coords.shape[1] != 2: - raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") - - # Function to assign cell to a polygon - def _assign_region(x, y, gdf, column, sindex): - point = Point(x, y) - - # Quick bounding box search to prefilter the cells - candidate_idx = list(sindex.intersection(point.bounds)) - if not candidate_idx: - return None - - # Slower point‑in‑polygon search - candidate_gdf = gdf.iloc[candidate_idx] - match = candidate_gdf[candidate_gdf.contains(point)] - - if match.empty: - return None - - return match[column].iloc[0] - - # Assign cells to polygons - if create_column_per_shape: - # Collect all relevant name columns based on mode - name_columns = [] - if mode in ("original_names", "both"): - name_columns.append(shape_names_column) - if mode in ("unique_names", "both"): - name_columns.append(unique_shape_names_column) - - # Collect all unique names across both columns - name_to_column = {} - for name_column in name_columns: - for name in gdf[name_column].dropna().astype(str).unique(): - if name not in name_to_column: - name_to_column[name] = name_column - - # Create column per shape name - for shape_name, name_column in name_to_column.items(): - col_name = f"{output_column}-{shape_name}" if output_column else shape_name - - gdf_shape = gdf[gdf[name_column] == shape_name] - if gdf_shape.empty: - continue - - sidx_shape = gdf_shape.sindex # Build spatial index - assigned = [ - _assign_region(x, y, gdf_shape, name_column, sidx_shape) for x, y in coords - ] - adata.obs[col_name] = assigned - - else: - # One output column per mode - run_assign_dict = {} - if mode in ("original_names", "both"): - run_assign_dict[output_column] = shape_names_column - if mode in ("unique_names", "both"): - run_assign_dict[f"{output_column}-unique"] = unique_shape_names_column - - for output_column_name, name_column in run_assign_dict.items(): - # Assign one shape name per cell (to be avoided with overlapping regions) - sidx = gdf.sindex # Build spatial index - assigned = [ - _assign_region(x, y, gdf, name_column, sidx) - for x, y in coords - ] - adata.obs[output_column_name] = assigned - - # Add table layer - sdata = add_table_layer( - sdata, - adata = adata, - output_layer = output_table_layer, - region = adata.obs[_REGION_KEY].cat.categories.to_list(), - overwrite = overwrite, - ) - - return sdata - -def compute_distance_to_shapes( - sdata: SpatialData, - shapes_layer: str, - table_layer: str, - output_table_layer: str, - shape_names_column: str = "name", - unique_shape_names_column: str = "name-unique", - output_name: str = None, - pixel_size_um: float = 1.0, - modes: list[Literal[ - "nearest_edge", "nearest_edge_grouped", "all_edges", - "nearest_outer_edge", "nearest_outer_edge_grouped", "all_outer_edges", - "nearest_inner_edge", "nearest_inner_edge_grouped", "nearest_inner_edge_grouped_unique", "all_inner_edges", - "nearest_centroid", "nearest_centroid_grouped", "all_centroids", - "nearest_point", "nearest_point_grouped", "all_points" - ]] = ["all_edges"], - spatial_key: str = 'spatial', - xy_columns: tuple = None, - overwrite: bool = False, -): - """ - Compute distances from cells to polygons in a shapes layer and update the sdata table layer. - It is recommended to run `hp.sh.prep_region_annotations` first. - - Parameters - ---------- - sdata - The SpatialData object containing the input table layer and shapes layer. - shapes_layer - The shapes layer in `sdata.shapes` to use as input. - table_layer - The table layer in `sdata.tables` to use as input. - output_table_layer - The output table layer in `sdata.tables` to which the updated table layer will be written. - shape_names_column - Column name in shapes layer containing geometry names. - unique_shape_names_column - Column name in shapes layer containing unique geometry names. - output_name - Prefix for new distance columns in `.obs`. If None, no prefix is added. - pixel_size_um - Scale factor to convert distances to microns. Defaults to 1 (i.e. distances are in pixels). - modes - Which distance features to calculate. Options include: - - Edge distances (For Polygon and MultiPolygon geometries) - - `"nearest_edge"`: Distance to the nearest edge (outer or inner) of any polygon. - Creates `{output_name}-distance_to_nearest_edge` and `{output_name}-name_of_nearest_edge` columns. - - `"nearest_edge_grouped"`: For each group of shapes with the same name, compute the distance to the edge that is nearest. - Creates `{output_name}-distance_to_nearest_edge_of_` and `{output_name}-name_of_nearest_edge_of_` columns. - - `"all_edges"`: For each individual polygon, compute the distance to its edge. - Creates `{output_name}-distance_to_edge_of_` column. - - Outer edge distances (For Polygon and MultiPolygon geometries) - - `"nearest_outer_edge"`: Distance to the nearest outer edge of any polygon. - Creates `{output_name}-distance_to_nearest_outer_edge` and `{output_name}-name_of_nearest_outer_edge` columns. - - `"nearest_outer_edge_grouped"`: For each group of shapes with the same name, compute the distance to the outer edge that is nearest. - Creates `{output_name}-distance_to_nearest_outer_edge_of_` and `{output_name}-name_of_nearest_outer_edge_of_` columns. - - `"all_outer_edges"`: For each individual polygon, compute the distance to its outer edge. - Creates `{output_name}-distance_to_outer_edge_of_` column. - - Inner edge (hole) distances (For Polygon and MultiPolygon geometries) - - `"nearest_inner_edge"`: Distance to the nearest interior edge (“hole”) of any polygon. - Creates `{output_name}-distance_to_nearest_inner_edge` and `{output_name}-name_of_nearest_inner_edge` columns. - - `"nearest_inner_edge_grouped"`: For each group of shapes with the same name, compute the distance to the nearest inner edge of that group. - Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. - - `"nearest_inner_edge_grouped_unique"`: For each individual polygon, compute the distance to the nearest inner edge of that polygon. - Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. - - `"all_inner_edges"`: For all holes, compute the distance to each inner edge. - Creates `{output_name}-distance_to_inner_edge_of_-hole` column. - - Centroid distances (For Polygon and MultiPolygon geometries) - - `"nearest_centroid"`: Distance to the nearest centroid of any polygon. - Creates `{output_name}-distance_to_nearest_centroid` and `{output_name}-name_of_nearest_centroid` columns. - - `"nearest_centroid_grouped"`: For each group of shapes with the same name, compute the distance to the centroid that is nearest. - Creates `{output_name}-distance_to_nearest_centroid_of_` and `{output_name}-name_of_nearest_centroid_of_` columns. - - `"all_centroids"`: For each individual polygon, compute the distance to its centroid. - Creates `{output_name}-distance_to_centroid_of_` column. - - Point distances (For Point geometries) - - `"nearest_point"`: Distance to the nearest point. - Creates `{output_name}-distance_to_nearest_point` and `{output_name}-name_of_nearest_point` columns. - - `"nearest_point_grouped"`: For each group of points with the same name, compute the nearest distance. - Creates `{output_name}-distance_to_nearest_point_of_` and `{output_name}-name_of_nearest_point_of_` columns. - - `"all_points"`: For each individual Point, compute the distance to point coordinates. - Creates `{output_name}-distance_to_point_` column. - - spatial_key - Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. - xy_columns - Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. - If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. - overwrite - If True, overwrites the `output_table_layer` if it already exists in `sdata`. - - Notes - ------- - - Only `Polygon`, `MultiPolygon` and `Point` geometries are supported. Other geometries (e.g., `LineString`, `MultiPoint`) are skipped. - - Returns - ------- - Modified `sdata` object with updated table layer. - """ - - if output_name is not None: - output_name = f"{output_name}-" - elif output_name is None: - output_name = "" - - # Create copy of shapes layer - gdf = sdata.shapes[shapes_layer].copy() - - # Create copy of table layer - adata = sdata.tables[table_layer].copy() - - # Filter out geometries that are not Polygon, MultiPolygon or Point - supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon, Point))) - skipped = len(gdf) - supported_mask.sum() - if skipped > 0: - print(f"Skipped {skipped} geometries in {shapes_layer} that are not Polygon, MultiPolygon or Point.") - - gdf = gdf[supported_mask].copy().reset_index(drop=True) - - # Separate gdf by type - gdf_polygons = gdf[gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])].copy() - gdf_points = gdf[gdf.geometry.geom_type == "Point"].copy() - - has_polygons = not gdf_polygons.empty - has_points = not gdf_points.empty - - if not has_polygons and not has_points: - print("No supported geometries (Polygon, MultiPolygon, Point) found.") - return sdata - - # Get cell coordinates - if xy_columns is not None: - x_col, y_col = xy_columns - coords = adata.obs[[x_col, y_col]].to_numpy() - else: - if spatial_key not in adata.obsm: - raise KeyError( - f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") - coords = adata.obsm[spatial_key] - if coords.shape[1] != 2: - raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") - - # Build point GeoDataFrame - pts = gpd.GeoSeries( - [Point(x, y) for x, y in coords], - crs=gdf.crs - ) - - pts_gdf = gpd.GeoDataFrame({"geometry": pts}, crs=gdf.crs).set_index(adata.obs.index) - - # Polygon and MultiPolygon - if has_polygons: - # Extract exterior and interior boundaries - def _extract_exterior_lines(geom): - if isinstance(geom, Polygon): - return [geom.exterior] - elif isinstance(geom, MultiPolygon): - return [poly.exterior for poly in geom.geoms] - return [] - - def _extract_interior_lines(geom): - if isinstance(geom, Polygon): - return list(geom.interiors) - elif isinstance(geom, MultiPolygon): - lines = [] - for poly in geom.geoms: - lines.extend(poly.interiors) - return lines - return [] - - names = gdf_polygons[unique_shape_names_column].tolist() - exteriors = [MultiLineString(_extract_exterior_lines(geom)) for geom in gdf_polygons.geometry] - interiors = [MultiLineString(_extract_interior_lines(geom)) for geom in gdf_polygons.geometry] - - ext_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=exteriors, crs=gdf.crs) - int_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=interiors, crs=gdf.crs) - - # Edges (outer and inner) - if "nearest_edge" in modes: - print(f"Calculating 'nearest_edge' distances'") - all_edges_gdf = gpd.GeoDataFrame( - pd.concat([ext_gdf, int_gdf], ignore_index=True), - crs=gdf.crs - ) - joined = gpd.sjoin_nearest( - pts_gdf, - all_edges_gdf, - how="left", distance_col="dist" - ) - adata.obs[f"{output_name}distance_to_nearest_edge"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) - adata.obs[f"{output_name}name_of_nearest_edge"] = joined[unique_shape_names_column] - - if "nearest_edge_grouped" in modes: - print(f"Calculating 'nearest_edge_grouped' distances'") - for name, group in gdf_polygons.groupby(shape_names_column): - edges = [] - labels = [] - for idx, row in group.iterrows(): - edge_lines = _extract_exterior_lines(row.geometry) + _extract_interior_lines(row.geometry) - edges.extend(edge_lines) - labels.extend([row[unique_shape_names_column]] * len(edge_lines)) - - edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) - joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") - - adata.obs[f"{output_name}distance_to_nearest_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um - adata.obs[f"{output_name}name_of_nearest_edge_of_{name}"] = joined[unique_shape_names_column] - - if "all_edges" in modes: - print(f"Calculating 'all_edges' distances'") - for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_edge_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.boundary).to_numpy() * pixel_size_um - - # Outer edges - if "nearest_outer_edge" in modes: - print(f"Calculating 'nearest_outer_edge' distances'") - joined = gpd.sjoin_nearest( - pts_gdf, - ext_gdf, - how="left", distance_col="dist" - ) - adata.obs[f"{output_name}distance_to_nearest_outer_edge"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) - adata.obs[f"{output_name}name_of_nearest_outer_edge"] = joined[unique_shape_names_column] - - if "nearest_outer_edge_grouped" in modes: - print(f"Calculating 'nearest_outer_edge_grouped' distances'") - for name, group in gdf_polygons.groupby(shape_names_column): - edges = [] - labels = [] - for idx, row in group.iterrows(): - edge_lines = _extract_exterior_lines(row.geometry) - edges.extend(edge_lines) - labels.extend([row[unique_shape_names_column]] * len(edge_lines)) - - edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) - joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") - - adata.obs[f"{output_name}distance_to_nearest_outer_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um - adata.obs[f"{output_name}name_of_nearest_outer_edge_of_{name}"] = joined[unique_shape_names_column] - - if "all_outer_edges" in modes: - print(f"Calculating 'all_outer_edges' distances'") - for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_outer_edge_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.exterior).to_numpy() * pixel_size_um - - # Inner edges - hole_geoms = [] - hole_names = [] - - for _, row in gdf_polygons.reset_index(drop=True).iterrows(): - base = row[unique_shape_names_column] - geom = row.geometry - - holes =_extract_interior_lines(geom) - - if len(holes) == 0: - continue - - if len(holes) == 1: - hole = holes[0] - hole_geoms.append(LineString(hole.coords)) - hole_names.append(base) - - else: - for i, hole in enumerate(holes, start=1): - hole_geoms.append(LineString(hole.coords)) - hole_names.append(f"{base}-hole{i}") - - holes_gdf = gpd.GeoDataFrame( - {unique_shape_names_column: hole_names}, - geometry=hole_geoms, - crs=gdf.crs - ) - - if "nearest_inner_edge" in modes and not holes_gdf.empty: - print(f"Calculating 'nearest_inner_edge' distances'") - joined = gpd.sjoin_nearest( - pts_gdf, - holes_gdf, - how="left", - distance_col="dist" - ) - adata.obs[f"{output_name}distance_to_nearest_inner_edge"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) - adata.obs[f"{output_name}name_of_nearest_inner_edge"] = joined[unique_shape_names_column] - - if "nearest_inner_edge_grouped" in modes: - print(f"Calculating 'nearest_inner_edge_grouped' distances'") - for name, group in gdf_polygons.groupby(shape_names_column): - hole_lines = [] - hole_labels = [] - - for idx, row in group.iterrows(): - base = row[unique_shape_names_column] - for i, hole in enumerate(_extract_interior_lines(row.geometry), start=1): - hole_lines.append(LineString(hole.coords)) - hole_labels.append(f"{base}-hole{i}") - - if not hole_lines: - continue - - hole_gdf = gpd.GeoDataFrame({ - "geometry": hole_lines, - "hole_name": hole_labels - }, crs=gdf.crs) - - joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") - - adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um - adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{name}"] = joined["hole_name"] - - if "nearest_inner_edge_grouped_unique" in modes: - print(f"Calculating 'nearest_inner_edge_grouped_unique' distances'") - for idx, row in gdf_polygons.iterrows(): - base = row[unique_shape_names_column] - holes = _extract_interior_lines(row.geometry) - - if not holes: - continue - - hole_geoms = [LineString(hole.coords) for hole in holes] - hole_names = [f"{base}-hole{i+1}" for i in range(len(holes))] - - hole_gdf = gpd.GeoDataFrame({ - "geometry": hole_geoms, - "hole_name": hole_names - }, crs=gdf.crs) - - joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") - - adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{base}"] = joined["dist"].to_numpy() * pixel_size_um - adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{base}"] = joined["hole_name"] - - if "all_inner_edges" in modes and not holes_gdf.empty: - print(f"Calculating 'all_inner_edges' distances'") - for _, hole in holes_gdf.iterrows(): - hole_name = hole[unique_shape_names_column] - col = f"{output_name}distance_to_inner_edge_of_{hole_name}" - adata.obs[col] = pts.distance(hole.geometry).to_numpy() * pixel_size_um - - # Centroids - centroids_df = gdf_polygons.reset_index(drop=True)[[unique_shape_names_column]].copy() - centroids_df["geometry"] = gdf_polygons.geometry.centroid.reset_index(drop=True) - - centroids_gdf = gpd.GeoDataFrame( - centroids_df, - geometry="geometry", - crs=gdf.crs - ) - - if "nearest_centroid" in modes: - print(f"Calculating 'nearest_centroid' distances'") - joined = gpd.sjoin_nearest( - pts_gdf, centroids_gdf, - how="left", - distance_col="dist" - ) - adata.obs.loc[joined.index, f"{output_name}distance_to_nearest_centroid"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) - adata.obs.loc[joined.index, f"{output_name}name_of_nearest_centroid"] = ( - joined[unique_shape_names_column].to_numpy() - ) - - if "nearest_centroid_grouped" in modes: - print(f"Calculating 'nearest_centroid_grouped' distances'") - for name, group in gdf_polygons.groupby(shape_names_column): - centroids = group.geometry.centroid - labels = group[unique_shape_names_column].tolist() - - centroid_gdf = gpd.GeoDataFrame({ - "geometry": centroids, - unique_shape_names_column: labels - }, crs=gdf.crs) - - joined = gpd.sjoin_nearest(pts_gdf, centroid_gdf, how="left", distance_col="dist") - - adata.obs[f"{output_name}distance_to_nearest_centroid_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um - adata.obs[f"{output_name}name_of_nearest_centroid_of_{name}"] = joined[unique_shape_names_column] - - if "all_centroids" in modes: - print(f"Calculating 'all_centroids' distances'") - for idx, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_centroid_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.centroid).to_numpy() * pixel_size_um - - # Point - if has_points: - if "nearest_point" in modes: - print(f"Calculating 'nearest_point' distances'") - joined = gpd.sjoin_nearest( - pts_gdf, gdf_points, how="left", distance_col="dist" - ) - adata.obs[f"{output_name}distance_to_nearest_point"] = joined["dist"].to_numpy() * pixel_size_um - adata.obs[f"{output_name}name_of_nearest_point"] = joined[f"{shape_names_column}-unique"] - - if "nearest_point_grouped" in modes: - print(f"Calculating 'nearest_point_grouped' distances'") - for name, group in gdf_points.groupby(shape_names_column): - joined = gpd.sjoin_nearest(pts_gdf, group, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_point_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um - adata.obs[f"{output_name}name_of_nearest_point_of_{name}"] = joined[f"{shape_names_column}-unique"] - - if "all_points" in modes: - print(f"Calculating 'all_points' distances'") - for _, row in gdf_points.iterrows(): - adata.obs[f"{output_name}distance_to_point_{row[unique_shape_names_column]}"] = pts.distance(row.geometry).to_numpy() * pixel_size_um - - - # Add table layer - sdata = add_table_layer( - sdata, - adata = adata, - output_layer = output_table_layer, - region = adata.obs[_REGION_KEY].cat.categories.to_list(), - overwrite = overwrite, - ) - - return sdata \ No newline at end of file +from typing import Literal + +import geopandas as gpd +import pandas as pd +from shapely.geometry import LineString, MultiLineString, MultiPolygon, Point, Polygon +from spatialdata import SpatialData + +from harpy.table._table import add_table_layer +from harpy.utils._keys import _REGION_KEY + + +def assign_cells_to_shapes( + sdata: SpatialData, + shapes_layer: str, + table_layer: str, + output_table_layer: str, + shape_names_column: str = "name", + unique_shape_names_column: str = "name-unique", + output_column: str = None, + mode: Literal["original_names", "unique_names", "both"] = "original_names", + create_column_per_shape: bool = False, + overlap_tolerance: float = 0.1, + spatial_key: str = "spatial", + xy_columns: tuple = None, + overwrite: bool = False, +): + """ + Assign cells to polygons in a shapes layer and update the sdata table layer. It is recommended to run `hp.sh.prep_region_annotations` first. + + Parameters + ---------- + sdata + The SpatialData object containing the input table layer and shapes layer. + shapes_layer + The shapes layer in `sdata.shapes` to use as input. + table_layer + The table layer in `sdata.tables` to use as input. + output_table_layer + The output table layer in `sdata.tables` to which the updated table layer will be written. + shape_names_column + Column name in shapes layer containing geometry names. + unique_shape_names_column + Column name in shapes layer containing unique geometry names. + output_column + Name of the output column in `sdata.tables[table_layer].obs` to store the shape name of the geometries a cell was found in (if `create_column_per_shape` is False). + For the `unique_names` mode, the output will be stored in `{output_column}-unique` in `sdata.tables[table_layer].obs` (if `create_column_per_shape` is False). + If create_column_per_shape is True and output_column is not None, then column names will be in the format `{output_colum}-{shape_name}`. + If create_column_per_shape is True and output_column is None, then column names will be in the format `{shape_name}`. + mode + When set to `original_names`, original polygon names from `shape_names_column` will be used. + When set to `unique_names`, unique polygon names from `unique_shape_names_column` will be used. Use `both`, to run both modes at the same time. + create_column_per_shape + If True, create one column (named according to the shape names) per shape indicating whether a cell is located inside it. + overlap_tolerance + Tolerance for detecting overlapping polygons (area units of geometry CRS). + spatial_key + Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. + xy_columns + Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + overwrite + If True, overwrites the `output_table_layer` and/or `output_shapes_layer` if it already exists in `sdata`. + + Notes + ----- + - Only `Polygon` and `MultiPolygon` geometries are supported. Non-polygon geometries (e.g., `Point`, `LineString`) are skipped. + + Returns + ------- + Modified `sdata` object with updated table layer. + """ + if not output_column and not create_column_per_shape: + raise ValueError("Specify `output_column` or set `create_column_per_shape=True`.") + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Create copy of table layer + adata = sdata.tables[table_layer].copy() + + # Filter out geometries that are not Polygon or MultiPolygon + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) + skipped = len(gdf) - supported_mask.sum() + if skipped > 0: + print(f"Skipped {skipped} non-polygon geometries in {shapes_layer}.") + if skipped == len(gdf): + print("No supported geometries (Polygon, MultiPolygon) found.") + return sdata + + gdf = gdf[supported_mask].copy().reset_index(drop=True) + + # Check for overlapping polygons + total_area = gdf.geometry.area.sum() + union_area = gdf.geometry.unary_union.area + if total_area - union_area > overlap_tolerance and not create_column_per_shape: + raise ValueError( + f"Overlapping polygons detected in {shapes_layer}. Correct polygons or use create_column_per_shape." + ) + elif total_area - union_area > 0 and not create_column_per_shape: + print(f"Overlaps detected (Δ={total_area - union_area:.3f}), below tolerance threshold {overlap_tolerance}.") + + # Get cell coordinates + if xy_columns is not None: + x_col, y_col = xy_columns + coords = adata.obs[[x_col, y_col]].to_numpy() + else: + if spatial_key not in adata.obsm: + raise KeyError(f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + coords = adata.obsm[spatial_key] + if coords.shape[1] != 2: + raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") + + # Function to assign cell to a polygon + def _assign_region(x, y, gdf, column, sindex): + point = Point(x, y) + + # Quick bounding box search to prefilter the cells + candidate_idx = list(sindex.intersection(point.bounds)) + if not candidate_idx: + return None + + # Slower point‑in‑polygon search + candidate_gdf = gdf.iloc[candidate_idx] + match = candidate_gdf[candidate_gdf.contains(point)] + + if match.empty: + return None + + return match[column].iloc[0] + + # Assign cells to polygons + if create_column_per_shape: + # Collect all relevant name columns based on mode + name_columns = [] + if mode in ("original_names", "both"): + name_columns.append(shape_names_column) + if mode in ("unique_names", "both"): + name_columns.append(unique_shape_names_column) + + # Collect all unique names across both columns + name_to_column = {} + for name_column in name_columns: + for name in gdf[name_column].dropna().astype(str).unique(): + if name not in name_to_column: + name_to_column[name] = name_column + + # Create column per shape name + for shape_name, name_column in name_to_column.items(): + col_name = f"{output_column}-{shape_name}" if output_column else shape_name + + gdf_shape = gdf[gdf[name_column] == shape_name] + if gdf_shape.empty: + continue + + sidx_shape = gdf_shape.sindex # Build spatial index + assigned = [_assign_region(x, y, gdf_shape, name_column, sidx_shape) for x, y in coords] + adata.obs[col_name] = assigned + + else: + # One output column per mode + run_assign_dict = {} + if mode in ("original_names", "both"): + run_assign_dict[output_column] = shape_names_column + if mode in ("unique_names", "both"): + run_assign_dict[f"{output_column}-unique"] = unique_shape_names_column + + for output_column_name, name_column in run_assign_dict.items(): + # Assign one shape name per cell (to be avoided with overlapping regions) + sidx = gdf.sindex # Build spatial index + assigned = [_assign_region(x, y, gdf, name_column, sidx) for x, y in coords] + adata.obs[output_column_name] = assigned + + # Add table layer + sdata = add_table_layer( + sdata, + adata=adata, + output_layer=output_table_layer, + region=adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite=overwrite, + ) + + return sdata + + +def compute_distance_to_shapes( + sdata: SpatialData, + shapes_layer: str, + table_layer: str, + output_table_layer: str, + shape_names_column: str = "name", + unique_shape_names_column: str = "name-unique", + output_name: str = None, + pixel_size_um: float = 1.0, + modes: list[ + Literal[ + "nearest_edge", + "nearest_edge_grouped", + "all_edges", + "nearest_outer_edge", + "nearest_outer_edge_grouped", + "all_outer_edges", + "nearest_inner_edge", + "nearest_inner_edge_grouped", + "nearest_inner_edge_grouped_unique", + "all_inner_edges", + "nearest_centroid", + "nearest_centroid_grouped", + "all_centroids", + "nearest_point", + "nearest_point_grouped", + "all_points", + ] + ] = None, + spatial_key: str = "spatial", + xy_columns: tuple = None, + overwrite: bool = False, +): + """ + Compute distances from cells to polygons in a shapes layer and update the sdata table layer. + It is recommended to run `hp.sh.prep_region_annotations` first. + + Parameters + ---------- + sdata + The SpatialData object containing the input table layer and shapes layer. + shapes_layer + The shapes layer in `sdata.shapes` to use as input. + table_layer + The table layer in `sdata.tables` to use as input. + output_table_layer + The output table layer in `sdata.tables` to which the updated table layer will be written. + shape_names_column + Column name in shapes layer containing geometry names. + unique_shape_names_column + Column name in shapes layer containing unique geometry names. + output_name + Prefix for new distance columns in `.obs`. If None, no prefix is added. + pixel_size_um + Scale factor to convert distances to microns. Defaults to 1 (i.e. distances are in pixels). + modes + Which distance features to calculate. Options include: + + Edge distances (For Polygon and MultiPolygon geometries) + - `"nearest_edge"`: Distance to the nearest edge (outer or inner) of any polygon. + Creates `{output_name}-distance_to_nearest_edge` and `{output_name}-name_of_nearest_edge` columns. + - `"nearest_edge_grouped"`: For each group of shapes with the same name, compute the distance to the edge that is nearest. + Creates `{output_name}-distance_to_nearest_edge_of_` and `{output_name}-name_of_nearest_edge_of_` columns. + - `"all_edges"`: For each individual polygon, compute the distance to its edge. + Creates `{output_name}-distance_to_edge_of_` column. + + Outer edge distances (For Polygon and MultiPolygon geometries) + - `"nearest_outer_edge"`: Distance to the nearest outer edge of any polygon. + Creates `{output_name}-distance_to_nearest_outer_edge` and `{output_name}-name_of_nearest_outer_edge` columns. + - `"nearest_outer_edge_grouped"`: For each group of shapes with the same name, compute the distance to the outer edge that is nearest. + Creates `{output_name}-distance_to_nearest_outer_edge_of_` and `{output_name}-name_of_nearest_outer_edge_of_` columns. + - `"all_outer_edges"`: For each individual polygon, compute the distance to its outer edge. + Creates `{output_name}-distance_to_outer_edge_of_` column. + + Inner edge (hole) distances (For Polygon and MultiPolygon geometries) + - `"nearest_inner_edge"`: Distance to the nearest interior edge (“hole”) of any polygon. + Creates `{output_name}-distance_to_nearest_inner_edge` and `{output_name}-name_of_nearest_inner_edge` columns. + - `"nearest_inner_edge_grouped"`: For each group of shapes with the same name, compute the distance to the nearest inner edge of that group. + Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. + - `"nearest_inner_edge_grouped_unique"`: For each individual polygon, compute the distance to the nearest inner edge of that polygon. + Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. + - `"all_inner_edges"`: For all holes, compute the distance to each inner edge. + Creates `{output_name}-distance_to_inner_edge_of_-hole` column. + + Centroid distances (For Polygon and MultiPolygon geometries) + - `"nearest_centroid"`: Distance to the nearest centroid of any polygon. + Creates `{output_name}-distance_to_nearest_centroid` and `{output_name}-name_of_nearest_centroid` columns. + - `"nearest_centroid_grouped"`: For each group of shapes with the same name, compute the distance to the centroid that is nearest. + Creates `{output_name}-distance_to_nearest_centroid_of_` and `{output_name}-name_of_nearest_centroid_of_` columns. + - `"all_centroids"`: For each individual polygon, compute the distance to its centroid. + Creates `{output_name}-distance_to_centroid_of_` column. + + Point distances (For Point geometries) + - `"nearest_point"`: Distance to the nearest point. + Creates `{output_name}-distance_to_nearest_point` and `{output_name}-name_of_nearest_point` columns. + - `"nearest_point_grouped"`: For each group of points with the same name, compute the nearest distance. + Creates `{output_name}-distance_to_nearest_point_of_` and `{output_name}-name_of_nearest_point_of_` columns. + - `"all_points"`: For each individual Point, compute the distance to point coordinates. + Creates `{output_name}-distance_to_point_` column. + + spatial_key + Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. + xy_columns + Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + overwrite + If True, overwrites the `output_table_layer` if it already exists in `sdata`. + + Notes + ----- + - Only `Polygon`, `MultiPolygon` and `Point` geometries are supported. Other geometries (e.g., `LineString`, `MultiPoint`) are skipped. + + Returns + ------- + Modified `sdata` object with updated table layer. + """ + if modes is None: + modes = ["all_edges"] + if output_name is not None: + output_name = f"{output_name}-" + elif output_name is None: + output_name = "" + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Create copy of table layer + adata = sdata.tables[table_layer].copy() + + # Filter out geometries that are not Polygon, MultiPolygon or Point + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon, Point))) + skipped = len(gdf) - supported_mask.sum() + if skipped > 0: + print(f"Skipped {skipped} geometries in {shapes_layer} that are not Polygon, MultiPolygon or Point.") + + gdf = gdf[supported_mask].copy().reset_index(drop=True) + + # Separate gdf by type + gdf_polygons = gdf[gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])].copy() + gdf_points = gdf[gdf.geometry.geom_type == "Point"].copy() + + has_polygons = not gdf_polygons.empty + has_points = not gdf_points.empty + + if not has_polygons and not has_points: + print("No supported geometries (Polygon, MultiPolygon, Point) found.") + return sdata + + # Get cell coordinates + if xy_columns is not None: + x_col, y_col = xy_columns + coords = adata.obs[[x_col, y_col]].to_numpy() + else: + if spatial_key not in adata.obsm: + raise KeyError(f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + coords = adata.obsm[spatial_key] + if coords.shape[1] != 2: + raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") + + # Build point GeoDataFrame + pts = gpd.GeoSeries([Point(x, y) for x, y in coords], crs=gdf.crs) + + pts_gdf = gpd.GeoDataFrame({"geometry": pts}, crs=gdf.crs).set_index(adata.obs.index) + + # Polygon and MultiPolygon + if has_polygons: + # Extract exterior and interior boundaries + def _extract_exterior_lines(geom): + if isinstance(geom, Polygon): + return [geom.exterior] + elif isinstance(geom, MultiPolygon): + return [poly.exterior for poly in geom.geoms] + return [] + + def _extract_interior_lines(geom): + if isinstance(geom, Polygon): + return list(geom.interiors) + elif isinstance(geom, MultiPolygon): + lines = [] + for poly in geom.geoms: + lines.extend(poly.interiors) + return lines + return [] + + names = gdf_polygons[unique_shape_names_column].tolist() + exteriors = [MultiLineString(_extract_exterior_lines(geom)) for geom in gdf_polygons.geometry] + interiors = [MultiLineString(_extract_interior_lines(geom)) for geom in gdf_polygons.geometry] + + ext_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=exteriors, crs=gdf.crs) + int_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=interiors, crs=gdf.crs) + + # Edges (outer and inner) + if "nearest_edge" in modes: + print("Calculating 'nearest_edge' distances'") + all_edges_gdf = gpd.GeoDataFrame(pd.concat([ext_gdf, int_gdf], ignore_index=True), crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, all_edges_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_edge"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_edge"] = joined[unique_shape_names_column] + + if "nearest_edge_grouped" in modes: + print("Calculating 'nearest_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + edges = [] + labels = [] + for _idx, row in group.iterrows(): + edge_lines = _extract_exterior_lines(row.geometry) + _extract_interior_lines(row.geometry) + edges.extend(edge_lines) + labels.extend([row[unique_shape_names_column]] * len(edge_lines)) + + edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_edge_of_{name}"] = joined[unique_shape_names_column] + + if "all_edges" in modes: + print("Calculating 'all_edges' distances'") + for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_edge_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.boundary).to_numpy() * pixel_size_um + ) + + # Outer edges + if "nearest_outer_edge" in modes: + print("Calculating 'nearest_outer_edge' distances'") + joined = gpd.sjoin_nearest(pts_gdf, ext_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_outer_edge"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_outer_edge"] = joined[unique_shape_names_column] + + if "nearest_outer_edge_grouped" in modes: + print("Calculating 'nearest_outer_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + edges = [] + labels = [] + for _idx, row in group.iterrows(): + edge_lines = _extract_exterior_lines(row.geometry) + edges.extend(edge_lines) + labels.extend([row[unique_shape_names_column]] * len(edge_lines)) + + edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_outer_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_outer_edge_of_{name}"] = joined[unique_shape_names_column] + + if "all_outer_edges" in modes: + print("Calculating 'all_outer_edges' distances'") + for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_outer_edge_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.exterior).to_numpy() * pixel_size_um + ) + + # Inner edges + hole_geoms = [] + hole_names = [] + + for _, row in gdf_polygons.reset_index(drop=True).iterrows(): + base = row[unique_shape_names_column] + geom = row.geometry + + holes = _extract_interior_lines(geom) + + if len(holes) == 0: + continue + + if len(holes) == 1: + hole = holes[0] + hole_geoms.append(LineString(hole.coords)) + hole_names.append(base) + + else: + for i, hole in enumerate(holes, start=1): + hole_geoms.append(LineString(hole.coords)) + hole_names.append(f"{base}-hole{i}") + + holes_gdf = gpd.GeoDataFrame({unique_shape_names_column: hole_names}, geometry=hole_geoms, crs=gdf.crs) + + if "nearest_inner_edge" in modes and not holes_gdf.empty: + print("Calculating 'nearest_inner_edge' distances'") + joined = gpd.sjoin_nearest(pts_gdf, holes_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_inner_edge"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_inner_edge"] = joined[unique_shape_names_column] + + if "nearest_inner_edge_grouped" in modes: + print("Calculating 'nearest_inner_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + hole_lines = [] + hole_labels = [] + + for _idx, row in group.iterrows(): + base = row[unique_shape_names_column] + for i, hole in enumerate(_extract_interior_lines(row.geometry), start=1): + hole_lines.append(LineString(hole.coords)) + hole_labels.append(f"{base}-hole{i}") + + if not hole_lines: + continue + + hole_gdf = gpd.GeoDataFrame({"geometry": hole_lines, "hole_name": hole_labels}, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{name}"] = joined["hole_name"] + + if "nearest_inner_edge_grouped_unique" in modes: + print("Calculating 'nearest_inner_edge_grouped_unique' distances'") + for _idx, row in gdf_polygons.iterrows(): + base = row[unique_shape_names_column] + holes = _extract_interior_lines(row.geometry) + + if not holes: + continue + + hole_geoms = [LineString(hole.coords) for hole in holes] + hole_names = [f"{base}-hole{i + 1}" for i in range(len(holes))] + + hole_gdf = gpd.GeoDataFrame({"geometry": hole_geoms, "hole_name": hole_names}, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{base}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{base}"] = joined["hole_name"] + + if "all_inner_edges" in modes and not holes_gdf.empty: + print("Calculating 'all_inner_edges' distances'") + for _, hole in holes_gdf.iterrows(): + hole_name = hole[unique_shape_names_column] + col = f"{output_name}distance_to_inner_edge_of_{hole_name}" + adata.obs[col] = pts.distance(hole.geometry).to_numpy() * pixel_size_um + + # Centroids + centroids_df = gdf_polygons.reset_index(drop=True)[[unique_shape_names_column]].copy() + centroids_df["geometry"] = gdf_polygons.geometry.centroid.reset_index(drop=True) + + centroids_gdf = gpd.GeoDataFrame(centroids_df, geometry="geometry", crs=gdf.crs) + + if "nearest_centroid" in modes: + print("Calculating 'nearest_centroid' distances'") + joined = gpd.sjoin_nearest(pts_gdf, centroids_gdf, how="left", distance_col="dist") + adata.obs.loc[joined.index, f"{output_name}distance_to_nearest_centroid"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs.loc[joined.index, f"{output_name}name_of_nearest_centroid"] = joined[ + unique_shape_names_column + ].to_numpy() + + if "nearest_centroid_grouped" in modes: + print("Calculating 'nearest_centroid_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + centroids = group.geometry.centroid + labels = group[unique_shape_names_column].tolist() + + centroid_gdf = gpd.GeoDataFrame({"geometry": centroids, unique_shape_names_column: labels}, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, centroid_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_centroid_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_centroid_of_{name}"] = joined[unique_shape_names_column] + + if "all_centroids" in modes: + print("Calculating 'all_centroids' distances'") + for _idx, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_centroid_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.centroid).to_numpy() * pixel_size_um + ) + + # Point + if has_points: + if "nearest_point" in modes: + print("Calculating 'nearest_point' distances'") + joined = gpd.sjoin_nearest(pts_gdf, gdf_points, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_point"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_point"] = joined[f"{shape_names_column}-unique"] + + if "nearest_point_grouped" in modes: + print("Calculating 'nearest_point_grouped' distances'") + for name, group in gdf_points.groupby(shape_names_column): + joined = gpd.sjoin_nearest(pts_gdf, group, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_point_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_point_of_{name}"] = joined[f"{shape_names_column}-unique"] + + if "all_points" in modes: + print("Calculating 'all_points' distances'") + for _, row in gdf_points.iterrows(): + adata.obs[f"{output_name}distance_to_point_{row[unique_shape_names_column]}"] = ( + pts.distance(row.geometry).to_numpy() * pixel_size_um + ) + + # Add table layer + sdata = add_table_layer( + sdata, + adata=adata, + output_layer=output_table_layer, + region=adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite=overwrite, + ) + + return sdata diff --git a/src/harpy/table/_table.py b/src/harpy/table/_table.py index afa90d5d..e91318b7 100644 --- a/src/harpy/table/_table.py +++ b/src/harpy/table/_table.py @@ -1,13 +1,12 @@ from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Iterable, Sequence import numpy as np import pandas as pd from anndata import AnnData from spatialdata import SpatialData from spatialdata.models import TableModel -from typing import Optional, Union, Sequence from harpy.shape._shape import filter_shapes_layer from harpy.table._manager import TableLayerManager @@ -269,18 +268,17 @@ def filter_on_size( ------- The updated SpatialData object. """ - sdata = filter_numerical( - sdata = sdata, - labels_layer = labels_layer, - table_layer = table_layer, - output_layer = output_layer, - numerical_column = cellsize_key, - min_value = min_size, - max_value = max_size, - update_shapes_layers = update_shapes_layers, - prefix_filtered_shapes_layer = prefix_filtered_shapes_layer, - overwrite = overwrite, + sdata=sdata, + labels_layer=labels_layer, + table_layer=table_layer, + output_layer=output_layer, + numerical_column=cellsize_key, + min_value=min_size, + max_value=max_size, + update_shapes_layers=update_shapes_layers, + prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, + overwrite=overwrite, ) return sdata @@ -301,7 +299,7 @@ def filter_numerical( """Returns the updated SpatialData object. All cells with a size outside of the min and max size range are removed using the `numerical_column` in `.obs` (cells are kept if min_value ≤ numerical_column ≤ max_value). - + Parameters ---------- sdata @@ -337,12 +335,10 @@ def filter_numerical( process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) adata = process_table_instance._get_adata().copy() start = adata.shape[0] - + if numerical_column not in adata.obs.columns: - raise ValueError( - f"Column '{numerical_column}' not found in '{table_layer}.obs'. " - ) - + raise ValueError(f"Column '{numerical_column}' not found in '{table_layer}.obs'. ") + if not np.issubdtype(adata.obs[numerical_column].dtype, np.number): raise ValueError( f"Column '{numerical_column}' must be numeric, but dtype is {adata.obs[numerical_column].dtype}." @@ -350,7 +346,7 @@ def filter_numerical( # Filter cells based on min and max values mask = pd.Series(True, index=adata.obs.index) - + if min_value is not None: below = (adata.obs[numerical_column] < min_value).sum() log.info(f"Removed {below} cells below {min_value}.") @@ -359,7 +355,7 @@ def filter_numerical( above = (adata.obs[numerical_column] > max_value).sum() log.info(f"Removed {above} cells above {max_value}.") mask &= adata.obs[numerical_column] <= max_value - + adata = adata[mask, :].copy() sdata = add_table_layer( @@ -393,15 +389,15 @@ def filter_categorical( table_layer: str, output_layer: str, categorical_column: str, - include_values: Optional[Union[str, Sequence[str]]] = None, - exclude_values: Optional[Union[str, Sequence[str]]] = None, + include_values: str | Sequence[str] | None = None, + exclude_values: str | Sequence[str] | None = None, update_shapes_layers: bool = True, prefix_filtered_shapes_layer: str = "filtered", overwrite: bool = False, ) -> SpatialData: """Filter cells based on categorical values in `.obs`. - Removes or keeps cells based on specific values in a categorical column of + Removes or keeps cells based on specific values in a categorical column of `sdata.tables[table_layer].obs`. Parameters @@ -444,7 +440,7 @@ def filter_categorical( process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) adata = process_table_instance._get_adata().copy() start = adata.shape[0] - + if categorical_column not in adata.obs.columns: raise ValueError(f"Column '{categorical_column}' not found in '{table_layer}.obs'.") From 57cc6a4fcb1d3cfaec382f2fb977fdb84e96afbb Mon Sep 17 00:00:00 2001 From: julienmortier Date: Mon, 3 Nov 2025 15:53:35 +0100 Subject: [PATCH 3/5] transformation bug fix transformation bug fix --- src/harpy/table/__init__.pyi | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 src/harpy/table/__init__.pyi diff --git a/src/harpy/table/__init__.pyi b/src/harpy/table/__init__.pyi old mode 100755 new mode 100644 From 3ad647bf7065bf0b44b159aa667564146bf6ab11 Mon Sep 17 00:00:00 2001 From: julienmortier Date: Tue, 18 Nov 2025 17:24:11 +0100 Subject: [PATCH 4/5] updates --- src/harpy/shape/__init__.pyi | 10 +- src/harpy/shape/_filters.py | 674 ++++++++++++++++--------- src/harpy/shape/_shape.py | 79 +-- src/harpy/table/__init__.pyi | 4 +- src/harpy/table/_region_annotations.py | 384 +++++++------- src/harpy/table/_table.py | 196 ++++--- 6 files changed, 798 insertions(+), 549 deletions(-) diff --git a/src/harpy/shape/__init__.pyi b/src/harpy/shape/__init__.pyi index 7546a934..3a08db60 100644 --- a/src/harpy/shape/__init__.pyi +++ b/src/harpy/shape/__init__.pyi @@ -1,6 +1,6 @@ from ._cell_expansion import create_voronoi_boundaries -from ._filters import filter_by_morphology, filter_by_shapes -from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, prep_region_annotations, vectorize +from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, vectorize, prep_region_annotations +from ._filters import morphological_features, filter_shapes_numerical, filter_shapes_categorical, filter_shapes_by_shapes __all__ = [ "add_shapes_layer", @@ -9,6 +9,8 @@ __all__ = [ "intersect_rectangles", "vectorize", "prep_region_annotations", - "filter_by_morphology", - "filter_by_shapes", + "morphological_features", + "filter_shapes_numerical", + "filter_shapes_categorical", + "filter_shapes_by_shapes", ] diff --git a/src/harpy/shape/_filters.py b/src/harpy/shape/_filters.py index f7f8a5c6..f7aeeb56 100755 --- a/src/harpy/shape/_filters.py +++ b/src/harpy/shape/_filters.py @@ -1,276 +1,232 @@ +from typing import Optional, Dict, Union, List, Literal, Sequence +from spatialdata import SpatialData import geopandas as gpd -import numpy as np import pandas as pd -from shapely.geometry import MultiPolygon, Polygon -from spatialdata import SpatialData - +import numpy as np +from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString +from shapely.ops import unary_union from harpy.shape._shape import add_shapes_layer - -def filter_by_morphology( +from spatialdata.transformations import get_transformation + +from harpy.utils.pylogger import get_pylogger +log = get_pylogger(__name__) + +# Extract exterior and interior boundaries +def _extract_exterior_lines(geom): + if isinstance(geom, Polygon): + return [geom.exterior] + elif isinstance(geom, MultiPolygon): + return [poly.exterior for poly in geom.geoms] + return [] + +def _extract_interior_lines(geom): + if isinstance(geom, Polygon): + return list(geom.interiors) + elif isinstance(geom, MultiPolygon): + lines = [] + for poly in geom.geoms: + lines.extend(poly.interiors) + return lines + return [] + +def morphological_features( sdata: SpatialData, shapes_layer: str, output_shapes_layer: str, - filters: dict[str, tuple] | None = None, - keep_unsupported: bool = False, - calculate_all_features: bool = False, - calculate_all_features_grouped: bool = False, shape_names_column: str = "name", + grouped_features: bool = False, pixel_size_um: float = 1.0, - overwrite: bool = True, -): + overwrite: bool = False, +) -> SpatialData: """ - Filter polygons in a SpatialData shapes layer based on morphological features. - It is recommended to run `hp.sh.prep_region_annotations` first when filtering region annotations that - may contain multipolygons, unnamed annotations, etc. + Compute morphological features for polygons in a shapes layer. Only Polygon and MultiPolygon are supported. + It is recommended to run `hp.sh.prep_region_annotations` first when computing features for region annotations that + may contain multipolygons, point geometries, unnamed annotations, etc. + + - `area`: Area of the polygon (px² or µm²). + - `perimeter`: Perimeter length (px or µm). + - `equivalent_diameter`: Diameter of a circle with the same area (px or µm). + - `convex_area`: Area of convex hull (px² or µm²). + - `convex_perimeter`: Perimeter of convex hull (px or µm). + - `circularity`: 4π * area / perimeter². + → 1 for a perfect circle; lower = irregular shape. + - `compactness`: perimeter² / area. + - `solidity`: area / convex_area. + → Low values mean concave or fragmented shapes. + - `convexity`: convex_perimeter / perimeter. + → 1 for perfectly convex shapes. Lower for rough or spiky boundaries. + - `centroid_x`: X-coordinate of the polygon centroid (px). + - `centroid_y`: Y-coordinate of the polygon centroid (px). + - `centroid_dif`: Distance between polygon and convex hull centroids normalized by the polygon area. + → Captures off-centered concavity or asymmetry. + - `major_axis_length`: Length of the longest side of the minimum rotated bounding box (px or µm). + - `minor_axis_length`: Length of the shortest side of the minimum rotated bounding box (px or µm). + - `major_minor_axis_ratio`: major_axis_length / minor_axis_length. + - `num_vertices`: Number of vertices along exterior boundaries. + - `boundary_complexity`: num_vertices / perimeter. + → Normalized measure of boundary irregularity. + - `num_holes`: Count of internal holes. + - `hole_area`: Area covered by internal holes (px² or µm²) Parameters ---------- sdata - SpatialData object containing the input shapes layer. + SpatialData object containing the shapes layer. shapes_layer - Name of the shapes layer in `sdata.shapes` containing polygons. + Name of the input shapes layer. output_shapes_layer - Name of the shapes layer to store the filtered polygons. - filters - Dictionary specifying filtering thresholds. - Each entry should be of the form: - `{ "feature_name": (min_value, max_value) }` - where `None` can be used to skip a bound, e.g.: - `{ "area": (50, 200), "convexity": (None, 0.8) }` - - Supported features: - - `"area"`: Area of the polygon (px²). - → Use to filter out very small or very large polygons. - - `"perimeter"`: Perimeter length (px). - → Use to detect irregular boundaries or fragmented shapes. - - `"circularity"`: 4π * area / perimeter². - → 1 for a perfect circle; lower = irregular shape. - - `"compactness"`: perimeter² / area. - → Shape compactness measure. - - `"convex_area"`: Area of convex hull (px²). - → Useful with solidity and convexity. - - `"solidity"`: area / convex_area. - → Low values mean concave or fragmented shapes. - - `"convexity"`: convex_perimeter / perimeter. - → 1 for perfectly convex shapes. Lower for rough or spiky boundaries. - - `"centroid_dif"`: Distance between polygon and convex hull centroids normalized by the polygon area. - → Captures off-centered concavity or asymmetry. - - `"major_axis_length"`: Length of the longest side of the minimum rotated bounding box (px). - → Use to filter very long or very short shapes. - - `"minor_axis_length"`: Length of the shortest side of the minimum rotated bounding box (px). - → Use to filter very wide or very skinny shapes. - - `"major_minor_axis_ratio"`: major_axis_length / minor_axis_length. - → High values indicate elongated polygons; ~1 for round shapes. - - You can use grouped filters by suffixing features with `-grouped`. - Grouped filters merge polygons sharing the same name in `shape_names_column` before computing morphological features and - can be used interchangibly with regular filters. - - keep_unsupported - Only Polygon and MultiPolygon are supported. Set keep_unsupported to True to skip any unssupported geometries types (e.g. Point), but - keep them in the output_shapes_layer. Set to False to remove them from output_shapes_layer. - calculate_all_features - If True, computes all supported morphological features regardless of which ones are used in `filters` and saves them to `output_shapes_layer`. - If False, only computes and saves morphological features needed for `filters`. - calculate_all_features_grouped - If True, computes all morphological features for merged group geometries (by {shape_names_column}), regardless - of which ones are used in grouped filters. + Name of the output shapes layer to store results. shape_names_column Column name in shapes layer containing geometry names. Required when using grouped filters. - + grouped_features + If True, also computes grouped morphological features by dissolving polygons sharing + the same name in `shape_names_column`. pixel_size_um - Scale factor to convert geometric measurements from pixel units to microns. - - Applied to: - * "area" and "convex_area" → scaled by (pixel_size_um)² + Scale factor to convert geometric measurements from pixel to micron units. + - Applied to: * "perimeter", "major_axis_length", and "minor_axis_length" → scaled by (pixel_size_um) - - Dimensionless ratios (e.g., "circularity", "compactness", "solidity", "convexity", "major_minor_axis_ratio") + * "area" and "convex_area" → scaled by (pixel_size_um)² + - Dimensionless ratios (e.g., "circularity", "compactness", "solidity", "convexity", "centroid_dif", "major_minor_axis_ratio") are unaffected by scaling but are computed using the scaled geometric quantities. + - XY-coordinates are kept in original units. Defaults to 1.0 (no scaling, i.e. units remain in pixels). - Note that this affects the min/max values that need to be specified in `filters`. overwrite - Whether to overwrite an existing shapes layer. + Whether to overwrite existing shapes layer. Returns ------- - SpatialData object with updated shapes layer containing only the filtered polygons. + SpatialData + Object with updated shapes layer containing morphological feature columns. """ - # Get filters and split into individual and grouped filters - filters = filters or {} - - grouped_filters = {k.replace("-grouped", ""): v for k, v in filters.items() if k.endswith("-grouped")} - individual_filters = {k: v for k, v in filters.items() if not k.endswith("-grouped")} - - required_individual = set(individual_filters.keys()) - required_grouped = set(grouped_filters.keys()) - - if calculate_all_features: - required_individual.update( - [ - "area", - "perimeter", - "convex_area", - "circularity", - "compactness", - "solidity", - "convexity", - "centroid_dif", - "major_axis_length", - "minor_axis_length", - "major_minor_axis_ratio", - ] - ) - print("Calculating all supported morphological features (per polygon).") - - if calculate_all_features_grouped: - required_grouped.update( - [ - "area", - "perimeter", - "convex_area", - "circularity", - "compactness", - "solidity", - "convexity", - "centroid_dif", - "major_axis_length", - "minor_axis_length", - "major_minor_axis_ratio", - ] - ) - print("Calculating all supported morphological features (grouped).") - # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() - + # Filter out geometries that are not Polygon or MultiPolygon supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) unssuported = len(gdf) - supported_mask.sum() if unssuported > 0: - print(f"Found {unssuported} non-polygon geometries in {shapes_layer}.") - if unssuported == len(gdf): - print("No supported geometries (Polygon, MultiPolygon) found. Exiting...") + raise ValueError(f"Found {unssuported} non-polygon geometries in {shapes_layer}. Consider running `hp.sh.prep_region_annotations` first to clean up geometries.") + if unssuported == len(gdf): + log.warning(f"No supported geometries (Polygon, MultiPolygon) found. Skipping addition of shapes layer '{output_shapes_layer}' to sdata.") return sdata - if keep_unsupported: - gdf_skipped = gdf[~supported_mask].copy() - else: - gdf_skipped = gpd.GeoDataFrame(columns=gdf.columns, geometry=[]) - gdf = gdf[supported_mask].copy() - + + # Multipolygon check + n_multipolygons = gdf.geometry.apply(lambda x: isinstance(x, MultiPolygon)).sum() + if n_multipolygons > 0: + log.warning( + f"Detected {n_multipolygons} MultiPolygon geometries in '{shapes_layer}'. " + "Consider running `hp.sh.prep_region_annotations` first to split MultiPolygons before feature extraction." + ) + # Dissolve polygons by name grouped_gdf = None - if grouped_filters or calculate_all_features_grouped: + if grouped_features: if shape_names_column not in gdf.columns: - raise ValueError("Grouped filters require a 'name' column in the shapes layer.") - + raise ValueError(f"Grouped computation requires column '{shape_names_column}'.") grouped_gdf = gdf.dissolve(by=shape_names_column, as_index=False, aggfunc="first").copy() grouped_gdf["geometry"] = gdf.groupby(shape_names_column).geometry.apply(lambda x: x.unary_union).values - print(f"Polygons merged by {shape_names_column}, resulting in {len(grouped_gdf)} groups.") - - # Compute metrics - def _compute_morphological_features(gdf, required, pixel_size_um, suffix): - if any(filter in required for filter in ["area", "circularity", "compactness", "solidity", "centroid_dif"]): - gdf[f"area{suffix}"] = gdf.geometry.area * pixel_size_um**2 - - if any(filter in required for filter in ["perimeter", "circularity", "compactness", "convexity"]): - gdf[f"perimeter{suffix}"] = gdf.geometry.length * pixel_size_um - - if any(filter in required for filter in ["convex_area", "solidity"]): - gdf[f"convex_area{suffix}"] = gdf.geometry.convex_hull.area * pixel_size_um**2 - - if "circularity" in required: - gdf[f"circularity{suffix}"] = 4 * np.pi * gdf[f"area{suffix}"] / (gdf[f"perimeter{suffix}"] ** 2) - - if "compactness" in required: - gdf[f"compactness{suffix}"] = (gdf[f"perimeter{suffix}"] ** 2) / gdf[f"area{suffix}"] - - if "solidity" in required: - gdf[f"solidity{suffix}"] = gdf[f"area{suffix}"] / gdf[f"convex_area{suffix}"] - - if "centroid_dif" in required: - gdf[f"centroid_x{suffix}"] = gdf.geometry.centroid.x - gdf[f"centroid_y{suffix}"] = gdf.geometry.centroid.y - hull_centroids = gdf.geometry.convex_hull.centroid - gdf[f"centroid_dif{suffix}"] = np.sqrt( - (gdf[f"centroid_x{suffix}"] - hull_centroids.x) ** 2 - + (gdf[f"centroid_y{suffix}"] - hull_centroids.y) ** 2 - ) / np.sqrt(gdf[f"area{suffix}"]) - - if "convexity" in required: - gdf[f"convexity{suffix}"] = (gdf.geometry.convex_hull.length * pixel_size_um) / gdf[f"perimeter{suffix}"] - - # Get bounding box lengths from minimum rotated rectangle - if any(filter in required for filter in ["major_axis_length", "minor_axis_length", "major_minor_axis_ratio"]): - - def _rotated_rect_axes(geom): - try: - rect = geom.minimum_rotated_rectangle - x, y = rect.exterior.coords.xy - # Compute edge lengths - edges = np.sqrt(np.diff(x) ** 2 + np.diff(y) ** 2) - edges = np.sort(edges[:-1]) # drop duplicate closing edge - minor, major = edges[0], edges[1] - ratio = major / minor if minor > 0 else np.nan - return major, minor, ratio - except Exception: - return np.nan, np.nan, np.nan - - results = np.array([_rotated_rect_axes(g) for g in gdf.geometry]) - major_axis_length, minor_axis_length, major_minor_axis_ratio = results.T - if any(filter in required for filter in ["major_axis_length", "major_minor_axis_ratio"]): - gdf[f"major_axis_length{suffix}"] = major_axis_length * pixel_size_um - if any(filter in required for filter in ["minor_axis_length", "major_minor_axis_ratio"]): - gdf[f"minor_axis_length{suffix}"] = minor_axis_length * pixel_size_um - if "major_minor_axis_ratio" in required: - gdf[f"major_minor_axis_ratio{suffix}"] = major_minor_axis_ratio + log.info(f"Merged {len(gdf)} polygons into {len(grouped_gdf)} groups by '{shape_names_column}'.") + + # Compute features + def _compute_features(gdf: gpd.GeoDataFrame, pixel_size_um, suffix: str = ""): + # Base area & perimeter + gdf[f"area{suffix}"] = gdf.geometry.area * pixel_size_um**2 + gdf[f"perimeter{suffix}"] = gdf.geometry.length * pixel_size_um + + # Equivalent diameter + gdf[f"equivalent_diameter{suffix}"] = np.sqrt(4 * gdf[f"area{suffix}"] / np.pi) + + # Convex geometry + convex_hulls = gdf.geometry.convex_hull + gdf[f"convex_area{suffix}"] = convex_hulls.area * pixel_size_um**2 + gdf[f"convex_perimeter{suffix}"] = convex_hulls.length * pixel_size_um + + # Derived metrics + gdf[f"circularity{suffix}"] = 4 * np.pi * gdf[f"area{suffix}"] / (gdf[f"perimeter{suffix}"] ** 2) + gdf[f"compactness{suffix}"] = (gdf[f"perimeter{suffix}"] ** 2) / gdf[f"area{suffix}"] + gdf[f"solidity{suffix}"] = gdf[f"area{suffix}"] / gdf[f"convex_area{suffix}"] + gdf[f"convexity{suffix}"] = gdf[f"convex_perimeter{suffix}"] / gdf[f"perimeter{suffix}"] + + # Centroid-based metrics + gdf[f"centroid_x{suffix}"] = gdf.geometry.centroid.x + gdf[f"centroid_y{suffix}"] = gdf.geometry.centroid.y + hull_centroids = convex_hulls.centroid + gdf[f"centroid_dif{suffix}"] = np.sqrt( + (gdf[f"centroid_x{suffix}"] - hull_centroids.x) ** 2 + + (gdf[f"centroid_y{suffix}"] - hull_centroids.y) ** 2 + ) / np.sqrt(gdf[f"area{suffix}"]) + + # Major/minor axis (from rotated rectangle) + def _rotated_rect_axes(geom): + try: + rect = geom.minimum_rotated_rectangle + x, y = rect.exterior.coords.xy + edges = np.sqrt(np.diff(x)**2 + np.diff(y)**2) + edges = np.sort(edges[:-1]) # remove duplicate closing edge + minor, major = edges[0], edges[1] + ratio = major / minor if minor > 0 else np.nan + return major, minor, ratio + except Exception: + return np.nan, np.nan, np.nan + + results = np.array([_rotated_rect_axes(g) for g in gdf.geometry]) + major_axis_length, minor_axis_length, major_minor_axis_ratio = results.T + gdf[f"major_axis_length{suffix}"] = major_axis_length * pixel_size_um + gdf[f"minor_axis_length{suffix}"] = minor_axis_length * pixel_size_um + gdf[f"major_minor_axis_ratio{suffix}"] = major_minor_axis_ratio + + # Vertex metrics + def _num_vertices(geom): + exteriors = _extract_exterior_lines(geom) + return sum(len(line.coords) for line in exteriors) + + gdf[f"num_vertices{suffix}"] = gdf.geometry.apply(_num_vertices) + gdf[f"boundary_complexity{suffix}"] = ( + gdf[f"num_vertices{suffix}"] / gdf[f"perimeter{suffix}"] + ) + # Hole topology + def _num_holes(geom): + return len(_extract_interior_lines(geom)) + + def _hole_area(g): + interiors = _extract_interior_lines(g) + if not interiors: + return 0.0 + try: + holes = [Polygon(ring) for ring in interiors] + total_hole_area = sum(h.area for h in holes) + return total_hole_area + except Exception: + return np.nan + + gdf[f"num_holes{suffix}"] = gdf.geometry.apply(_num_holes) + gdf[f"hole_area{suffix}"] = gdf.geometry.apply(_hole_area) * pixel_size_um**2 + return gdf - gdf = _compute_morphological_features(gdf, required_individual, pixel_size_um, "") - grouped_gdf = _compute_morphological_features(grouped_gdf, required_grouped, pixel_size_um, "-grouped") + gdf = _compute_features(gdf, pixel_size_um, "") + + if grouped_features and not grouped_gdf.empty: + grouped_gdf = _compute_features(grouped_gdf, pixel_size_um, "-grouped") - # Merge grouped metrics back into main gdf - if grouped_gdf is not None and not grouped_gdf.empty: grouped_metrics = [col for col in grouped_gdf.columns if col.endswith("-grouped")] - gdf = gdf.merge(grouped_gdf[[shape_names_column] + grouped_metrics], on=shape_names_column, how="left") - - # Apply filters - print(f"Applying morphological filter(s) on {len(gdf)} polygons...") - mask = np.ones(len(gdf), dtype=bool) - if filters: - for feature, (min_val, max_val) in filters.items(): - print(f"\nFiltering by '{feature}': {min_val} ≤ value ≤ {max_val}") - if feature not in gdf.columns: - raise KeyError(f"Feature '{feature}' was not computed. Check your spelling or supported feature list.") - - # Apply lower bound - if min_val is not None: - to_remove = (gdf[feature] < min_val) & mask - removed_low = to_remove.sum() - mask &= gdf[feature] >= min_val - print(f" - Removed {removed_low} polygons with {feature} ≤ {min_val}") - - # Apply upper bound - if max_val is not None: - to_remove = (gdf[feature] > max_val) & mask - removed_high = to_remove.sum() - mask &= gdf[feature] <= max_val - print(f" - Removed {removed_high} polygons with {feature} ≥ {max_val}") - - remaining = mask.sum() - print(f" → Remaining after filtering '{feature}': {remaining} polygons") - - filtered_gdf = gdf[mask] - print(f"\nKept {len(filtered_gdf)} / {len(gdf)} polygons after morphological filters.") - - # Add filtered shapes layer - input_gdf = pd.concat([filtered_gdf, gdf_skipped], ignore_index=False) + + grouped_lookup = grouped_gdf.set_index(shape_names_column) + for col in grouped_metrics: + gdf[col] = gdf[shape_names_column].map(grouped_lookup[col]) + + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) + sdata = add_shapes_layer( sdata, - input=input_gdf, + input=gdf, output_layer=output_shapes_layer, overwrite=overwrite, ) @@ -278,19 +234,188 @@ def _rotated_rect_axes(geom): return sdata -def filter_by_shapes( +def filter_shapes_numerical( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + numerical_column: str, + min_value: float | int | None = None, + max_value: float | int | None = None, + overwrite: bool = False, +) -> SpatialData: + """ + All polygons with a size outside of the min and max size range are removed using the `numerical_column` in `shapes_layer`. + Polygons are kept if `min_value` ≤ `numerical_column` ≤ `max_value`. + + Parameters + ---------- + sdata + SpatialData object containing the shapes layer. + shapes_layer + Name of the input shapes layer. + output_shapes_layer + Name of the output shapes layer to store the filtered polygons. + numerical_column + Name of numerical column in the shapes layer used for filtering. + min_value + minimum value of `numerical_column` (inclusive). If None, lower bound is ignored. + max_value + maximum value of `numerical_column` (inclusive). If None, upper bound is ignored. + overwrite + Whether to overwrite existing output layer. + + Returns + ------- + SpatialData + Object with updated shapes layer containing only polygons meeting criteria. + """ + gdf = sdata.shapes[shapes_layer].copy() + + if numerical_column not in gdf.columns: + raise ValueError( + f"Column '{numerical_column}' not found in '{shapes_layer}.obs'. " + ) + + if not np.issubdtype(gdf[numerical_column].dtype, np.number): + raise ValueError( + f"Column '{numerical_column}' must be numeric, but dtype is {gdf[numerical_column].dtype}." + ) + + # Filter cells based on min and max values + start = len(gdf) + mask = pd.Series(True, index=gdf.index) + + if min_value is not None: + below = (gdf[numerical_column] < min_value).sum() + log.info(f"Removed {below} cells below {min_value}.") + mask &= gdf[numerical_column] >= min_value + if max_value is not None: + above = (gdf[numerical_column] > max_value).sum() + log.info(f"Removed {above} cells above {max_value}.") + mask &= gdf[numerical_column] <= max_value + + filtered_gdf = gdf[mask].copy() + kept = len(filtered_gdf) + removed = start - len(filtered_gdf) + log.info(f"Removed {removed}/{start} polygons outside {min_value}–{max_value} for '{numerical_column}' (kept {kept}).") + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(filtered_gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) + + # Add to SpatialData + sdata = add_shapes_layer( + sdata, + input=filtered_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata + + +def filter_shapes_categorical( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + categorical_column: Optional[str] = None, + include_values: Optional[Union[str, Sequence[str]]] = None, + exclude_values: Optional[Union[str, Sequence[str]]] = None, + overwrite: bool = False, +) -> SpatialData: + """ + Filter polygons in a shapes layer based on categorical values or index. + + Parameters + ---------- + sdata + SpatialData object containing the shapes layer. + shapes_layer + Name of the input shapes layer. + output_shapes_layer + Name of the output shapes layer to store filtered polygons. + categorical_column + Name of the categorical column in the shapes layer. If None, filtering is performed based on the index. + include_values + Value(s) to keep. Only polygons whose `categorical_column` matches one of these + values will be kept. Mutually exclusive with `exclude_values`. + exclude_values + Value(s) to remove. polygons whose `categorical_column` matches one of these + values will be removed. Mutually exclusive with `include_values`. + overwrite + If True, overwrites the `output_shapes_layer` if it already exists in `sdata`. + + Returns + ------- + SpatialData + Object with updated shapes layer containing filtered polygons. + """ + if include_values is not None and exclude_values is not None: + raise ValueError("Specify only one of 'include_values' or 'exclude_values'.") + + gdf = sdata.shapes[shapes_layer].copy() + + if categorical_column is not None and categorical_column not in gdf.columns: + raise ValueError(f"Column '{categorical_column}' not found in '{shapes_layer}'.") + + # Ensure include/exclude are lists + if isinstance(include_values, (str, int)): + include_values = [include_values] + if isinstance(exclude_values, (str, int)): + exclude_values = [exclude_values] + + # Filter + start = len(gdf) + mask = pd.Series(True, index=gdf.index) + + if categorical_column is not None: + # Filtering by column + if include_values is not None: + mask &= gdf[categorical_column].isin(include_values) + elif exclude_values is not None: + mask &= ~gdf[categorical_column].isin(exclude_values) + else: + # Filtering by index + if include_values is not None: + mask &= gdf.index.isin(include_values) + elif exclude_values is not None: + mask &= ~gdf.index.isin(exclude_values) + + filtered_gdf = gdf[mask].copy() + kept = len(filtered_gdf) + removed = start - kept + if categorical_column is None: + categorical_column = "index" + log.info(f"Removed {removed}/{start} polygons based on '{categorical_column}' filtering (kept {kept}).") + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(filtered_gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) + + # Add to SpatialData + sdata = add_shapes_layer( + sdata, + input=filtered_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata + + +def filter_shapes_by_shapes( sdata: SpatialData, target_shapes_layer: str, mask_shapes_layer: str, output_shapes_layer: str, - shape_names_column: str | None = None, - shape_names: str | list[str] | None = None, - keep_intersecting: bool = True, + shape_names_column: Optional[str] = None, + shape_names: Optional[Union[str, List[str]]] = None, + mode: Literal["intersects", "within", "centroid_within", "touches", "disjoint", "overlap_fraction", "edge", "outer_edge", "inner_edge"] = "centroid_within", + keep: bool = True, + threshold: Optional[float] = None, overwrite: bool = False, ): """ - Filter polygons in a target shapes layer (typically containg segmention boundaries) based on intersection with polygons in a mask layer - (typically containing region annotations). + Filter polygons in a target shapes layer based on geometric relationships to polygons of a mask shapes layer. + A typical use-case would be to filter cell segmentations based on their location within annotated regions. Parameters ---------- @@ -306,19 +431,33 @@ def filter_by_shapes( Optional column in mask_shapes_layer to select specific polygons. shape_names Name or list of names of polygons in mask_shapes_layer to use for filtering. Ignored if shape_names_column is None. - keep_intersecting - If True, keeps polygons that intersect the mask. - If False, removes polygons that intersect the mask. + mode + Geometric relationship to use for filtering. + Supported modes: + - `"intersects"`: Keep/remove polygons that intersect mask polygons (i.e. have partial or full overlap). + - `"within"`: Keep/remove polygons that are fully contained within mask polygons. + - `"centroid_within"`: Keep/remove polygons whose centroids are within mask polygons. + - `"touches"`: Keep/remove polygons that touch the boundary of mask polygons but do not overlap. + - `"disjoint"`: Keep/remove polygons that have no overlap with mask polygons at all. + - `"overlap_fraction"`: Keep/remove polygons whose overlap fraction with mask polygons exceeds a threshold. + - `"edge"`: Keep/remove polygons that partially overlap either the outer or inner edge of the mask polygons. + - `"outer_edge"`: Keep/remove polygons that overlap the outer edge (shell) of the mask polygons. + - `"inner_edge"`: Keep/remove polygons that overlap the inner edge (holes) of the mask polygons. + keep + Whether to keep (True) or remove (False) polygons matching the condition. + threshold + Threshold to use for overlap_fraction. overwrite If True, overwrites the output shapes layer if it exists. - + Returns ------- SpatialData object with filtered shapes layer. """ + # Copy target layer target_gdf = sdata.shapes[target_shapes_layer].copy() - + # Copy mask layer mask_gdf = sdata.shapes[mask_shapes_layer].copy() @@ -331,30 +470,65 @@ def filter_by_shapes( mask_gdf = mask_gdf[mask_gdf[shape_names_column].isin(shape_names)].copy() if mask_gdf.empty: raise ValueError(f"No geometries found in '{mask_shapes_layer}' matching {shape_names}.") - + # Build mask union mask_union = mask_gdf.unary_union - # Compute intersection boolean - target_gdf["intersects_mask"] = target_gdf.geometry.intersects(mask_union) + # Compute condition + if mode == "intersects": + condition = target_gdf.geometry.intersects(mask_union) + elif mode == "within": + condition = target_gdf.geometry.within(mask_union) + elif mode == "centroid_within": + condition = target_gdf.geometry.centroid.within(mask_union) + elif mode == "touches": + condition = target_gdf.geometry.touches(mask_union) + elif mode == "disjoint": + condition = target_gdf.geometry.disjoint(mask_union) + elif mode == "overlap_fraction": + if threshold is None: + raise ValueError("Must specify 'threshold' for overlap_fraction mode.") + intersecting = target_gdf[target_gdf.geometry.intersects(mask_union)] + overlaps = intersecting.geometry.intersection(mask_union) + fraction = overlaps.area / intersecting.geometry.area + condition = pd.Series(False, index=target_gdf.index) + condition.loc[intersecting.index] = fraction > threshold + elif mode == "edge": + condition = target_gdf.geometry.intersects(mask_union) & ~target_gdf.geometry.within(mask_union) + elif mode in ["outer_edge", "inner_edge"]: + outer_boundaries = [] + inner_boundaries = [] + + if isinstance(mask_union, gpd.GeoSeries): + for geom in mask_union: + outer_boundaries.extend(_extract_exterior_lines(geom)) + inner_boundaries.extend(_extract_interior_lines(geom)) + else: + outer_boundaries.extend(_extract_exterior_lines(mask_union)) + inner_boundaries.extend(_extract_interior_lines(mask_union)) + + outer_boundary_union = unary_union([LineString(boundary) for boundary in outer_boundaries]) if outer_boundaries else None + inner_boundary_union = unary_union([LineString(boundary) for boundary in inner_boundaries]) if inner_boundaries else None + + if mode == "outer_edge": + condition = target_gdf.geometry.intersects(outer_boundary_union) + elif mode == "inner_edge": + condition = target_gdf.geometry.intersects(inner_boundary_union) if inner_boundary_union is not None else False - # Filter depending on mode - if keep_intersecting: - filtered_gdf = target_gdf[target_gdf["intersects_mask"]].copy() else: - filtered_gdf = target_gdf[~target_gdf["intersects_mask"]].copy() + raise ValueError(f"Unsupported mode '{mode}'.") - filtered_gdf.drop(columns=["intersects_mask"], inplace=True) + # Apply keep/remove + filtered_gdf = target_gdf[condition if keep else ~condition].copy() removed = len(target_gdf) - len(filtered_gdf) - if keep_intersecting: - print( - f"Kept {len(filtered_gdf)} / {len(target_gdf)} geometries intersecting '{mask_shapes_layer}' (removed {removed})." - ) + if keep: + log.info(f"Kept {len(filtered_gdf)} / {len(target_gdf)} geometries (removed {removed}).") else: - print( - f"Removed {removed} / {len(target_gdf)} geometries intersecting '{mask_shapes_layer}' (kept {len(filtered_gdf)})." - ) + log.info(f"Removed {removed} / {len(target_gdf)} geometries (kept {len(filtered_gdf)}).") + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(filtered_gdf, get_all=True) == get_transformation(sdata[target_shapes_layer], get_all=True) # Add to SpatialData sdata = add_shapes_layer( diff --git a/src/harpy/shape/_shape.py b/src/harpy/shape/_shape.py index f758110e..e6a0cf75 100644 --- a/src/harpy/shape/_shape.py +++ b/src/harpy/shape/_shape.py @@ -1,9 +1,8 @@ from __future__ import annotations -import numpy as np from dask.array import Array from geopandas import GeoDataFrame -from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon +from shapely.geometry import GeometryCollection, MultiPolygon, Polygon, Point, MultiPoint, LineString, MultiLineString from spatialdata import SpatialData from spatialdata.models._utils import MappingToCoordinateSystem_t from spatialdata.transformations import get_transformation @@ -11,6 +10,10 @@ from harpy.image._image import _get_spatial_element from harpy.shape._manager import ShapesLayerManager +import numpy as np + +from harpy.utils.pylogger import get_pylogger +log = get_pylogger(__name__) def vectorize( sdata, @@ -181,7 +184,7 @@ def intersect_rectangles(rect1: list[int | float], rect2: list[int | float]) -> else: return None - + def prep_region_annotations( sdata: SpatialData, shapes_layer: str, @@ -193,13 +196,14 @@ def prep_region_annotations( overwrite: bool = False, ): """ - Prepares region annotations in a shapes layer for `hp.sh.filter_by_morphology`, `hp.tb.assign_cells_to_shapes` and `hp.tb.compute_distance_to_shapes`. + Prepares region annotations in a shapes layer for `hp.sh.morphological_features`, `hp.tb.assign_cells_to_shapes` and `hp.tb.compute_distance_to_shapes`. Operations performed: - Ensures a shape name column exists and fills missing names. - Converts Points with a `radius` column into circular polygons. Points without a `radius` column will be preserved as Points. - Slightly erodes polygons to avoid shared edges. - Explodes multipolygons into separate single polygons. - Generates unique names for shapes with duplicate base names. + - Shift indices by +1 if index 0 is present (index 0 is reserved for background in labels layers). Parameters ---------- @@ -214,73 +218,85 @@ def prep_region_annotations( unnamed Name to be assigned to any unnamed geometries in `shape_names_column`. Defaults to 'unnamed'. unique_shape_names_column - Column name in which unique names will be created for single polygons by appending a counter to the original name in `shape_names_column` for polygons with the same name. Note - that multipolygons will be split in individual polygons and each will get a unique name based on the original name of the multipolygon. Unique names will be stored in + Column name in which unique names will be created for single polygons by appending a counter to the original name in `shape_names_column` for polygons with the same name. Note + that multipolygons will be split in individual polygons and each will get a unique name based on the original name of the multipolygon. Unique names will be stored in `{shape_names_column}-unique` in the updated shapes layer. erosion - Number of pixels to erode polygons by. This can avoid problems with overlapping edges of geometries when calculating distances. Default is 0.5 (i.e. erosion by 0.5 pixels). + Number of pixels to erode polygons by. This can avoid problems with overlapping edges of geometries when calculating distances. Default is 0.5 (i.e. erosion by 0.5 pixels). overwrite If True, overwrites the `output_shapes_layer` if it already exists in `sdata`. - + Returns ------- Modified `sdata` object with updated updated shapes layer. """ + # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() - print(f"Found {len(gdf)} geometries in {shapes_layer}.") - + log.info(f"Found {len(gdf)} geometries in {shapes_layer}.") + # Ensure shape_names_column exist if shape_names_column not in gdf.columns: gdf[shape_names_column] = unnamed - + gdf[shape_names_column] = gdf[shape_names_column].fillna("").astype(str) unnamed_mask = gdf[shape_names_column] == "" - for _i, idx in enumerate(gdf[unnamed_mask].index): + for i, idx in enumerate(gdf[unnamed_mask].index): gdf.at[idx, shape_names_column] = unnamed - + # Convert Points with a radius column to circular polygons def _point_to_circle(geom, radius=None): if isinstance(geom, Point) and radius is not None: return geom.buffer(radius, resolution=16) return geom - + if "radius" in gdf.columns: mask_points_with_radius = gdf.geometry.geom_type.eq("Point") & gdf["radius"].notna() n_converted = mask_points_with_radius.sum() if n_converted > 0: - print(f"Converting {n_converted} Point geometries with 'radius' to circular polygons.") + log.info(f"Converting {n_converted} Point geometries with 'radius' to circular polygons.") gdf.loc[mask_points_with_radius, "geometry"] = gdf.loc[mask_points_with_radius].apply( - lambda row: _point_to_circle(row.geometry, getattr(row, "radius", None)), axis=1 + lambda row: _point_to_circle(row.geometry, getattr(row, "radius", None)), + axis=1 ) - + # Slightly erode all polygons (this avoids any shared borders between polygons) gdf["geometry"] = gdf.geometry.apply( - lambda geom: geom.buffer(-erosion, join_style=2, resolution=16) - if geom.geom_type in ["Polygon", "MultiPolygon"] - else geom + lambda geom: geom.buffer( + -erosion, + join_style=2, + resolution=16) + if geom.geom_type in ["Polygon", "MultiPolygon"] else geom ) - polygon_mask = ~gdf.geometry.is_empty - removed = len(gdf) - polygon_mask.sum() - gdf = gdf[polygon_mask] # drop any polygons that collapsed to empty - if removed > 0: - print(f"Removed {removed} polygons that collapsed to empty after erosion.") - + polygon_mask = ~gdf.geometry.is_empty + removed = len(gdf) - polygon_mask.sum() + gdf = gdf[polygon_mask] # drop any polygons that collapsed to empty + if removed > 0: + log.info(f"Removed {removed} polygons that collapsed to empty after erosion.") + # Explode multipolygons into single polygons (this allows us to treat multipolygons as unique polygons) n_multipolygons = gdf.geometry.apply(lambda g: g.geom_type == "MultiPolygon").sum() gdf = gdf.explode(index_parts=False).reset_index(drop=True) n_after = len(gdf) - print( - f"Split {n_multipolygons} multipolygons into individual polygons. Total number of geometries after splitting multipolgons is {n_after}." - ) - + log.info(f"Split {n_multipolygons} multipolygons into individual polygons. Total number of geometries after splitting multipolgons is {n_after}.") + # Create unique names sizes = gdf.groupby(shape_names_column)[shape_names_column].transform("size") counter = gdf.groupby(shape_names_column).cumcount() + 1 gdf[unique_shape_names_column] = np.where( - sizes.eq(1), gdf[shape_names_column], gdf[shape_names_column] + counter.astype(str) + sizes.eq(1), + gdf[shape_names_column], + gdf[shape_names_column] + counter.astype(str) ) + + # Avoid index 0 + if 0 in gdf.index: + log.warning("Index 0 found in shapes layer — shifting all indices by +1 since index 0 is reserved for background in labels layers.") + gdf.index = gdf.index + 1 + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) # Add filtered shapes layer sdata = add_shapes_layer( @@ -291,3 +307,4 @@ def _point_to_circle(geom, radius=None): ) return sdata + diff --git a/src/harpy/table/__init__.pyi b/src/harpy/table/__init__.pyi index a254ba3a..6df1c2eb 100644 --- a/src/harpy/table/__init__.pyi +++ b/src/harpy/table/__init__.pyi @@ -4,14 +4,14 @@ from ._annotation import cluster_cleanliness, score_genes, score_genes_iter from ._clustering import kmeans, leiden from ._enrichment import nhood_enrichment from ._preprocess import preprocess_proteomics, preprocess_transcriptomics -from ._region_annotations import assign_cells_to_shapes, compute_distance_to_shapes from ._regionprops import add_regionprop_features -from ._table import add_table_layer, correct_marker_genes, filter_categorical, filter_numerical, filter_on_size +from ._table import add_table_layer, correct_marker_genes, filter_on_size, filter_numerical, filter_categorical from .cell_clustering._clustering import flowsom from .cell_clustering._preprocess import cell_clustering_preprocess from .cell_clustering._weighted_channel_expression import weighted_channel_expression from .pixel_clustering._cluster_intensity import cluster_intensity from .pixel_clustering._neighbors import spatial_pixel_neighbors +from ._region_annotations import assign_cells_to_shapes, compute_distance_to_shapes __all__ = [ "add_table_layer", diff --git a/src/harpy/table/_region_annotations.py b/src/harpy/table/_region_annotations.py index 23eb70c3..3b2171eb 100755 --- a/src/harpy/table/_region_annotations.py +++ b/src/harpy/table/_region_annotations.py @@ -1,13 +1,13 @@ -from typing import Literal - +from spatialdata import SpatialData import geopandas as gpd import pandas as pd -from shapely.geometry import LineString, MultiLineString, MultiPolygon, Point, Polygon -from spatialdata import SpatialData - +from typing import Literal +from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString from harpy.table._table import add_table_layer -from harpy.utils._keys import _REGION_KEY +from harpy.utils._keys import _REGION_KEY, _INSTANCE_KEY +from harpy.utils.pylogger import get_pylogger +log = get_pylogger(__name__) def assign_cells_to_shapes( sdata: SpatialData, @@ -20,7 +20,7 @@ def assign_cells_to_shapes( mode: Literal["original_names", "unique_names", "both"] = "original_names", create_column_per_shape: bool = False, overlap_tolerance: float = 0.1, - spatial_key: str = "spatial", + spatial_key: str = 'spatial', xy_columns: tuple = None, overwrite: bool = False, ): @@ -33,7 +33,7 @@ def assign_cells_to_shapes( The SpatialData object containing the input table layer and shapes layer. shapes_layer The shapes layer in `sdata.shapes` to use as input. - table_layer + table_layer The table layer in `sdata.tables` to use as input. output_table_layer The output table layer in `sdata.tables` to which the updated table layer will be written. @@ -57,24 +57,25 @@ def assign_cells_to_shapes( Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. xy_columns Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. - If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. overwrite If True, overwrites the `output_table_layer` and/or `output_shapes_layer` if it already exists in `sdata`. - + Notes - ----- - - Only `Polygon` and `MultiPolygon` geometries are supported. Non-polygon geometries (e.g., `Point`, `LineString`) are skipped. - + ------- + - Only `Polygon` and `MultiPolygon` geometries are supported. Non-polygon geometries (e.g., `Point`, `LineString`) are skipped. + Returns ------- Modified `sdata` object with updated table layer. """ + if not output_column and not create_column_per_shape: raise ValueError("Specify `output_column` or set `create_column_per_shape=True`.") - + # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() - + # Create copy of table layer adata = sdata.tables[table_layer].copy() @@ -82,22 +83,20 @@ def assign_cells_to_shapes( supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) skipped = len(gdf) - supported_mask.sum() if skipped > 0: - print(f"Skipped {skipped} non-polygon geometries in {shapes_layer}.") - if skipped == len(gdf): - print("No supported geometries (Polygon, MultiPolygon) found.") + log.info(f"Skipped {skipped} non-polygon geometries in {shapes_layer}.") + if skipped == len(gdf): + log.warning(f"No supported geometries (Polygon, MultiPolygon) found. Skipping addition of table layer '{output_table_layer}' to sdata.") return sdata - + gdf = gdf[supported_mask].copy().reset_index(drop=True) # Check for overlapping polygons total_area = gdf.geometry.area.sum() union_area = gdf.geometry.unary_union.area if total_area - union_area > overlap_tolerance and not create_column_per_shape: - raise ValueError( - f"Overlapping polygons detected in {shapes_layer}. Correct polygons or use create_column_per_shape." - ) + raise ValueError(f"Overlapping polygons detected in {shapes_layer}. Correct polygons or use create_column_per_shape.") elif total_area - union_area > 0 and not create_column_per_shape: - print(f"Overlaps detected (Δ={total_area - union_area:.3f}), below tolerance threshold {overlap_tolerance}.") + log.warning(f"Overlaps detected (Δ={total_area - union_area:.3f}), below tolerance threshold {overlap_tolerance}.") # Get cell coordinates if xy_columns is not None: @@ -105,7 +104,8 @@ def assign_cells_to_shapes( coords = adata.obs[[x_col, y_col]].to_numpy() else: if spatial_key not in adata.obsm: - raise KeyError(f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + raise KeyError( + f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") coords = adata.obsm[spatial_key] if coords.shape[1] != 2: raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") @@ -113,19 +113,19 @@ def assign_cells_to_shapes( # Function to assign cell to a polygon def _assign_region(x, y, gdf, column, sindex): point = Point(x, y) - + # Quick bounding box search to prefilter the cells candidate_idx = list(sindex.intersection(point.bounds)) if not candidate_idx: return None - + # Slower point‑in‑polygon search candidate_gdf = gdf.iloc[candidate_idx] match = candidate_gdf[candidate_gdf.contains(point)] - + if match.empty: return None - + return match[column].iloc[0] # Assign cells to polygons @@ -152,8 +152,10 @@ def _assign_region(x, y, gdf, column, sindex): if gdf_shape.empty: continue - sidx_shape = gdf_shape.sindex # Build spatial index - assigned = [_assign_region(x, y, gdf_shape, name_column, sidx_shape) for x, y in coords] + sidx_shape = gdf_shape.sindex # Build spatial index + assigned = [ + _assign_region(x, y, gdf_shape, name_column, sidx_shape) for x, y in coords + ] adata.obs[col_name] = assigned else: @@ -166,22 +168,24 @@ def _assign_region(x, y, gdf, column, sindex): for output_column_name, name_column in run_assign_dict.items(): # Assign one shape name per cell (to be avoided with overlapping regions) - sidx = gdf.sindex # Build spatial index - assigned = [_assign_region(x, y, gdf, name_column, sidx) for x, y in coords] + sidx = gdf.sindex # Build spatial index + assigned = [ + _assign_region(x, y, gdf, name_column, sidx) + for x, y in coords + ] adata.obs[output_column_name] = assigned - # Add table layer + # Add table layer sdata = add_table_layer( sdata, - adata=adata, - output_layer=output_table_layer, - region=adata.obs[_REGION_KEY].cat.categories.to_list(), - overwrite=overwrite, + adata = adata, + output_layer = output_table_layer, + region = adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite = overwrite, ) - + return sdata - def compute_distance_to_shapes( sdata: SpatialData, shapes_layer: str, @@ -191,32 +195,19 @@ def compute_distance_to_shapes( unique_shape_names_column: str = "name-unique", output_name: str = None, pixel_size_um: float = 1.0, - modes: list[ - Literal[ - "nearest_edge", - "nearest_edge_grouped", - "all_edges", - "nearest_outer_edge", - "nearest_outer_edge_grouped", - "all_outer_edges", - "nearest_inner_edge", - "nearest_inner_edge_grouped", - "nearest_inner_edge_grouped_unique", - "all_inner_edges", - "nearest_centroid", - "nearest_centroid_grouped", - "all_centroids", - "nearest_point", - "nearest_point_grouped", - "all_points", - ] - ] = None, - spatial_key: str = "spatial", + modes: list[Literal[ + "nearest_edge", "nearest_edge_grouped", "all_edges", + "nearest_outer_edge", "nearest_outer_edge_grouped", "all_outer_edges", + "nearest_inner_edge", "nearest_inner_edge_grouped", "nearest_inner_edge_grouped_unique", "all_inner_edges", + "nearest_centroid", "nearest_centroid_grouped", "all_centroids", + "nearest_point", "nearest_point_grouped", "all_points" + ]] = ["all_edges"], + spatial_key: str = 'spatial', xy_columns: tuple = None, overwrite: bool = False, ): """ - Compute distances from cells to polygons in a shapes layer and update the sdata table layer. + Compute distances from cells (using their centroid coordinates) to polygons in a shapes layer and update the sdata table layer. It is recommended to run `hp.sh.prep_region_annotations` first. Parameters @@ -225,7 +216,7 @@ def compute_distance_to_shapes( The SpatialData object containing the input table layer and shapes layer. shapes_layer The shapes layer in `sdata.shapes` to use as input. - table_layer + table_layer The table layer in `sdata.tables` to use as input. output_table_layer The output table layer in `sdata.tables` to which the updated table layer will be written. @@ -241,39 +232,39 @@ def compute_distance_to_shapes( Which distance features to calculate. Options include: Edge distances (For Polygon and MultiPolygon geometries) - - `"nearest_edge"`: Distance to the nearest edge (outer or inner) of any polygon. + - `"nearest_edge"`: Distance to the nearest edge (outer or inner) of any polygon. Creates `{output_name}-distance_to_nearest_edge` and `{output_name}-name_of_nearest_edge` columns. - - `"nearest_edge_grouped"`: For each group of shapes with the same name, compute the distance to the edge that is nearest. + - `"nearest_edge_grouped"`: For each group of shapes with the same name, compute the distance to the edge that is nearest. Creates `{output_name}-distance_to_nearest_edge_of_` and `{output_name}-name_of_nearest_edge_of_` columns. - - `"all_edges"`: For each individual polygon, compute the distance to its edge. + - `"all_edges"`: For each individual polygon, compute the distance to its edge. Creates `{output_name}-distance_to_edge_of_` column. - + Outer edge distances (For Polygon and MultiPolygon geometries) - - `"nearest_outer_edge"`: Distance to the nearest outer edge of any polygon. + - `"nearest_outer_edge"`: Distance to the nearest outer edge of any polygon. Creates `{output_name}-distance_to_nearest_outer_edge` and `{output_name}-name_of_nearest_outer_edge` columns. - - `"nearest_outer_edge_grouped"`: For each group of shapes with the same name, compute the distance to the outer edge that is nearest. + - `"nearest_outer_edge_grouped"`: For each group of shapes with the same name, compute the distance to the outer edge that is nearest. Creates `{output_name}-distance_to_nearest_outer_edge_of_` and `{output_name}-name_of_nearest_outer_edge_of_` columns. - - `"all_outer_edges"`: For each individual polygon, compute the distance to its outer edge. + - `"all_outer_edges"`: For each individual polygon, compute the distance to its outer edge. Creates `{output_name}-distance_to_outer_edge_of_` column. - + Inner edge (hole) distances (For Polygon and MultiPolygon geometries) - - `"nearest_inner_edge"`: Distance to the nearest interior edge (“hole”) of any polygon. + - `"nearest_inner_edge"`: Distance to the nearest interior edge (“hole”) of any polygon. Creates `{output_name}-distance_to_nearest_inner_edge` and `{output_name}-name_of_nearest_inner_edge` columns. - - `"nearest_inner_edge_grouped"`: For each group of shapes with the same name, compute the distance to the nearest inner edge of that group. + - `"nearest_inner_edge_grouped"`: For each group of shapes with the same name, compute the distance to the nearest inner edge of that group. Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. - - `"nearest_inner_edge_grouped_unique"`: For each individual polygon, compute the distance to the nearest inner edge of that polygon. + - `"nearest_inner_edge_grouped_unique"`: For each individual polygon, compute the distance to the nearest inner edge of that polygon. Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. - - `"all_inner_edges"`: For all holes, compute the distance to each inner edge. + - `"all_inner_edges"`: For all holes, compute the distance to each inner edge. Creates `{output_name}-distance_to_inner_edge_of_-hole` column. - + Centroid distances (For Polygon and MultiPolygon geometries) - - `"nearest_centroid"`: Distance to the nearest centroid of any polygon. + - `"nearest_centroid"`: Distance to the nearest centroid of any polygon. Creates `{output_name}-distance_to_nearest_centroid` and `{output_name}-name_of_nearest_centroid` columns. - - `"nearest_centroid_grouped"`: For each group of shapes with the same name, compute the distance to the centroid that is nearest. + - `"nearest_centroid_grouped"`: For each group of shapes with the same name, compute the distance to the centroid that is nearest. Creates `{output_name}-distance_to_nearest_centroid_of_` and `{output_name}-name_of_nearest_centroid_of_` columns. - - `"all_centroids"`: For each individual polygon, compute the distance to its centroid. + - `"all_centroids"`: For each individual polygon, compute the distance to its centroid. Creates `{output_name}-distance_to_centroid_of_` column. - + Point distances (For Point geometries) - `"nearest_point"`: Distance to the nearest point. Creates `{output_name}-distance_to_nearest_point` and `{output_name}-name_of_nearest_point` columns. @@ -281,69 +272,72 @@ def compute_distance_to_shapes( Creates `{output_name}-distance_to_nearest_point_of_` and `{output_name}-name_of_nearest_point_of_` columns. - `"all_points"`: For each individual Point, compute the distance to point coordinates. Creates `{output_name}-distance_to_point_` column. - + spatial_key Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. xy_columns Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. - If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. overwrite If True, overwrites the `output_table_layer` if it already exists in `sdata`. - + Notes - ----- + ------- - Only `Polygon`, `MultiPolygon` and `Point` geometries are supported. Other geometries (e.g., `LineString`, `MultiPoint`) are skipped. - + Returns ------- Modified `sdata` object with updated table layer. """ - if modes is None: - modes = ["all_edges"] + if output_name is not None: output_name = f"{output_name}-" elif output_name is None: output_name = "" - + # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() - + # Create copy of table layer adata = sdata.tables[table_layer].copy() - + # Filter out geometries that are not Polygon, MultiPolygon or Point supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon, Point))) skipped = len(gdf) - supported_mask.sum() if skipped > 0: - print(f"Skipped {skipped} geometries in {shapes_layer} that are not Polygon, MultiPolygon or Point.") - + log.info(f"Skipped {skipped} geometries in {shapes_layer} that are not Polygon, MultiPolygon or Point.") + gdf = gdf[supported_mask].copy().reset_index(drop=True) - + # Separate gdf by type gdf_polygons = gdf[gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])].copy() gdf_points = gdf[gdf.geometry.geom_type == "Point"].copy() has_polygons = not gdf_polygons.empty has_points = not gdf_points.empty - + if not has_polygons and not has_points: - print("No supported geometries (Polygon, MultiPolygon, Point) found.") + log.warning(f"No supported geometries (Polygon, MultiPolygon, Point) found. Skipping addition of table layer '{output_table_layer}' to sdata.") return sdata - + # Get cell coordinates if xy_columns is not None: x_col, y_col = xy_columns coords = adata.obs[[x_col, y_col]].to_numpy() else: if spatial_key not in adata.obsm: - raise KeyError(f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + raise KeyError( + f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") coords = adata.obsm[spatial_key] if coords.shape[1] != 2: raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") - + # Build point GeoDataFrame - pts = gpd.GeoSeries([Point(x, y) for x, y in coords], crs=gdf.crs) - + pts = gpd.GeoSeries( + [Point(x, y) for x, y in coords], + crs=gdf.crs + ) + pts_gdf = gpd.GeoDataFrame({"geometry": pts}, crs=gdf.crs).set_index(adata.obs.index) # Polygon and MultiPolygon @@ -373,20 +367,29 @@ def _extract_interior_lines(geom): ext_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=exteriors, crs=gdf.crs) int_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=interiors, crs=gdf.crs) - # Edges (outer and inner) + # Edges (outer and inner) if "nearest_edge" in modes: - print("Calculating 'nearest_edge' distances'") - all_edges_gdf = gpd.GeoDataFrame(pd.concat([ext_gdf, int_gdf], ignore_index=True), crs=gdf.crs) - joined = gpd.sjoin_nearest(pts_gdf, all_edges_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_edge"] = joined["dist"].to_numpy() * pixel_size_um + log.info("Calculating 'nearest_edge' distances'") + all_edges_gdf = gpd.GeoDataFrame( + pd.concat([ext_gdf, int_gdf], ignore_index=True), + crs=gdf.crs + ) + joined = gpd.sjoin_nearest( + pts_gdf, + all_edges_gdf, + how="left", distance_col="dist" + ) + adata.obs[f"{output_name}distance_to_nearest_edge"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_edge"] = joined[unique_shape_names_column] if "nearest_edge_grouped" in modes: - print("Calculating 'nearest_edge_grouped' distances'") + log.info("Calculating 'nearest_edge_grouped' distances'") for name, group in gdf_polygons.groupby(shape_names_column): edges = [] labels = [] - for _idx, row in group.iterrows(): + for idx, row in group.iterrows(): edge_lines = _extract_exterior_lines(row.geometry) + _extract_interior_lines(row.geometry) edges.extend(edge_lines) labels.extend([row[unique_shape_names_column]] * len(edge_lines)) @@ -394,31 +397,33 @@ def _extract_interior_lines(geom): edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_edge_of_{name}"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_nearest_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_edge_of_{name}"] = joined[unique_shape_names_column] - + if "all_edges" in modes: - print("Calculating 'all_edges' distances'") + log.info("Calculating 'all_edges' distances'") for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_edge_of_{feat[unique_shape_names_column]}"] = ( - pts.distance(feat.geometry.boundary).to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_edge_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.boundary).to_numpy() * pixel_size_um # Outer edges if "nearest_outer_edge" in modes: - print("Calculating 'nearest_outer_edge' distances'") - joined = gpd.sjoin_nearest(pts_gdf, ext_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_outer_edge"] = joined["dist"].to_numpy() * pixel_size_um + log.info("Calculating 'nearest_outer_edge' distances'") + joined = gpd.sjoin_nearest( + pts_gdf, + ext_gdf, + how="left", distance_col="dist" + ) + adata.obs[f"{output_name}distance_to_nearest_outer_edge"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_outer_edge"] = joined[unique_shape_names_column] - + if "nearest_outer_edge_grouped" in modes: - print("Calculating 'nearest_outer_edge_grouped' distances'") + log.info("Calculating 'nearest_outer_edge_grouped' distances'") for name, group in gdf_polygons.groupby(shape_names_column): edges = [] labels = [] - for _idx, row in group.iterrows(): + for idx, row in group.iterrows(): edge_lines = _extract_exterior_lines(row.geometry) edges.extend(edge_lines) labels.extend([row[unique_shape_names_column]] * len(edge_lines)) @@ -426,17 +431,13 @@ def _extract_interior_lines(geom): edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_outer_edge_of_{name}"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_nearest_outer_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_outer_edge_of_{name}"] = joined[unique_shape_names_column] if "all_outer_edges" in modes: - print("Calculating 'all_outer_edges' distances'") + log.info("Calculating 'all_outer_edges' distances'") for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_outer_edge_of_{feat[unique_shape_names_column]}"] = ( - pts.distance(feat.geometry.exterior).to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_outer_edge_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.exterior).to_numpy() * pixel_size_um # Inner edges hole_geoms = [] @@ -445,8 +446,8 @@ def _extract_interior_lines(geom): for _, row in gdf_polygons.reset_index(drop=True).iterrows(): base = row[unique_shape_names_column] geom = row.geometry - - holes = _extract_interior_lines(geom) + + holes =_extract_interior_lines(geom) if len(holes) == 0: continue @@ -455,27 +456,38 @@ def _extract_interior_lines(geom): hole = holes[0] hole_geoms.append(LineString(hole.coords)) hole_names.append(base) - + else: for i, hole in enumerate(holes, start=1): hole_geoms.append(LineString(hole.coords)) hole_names.append(f"{base}-hole{i}") - holes_gdf = gpd.GeoDataFrame({unique_shape_names_column: hole_names}, geometry=hole_geoms, crs=gdf.crs) + holes_gdf = gpd.GeoDataFrame( + {unique_shape_names_column: hole_names}, + geometry=hole_geoms, + crs=gdf.crs + ) if "nearest_inner_edge" in modes and not holes_gdf.empty: - print("Calculating 'nearest_inner_edge' distances'") - joined = gpd.sjoin_nearest(pts_gdf, holes_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_inner_edge"] = joined["dist"].to_numpy() * pixel_size_um + log.info("Calculating 'nearest_inner_edge' distances'") + joined = gpd.sjoin_nearest( + pts_gdf, + holes_gdf, + how="left", + distance_col="dist" + ) + adata.obs[f"{output_name}distance_to_nearest_inner_edge"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_inner_edge"] = joined[unique_shape_names_column] if "nearest_inner_edge_grouped" in modes: - print("Calculating 'nearest_inner_edge_grouped' distances'") + log.info("Calculating 'nearest_inner_edge_grouped' distances'") for name, group in gdf_polygons.groupby(shape_names_column): hole_lines = [] hole_labels = [] - for _idx, row in group.iterrows(): + for idx, row in group.iterrows(): base = row[unique_shape_names_column] for i, hole in enumerate(_extract_interior_lines(row.geometry), start=1): hole_lines.append(LineString(hole.coords)) @@ -484,18 +496,19 @@ def _extract_interior_lines(geom): if not hole_lines: continue - hole_gdf = gpd.GeoDataFrame({"geometry": hole_lines, "hole_name": hole_labels}, crs=gdf.crs) + hole_gdf = gpd.GeoDataFrame({ + "geometry": hole_lines, + "hole_name": hole_labels + }, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{name}"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{name}"] = joined["hole_name"] if "nearest_inner_edge_grouped_unique" in modes: - print("Calculating 'nearest_inner_edge_grouped_unique' distances'") - for _idx, row in gdf_polygons.iterrows(): + log.info("Calculating 'nearest_inner_edge_grouped_unique' distances'") + for idx, row in gdf_polygons.iterrows(): base = row[unique_shape_names_column] holes = _extract_interior_lines(row.geometry) @@ -503,93 +516,100 @@ def _extract_interior_lines(geom): continue hole_geoms = [LineString(hole.coords) for hole in holes] - hole_names = [f"{base}-hole{i + 1}" for i in range(len(holes))] + hole_names = [f"{base}-hole{i+1}" for i in range(len(holes))] - hole_gdf = gpd.GeoDataFrame({"geometry": hole_geoms, "hole_name": hole_names}, crs=gdf.crs) + hole_gdf = gpd.GeoDataFrame({ + "geometry": hole_geoms, + "hole_name": hole_names + }, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{base}"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{base}"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{base}"] = joined["hole_name"] if "all_inner_edges" in modes and not holes_gdf.empty: - print("Calculating 'all_inner_edges' distances'") + log.info("Calculating 'all_inner_edges' distances'") for _, hole in holes_gdf.iterrows(): hole_name = hole[unique_shape_names_column] col = f"{output_name}distance_to_inner_edge_of_{hole_name}" adata.obs[col] = pts.distance(hole.geometry).to_numpy() * pixel_size_um - # Centroids + # Centroids centroids_df = gdf_polygons.reset_index(drop=True)[[unique_shape_names_column]].copy() centroids_df["geometry"] = gdf_polygons.geometry.centroid.reset_index(drop=True) - centroids_gdf = gpd.GeoDataFrame(centroids_df, geometry="geometry", crs=gdf.crs) + centroids_gdf = gpd.GeoDataFrame( + centroids_df, + geometry="geometry", + crs=gdf.crs + ) if "nearest_centroid" in modes: - print("Calculating 'nearest_centroid' distances'") - joined = gpd.sjoin_nearest(pts_gdf, centroids_gdf, how="left", distance_col="dist") + log.info("Calculating 'nearest_centroid' distances'") + joined = gpd.sjoin_nearest( + pts_gdf, centroids_gdf, + how="left", + distance_col="dist" + ) adata.obs.loc[joined.index, f"{output_name}distance_to_nearest_centroid"] = ( joined["dist"].to_numpy() * pixel_size_um ) - adata.obs.loc[joined.index, f"{output_name}name_of_nearest_centroid"] = joined[ - unique_shape_names_column - ].to_numpy() + adata.obs.loc[joined.index, f"{output_name}name_of_nearest_centroid"] = ( + joined[unique_shape_names_column].to_numpy() + ) if "nearest_centroid_grouped" in modes: - print("Calculating 'nearest_centroid_grouped' distances'") + log.info("Calculating 'nearest_centroid_grouped' distances'") for name, group in gdf_polygons.groupby(shape_names_column): centroids = group.geometry.centroid labels = group[unique_shape_names_column].tolist() - centroid_gdf = gpd.GeoDataFrame({"geometry": centroids, unique_shape_names_column: labels}, crs=gdf.crs) + centroid_gdf = gpd.GeoDataFrame({ + "geometry": centroids, + unique_shape_names_column: labels + }, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, centroid_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_centroid_of_{name}"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_nearest_centroid_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_centroid_of_{name}"] = joined[unique_shape_names_column] if "all_centroids" in modes: - print("Calculating 'all_centroids' distances'") - for _idx, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_centroid_of_{feat[unique_shape_names_column]}"] = ( - pts.distance(feat.geometry.centroid).to_numpy() * pixel_size_um - ) + log.info("Calculating 'all_centroids' distances'") + for idx, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_centroid_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.centroid).to_numpy() * pixel_size_um # Point if has_points: if "nearest_point" in modes: - print("Calculating 'nearest_point' distances'") - joined = gpd.sjoin_nearest(pts_gdf, gdf_points, how="left", distance_col="dist") + log.info("Calculating 'nearest_point' distances'") + joined = gpd.sjoin_nearest( + pts_gdf, gdf_points, how="left", distance_col="dist" + ) adata.obs[f"{output_name}distance_to_nearest_point"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_point"] = joined[f"{shape_names_column}-unique"] if "nearest_point_grouped" in modes: - print("Calculating 'nearest_point_grouped' distances'") + log.info("Calculating 'nearest_point_grouped' distances'") for name, group in gdf_points.groupby(shape_names_column): joined = gpd.sjoin_nearest(pts_gdf, group, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_point_of_{name}"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_nearest_point_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_point_of_{name}"] = joined[f"{shape_names_column}-unique"] if "all_points" in modes: - print("Calculating 'all_points' distances'") + log.info("Calculating 'all_points' distances'") for _, row in gdf_points.iterrows(): - adata.obs[f"{output_name}distance_to_point_{row[unique_shape_names_column]}"] = ( - pts.distance(row.geometry).to_numpy() * pixel_size_um - ) + adata.obs[f"{output_name}distance_to_point_{row[unique_shape_names_column]}"] = pts.distance(row.geometry).to_numpy() * pixel_size_um - # Add table layer + + # Add table layer sdata = add_table_layer( sdata, - adata=adata, - output_layer=output_table_layer, - region=adata.obs[_REGION_KEY].cat.categories.to_list(), - overwrite=overwrite, + adata = adata, + output_layer = output_table_layer, + region = adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite = overwrite, ) - - return sdata + + return sdata \ No newline at end of file diff --git a/src/harpy/table/_table.py b/src/harpy/table/_table.py index e91318b7..fb00d872 100644 --- a/src/harpy/table/_table.py +++ b/src/harpy/table/_table.py @@ -1,12 +1,13 @@ from __future__ import annotations -from collections.abc import Iterable, Sequence +from collections.abc import Iterable import numpy as np import pandas as pd from anndata import AnnData from spatialdata import SpatialData from spatialdata.models import TableModel +from typing import Optional, Union, Sequence, Literal from harpy.shape._shape import filter_shapes_layer from harpy.table._manager import TableLayerManager @@ -268,17 +269,19 @@ def filter_on_size( ------- The updated SpatialData object. """ + sdata = filter_numerical( - sdata=sdata, - labels_layer=labels_layer, - table_layer=table_layer, - output_layer=output_layer, - numerical_column=cellsize_key, - min_value=min_size, - max_value=max_size, - update_shapes_layers=update_shapes_layers, - prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, - overwrite=overwrite, + sdata = sdata, + labels_layer = labels_layer, + table_layer = table_layer, + output_layer = output_layer, + numerical_column = cellsize_key, + min_value = min_size, + max_value = max_size, + target = "obs", + update_shapes_layers = update_shapes_layers, + prefix_filtered_shapes_layer = prefix_filtered_shapes_layer, + overwrite = overwrite, ) return sdata @@ -292,37 +295,44 @@ def filter_numerical( numerical_column: str, min_value: float | int | None = None, max_value: float | int | None = None, + target: Literal["obs", "var"] = "obs", update_shapes_layers: bool = True, prefix_filtered_shapes_layer: str = "filtered", overwrite: bool = False, ) -> SpatialData: """Returns the updated SpatialData object. - All cells with a size outside of the min and max size range are removed using the `numerical_column` in `.obs` (cells are kept if min_value ≤ numerical_column ≤ max_value). - + All observations (obs) or features (var) with values outside of the min and max size range are removed using the `numerical_column` in `.obs` or `.var`. + observations/features are kept if `min_value` ≤ `numerical_column` ≤ `max_value`. + Parameters ---------- sdata The SpatialData object. labels_layer - The labels layer(s) of `sdata` used to select the cells via the _REGION_KEY in `sdata.tables[table_layer].obs`. + The labels layer(s) of `sdata` used to select the observations via the _REGION_KEY in `sdata.tables[table_layer].obs`. Note that if `output_layer` is equal to `table_layer` and overwrite is True, - cells in `sdata.tables[table_layer]` linked to other `labels_layer` (via the _REGION_KEY), will be removed from `sdata.tables[table_layer]` + observations in `sdata.tables[table_layer]` linked to other `labels_layer` (via the _REGION_KEY), will be removed from `sdata.tables[table_layer]` (also from the backing zarr store if it is backed). table_layer The table layer in `sdata`. output_layer The output table layer in `sdata`. numerical_column - Name of numerical column in `sdata.tables[table_layer].obs` to use for filtering with `min_value` and `max_value`. + Name of numerical column to use for filtering. + If `target='obs'`, this refers to `.obs[numerical_column]`. + If `target='var'`, this refers to `.var[numerical_column]`. + target + Whether to filter observations ('obs') or features ('var'). min_value - minimum value of `numerical_column`. If None, this value is not used for filtering. + minimum value of `numerical_column` (inclusive). If None, lower bound is ignored. max_value - maximum value of `numerical_column`. If None, this value is not used for filtering. + maximum value of `numerical_column` (inclusive). If None, upper bound is ignored. update_shapes_layers Whether to filter the shapes layers associated with `labels_layer`. - If set to `True`, cells that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. + If set to `True`, observations that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. Filtered shapes will be added to `sdata` with prefix `prefix_filtered_shapes_layer`. + Ignored if `target='var'`. prefix_filtered_shapes_layer prefix to use for filtered shapes layer if update_shapes_layers is True. Defaults to 'filtered'. overwrite @@ -332,31 +342,42 @@ def filter_numerical( ------- The updated SpatialData object. """ + if target not in {"obs", "var"}: + raise ValueError("target must be either 'obs' or 'var'") + process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) adata = process_table_instance._get_adata().copy() - start = adata.shape[0] - - if numerical_column not in adata.obs.columns: - raise ValueError(f"Column '{numerical_column}' not found in '{table_layer}.obs'. ") - - if not np.issubdtype(adata.obs[numerical_column].dtype, np.number): + start = adata.shape[0] if target == "obs" else adata.shape[1] + + df = getattr(adata, target) + label = "observations" if target == "obs" else "features" + + if numerical_column not in df.columns: raise ValueError( - f"Column '{numerical_column}' must be numeric, but dtype is {adata.obs[numerical_column].dtype}." + f"Column '{numerical_column}' not found in '{table_layer}.{target}'. " + ) + + if not np.issubdtype(df[numerical_column].dtype, np.number): + raise ValueError( + f"Column '{numerical_column}' must be numeric, but dtype is {df[numerical_column].dtype}." ) - # Filter cells based on min and max values - mask = pd.Series(True, index=adata.obs.index) - + # Filter observations based on min and max values + mask = pd.Series(True, index=df.index) + if min_value is not None: - below = (adata.obs[numerical_column] < min_value).sum() - log.info(f"Removed {below} cells below {min_value}.") - mask &= adata.obs[numerical_column] >= min_value + below = (df[numerical_column] < min_value).sum() + log.info(f"Removed {below} {label} below {min_value}.") + mask &= df[numerical_column] >= min_value if max_value is not None: - above = (adata.obs[numerical_column] > max_value).sum() - log.info(f"Removed {above} cells above {max_value}.") - mask &= adata.obs[numerical_column] <= max_value - - adata = adata[mask, :].copy() + above = (df[numerical_column] > max_value).sum() + log.info(f"Removed {above} {label} above {max_value}.") + mask &= df[numerical_column] <= max_value + + if target == "obs": + adata = adata[mask, :].copy() + else: + adata = adata[:, mask].copy() sdata = add_table_layer( sdata, @@ -366,7 +387,7 @@ def filter_numerical( overwrite=overwrite, ) - if update_shapes_layers: + if target == "obs" and update_shapes_layers: for _labels_layer in process_table_instance.labels_layer: sdata = filter_shapes_layer( sdata, @@ -375,9 +396,10 @@ def filter_numerical( prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, ) - filtered = start - adata.shape[0] + removed = start - (adata.shape[0] if target == "obs" else adata.shape[1]) + kept = start - removed log.info( - f"Removed {filtered} / {start} cells based on '{numerical_column}' (min={min_value}, max={max_value}) and kept {adata.shape[0]}." + f"Removed {removed}/{start} {label} based on '{numerical_column}' (min={min_value}, max={max_value}) and kept {kept}." ) return sdata @@ -388,43 +410,49 @@ def filter_categorical( labels_layer: list[str], table_layer: str, output_layer: str, - categorical_column: str, - include_values: str | Sequence[str] | None = None, - exclude_values: str | Sequence[str] | None = None, + categorical_column: Optional[str] = None, + include_values: Optional[Union[str, Sequence[str]]] = None, + exclude_values: Optional[Union[str, Sequence[str]]] = None, + target: Literal["obs", "var"] = "obs", update_shapes_layers: bool = True, prefix_filtered_shapes_layer: str = "filtered", overwrite: bool = False, ) -> SpatialData: - """Filter cells based on categorical values in `.obs`. - - Removes or keeps cells based on specific values in a categorical column of - `sdata.tables[table_layer].obs`. + """ + Removes or keeps observations (obs) or features (var) based on specific values in a categorical column of + `.obs` or `.var`. or based on their index. Parameters ---------- sdata The SpatialData object. labels_layer - The labels layer(s) of `sdata` used to select the cells via the _REGION_KEY in `sdata.tables[table_layer].obs`. + The labels layer(s) of `sdata` used to select the observations via the _REGION_KEY in `sdata.tables[table_layer].obs`. Note that if `output_layer` is equal to `table_layer` and overwrite is True, - cells in `sdata.tables[table_layer]` linked to other `labels_layer` (via the _REGION_KEY), will be removed from `sdata.tables[table_layer]` + observations in `sdata.tables[table_layer]` linked to other `labels_layer` (via the _REGION_KEY), will be removed from `sdata.tables[table_layer]` (also from the backing zarr store if it is backed). table_layer The table layer in `sdata`. output_layer The output table layer in `sdata`. categorical_column - Name of the categorical column in `.obs` to use for filtering. + Name of categorical column to use for filtering. + If `target='obs'`, this refers to `.obs[categorical_column]`. + If `target='var'`, this refers to `.var[categorical_column]`. + If None, filtering is performed based on the index. include_values - Value(s) to keep. Only cells whose `categorical_column` matches one of these + Value(s) to keep. Only observations/features whose `categorical_column` matches one of these values will be kept. Mutually exclusive with `exclude_values`. exclude_values - Value(s) to remove. Cells whose `categorical_column` matches one of these + Value(s) to remove. Only observations/features whose `categorical_column` matches one of these values will be removed. Mutually exclusive with `include_values`. + target + Whether to filter observations ('obs') or features ('var'). update_shapes_layers Whether to filter the shapes layers associated with `labels_layer`. - If set to `True`, cells that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. + If set to `True`, observations that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. Filtered shapes will be added to `sdata` with prefix `prefix_filtered_shapes_layer`. + Ignored if `target='var'`. prefix_filtered_shapes_layer prefix to use for filtered shapes layer if update_shapes_layers is True. Defaults to 'filtered'. overwrite @@ -437,12 +465,18 @@ def filter_categorical( if include_values is not None and exclude_values is not None: raise ValueError("Specify only one of 'include_values' or 'exclude_values'.") + if target not in {"obs", "var"}: + raise ValueError("target must be either 'obs' or 'var'") + process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) adata = process_table_instance._get_adata().copy() - start = adata.shape[0] - - if categorical_column not in adata.obs.columns: - raise ValueError(f"Column '{categorical_column}' not found in '{table_layer}.obs'.") + start = adata.shape[0] if target == "obs" else adata.shape[1] + + df = getattr(adata, target) + label = "observations" if target == "obs" else "features" + + if categorical_column is not None and categorical_column not in df.columns: + raise ValueError(f"Column '{categorical_column}' not found in '{table_layer}.{target}'.") # Ensure include/exclude are lists if isinstance(include_values, str): @@ -451,21 +485,25 @@ def filter_categorical( exclude_values = [exclude_values] # Filter - mask = pd.Series(True, index=adata.obs.index) - - if include_values is not None: - kept = adata.obs[categorical_column].isin(include_values).sum() - removed = (~adata.obs[categorical_column].isin(include_values)).sum() - log.info(f"Found {kept} cells in {include_values} to keep.") - mask &= adata.obs[categorical_column].isin(include_values) - - elif exclude_values is not None: - removed = adata.obs[categorical_column].isin(exclude_values).sum() - kept = (~adata.obs[categorical_column].isin(exclude_values)).sum() - log.info(f"Found {removed} cells in {exclude_values} to remove.") - mask &= ~adata.obs[categorical_column].isin(exclude_values) - - adata = adata[mask, :].copy() + mask = pd.Series(True, index=df.index) + + if categorical_column is not None: + # Filtering by column + if include_values is not None: + mask &= df[categorical_column].isin(include_values) + elif exclude_values is not None: + mask &= ~df[categorical_column].isin(exclude_values) + else: + # Filtering by index + if include_values is not None: + mask &= df.index.isin(include_values) + elif exclude_values is not None: + mask &= ~df.index.isin(exclude_values) + + if target == "obs": + adata = adata[mask, :].copy() + else: + adata = adata[:, mask].copy() sdata = add_table_layer( sdata, @@ -475,7 +513,7 @@ def filter_categorical( overwrite=overwrite, ) - if update_shapes_layers: + if target == "obs" and update_shapes_layers: for _labels_layer in process_table_instance.labels_layer: sdata = filter_shapes_layer( sdata, @@ -484,13 +522,11 @@ def filter_categorical( prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, ) - filtered = start - adata.shape[0] - log.info( - f"Removed {filtered} / {start} cells based on '{categorical_column}' " - f"({'included' if include_values is not None else 'excluded'}: " - f"{include_values if include_values is not None else exclude_values}) " - f"and kept {adata.shape[0]}." - ) + kept = (adata.shape[0] if target == "obs" else adata.shape[1]) + removed = start - kept + if categorical_column is None: + categorical_column = "index" + log.info(f"Removed {removed}/{start} {label} based on '{categorical_column}' filtering (kept {kept}).") return sdata From cbeb7dde941e72f0e9eb61cd63cd55a2960693ac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 16:24:43 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/harpy/shape/__init__.pyi | 9 +- src/harpy/shape/_filters.py | 162 +++++++----- src/harpy/shape/_shape.py | 71 +++-- src/harpy/table/__init__.pyi | 4 +- src/harpy/table/_region_annotations.py | 353 ++++++++++++------------- src/harpy/table/_table.py | 65 +++-- 6 files changed, 335 insertions(+), 329 deletions(-) diff --git a/src/harpy/shape/__init__.pyi b/src/harpy/shape/__init__.pyi index 3a08db60..bace2cb3 100644 --- a/src/harpy/shape/__init__.pyi +++ b/src/harpy/shape/__init__.pyi @@ -1,6 +1,11 @@ from ._cell_expansion import create_voronoi_boundaries -from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, vectorize, prep_region_annotations -from ._filters import morphological_features, filter_shapes_numerical, filter_shapes_categorical, filter_shapes_by_shapes +from ._filters import ( + filter_shapes_by_shapes, + filter_shapes_categorical, + filter_shapes_numerical, + morphological_features, +) +from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, prep_region_annotations, vectorize __all__ = [ "add_shapes_layer", diff --git a/src/harpy/shape/_filters.py b/src/harpy/shape/_filters.py index f7aeeb56..027c77ac 100755 --- a/src/harpy/shape/_filters.py +++ b/src/harpy/shape/_filters.py @@ -1,17 +1,20 @@ -from typing import Optional, Dict, Union, List, Literal, Sequence -from spatialdata import SpatialData +from collections.abc import Sequence +from typing import Literal + import geopandas as gpd -import pandas as pd import numpy as np -from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString +import pandas as pd +from shapely.geometry import LineString, MultiPolygon, Polygon from shapely.ops import unary_union -from harpy.shape._shape import add_shapes_layer - +from spatialdata import SpatialData from spatialdata.transformations import get_transformation +from harpy.shape._shape import add_shapes_layer from harpy.utils.pylogger import get_pylogger + log = get_pylogger(__name__) + # Extract exterior and interior boundaries def _extract_exterior_lines(geom): if isinstance(geom, Polygon): @@ -20,6 +23,7 @@ def _extract_exterior_lines(geom): return [poly.exterior for poly in geom.geoms] return [] + def _extract_interior_lines(geom): if isinstance(geom, Polygon): return list(geom.interiors) @@ -30,6 +34,7 @@ def _extract_interior_lines(geom): return lines return [] + def morphological_features( sdata: SpatialData, shapes_layer: str, @@ -43,30 +48,30 @@ def morphological_features( Compute morphological features for polygons in a shapes layer. Only Polygon and MultiPolygon are supported. It is recommended to run `hp.sh.prep_region_annotations` first when computing features for region annotations that may contain multipolygons, point geometries, unnamed annotations, etc. - - - `area`: Area of the polygon (px² or µm²). - - `perimeter`: Perimeter length (px or µm). - - `equivalent_diameter`: Diameter of a circle with the same area (px or µm). - - `convex_area`: Area of convex hull (px² or µm²). + + - `area`: Area of the polygon (px² or µm²). + - `perimeter`: Perimeter length (px or µm). + - `equivalent_diameter`: Diameter of a circle with the same area (px or µm). + - `convex_area`: Area of convex hull (px² or µm²). - `convex_perimeter`: Perimeter of convex hull (px or µm). - - `circularity`: 4π * area / perimeter². + - `circularity`: 4π * area / perimeter². → 1 for a perfect circle; lower = irregular shape. - - `compactness`: perimeter² / area. - - `solidity`: area / convex_area. + - `compactness`: perimeter² / area. + - `solidity`: area / convex_area. → Low values mean concave or fragmented shapes. - - `convexity`: convex_perimeter / perimeter. + - `convexity`: convex_perimeter / perimeter. → 1 for perfectly convex shapes. Lower for rough or spiky boundaries. - - `centroid_x`: X-coordinate of the polygon centroid (px). + - `centroid_x`: X-coordinate of the polygon centroid (px). - `centroid_y`: Y-coordinate of the polygon centroid (px). - - `centroid_dif`: Distance between polygon and convex hull centroids normalized by the polygon area. + - `centroid_dif`: Distance between polygon and convex hull centroids normalized by the polygon area. → Captures off-centered concavity or asymmetry. - - `major_axis_length`: Length of the longest side of the minimum rotated bounding box (px or µm). - - `minor_axis_length`: Length of the shortest side of the minimum rotated bounding box (px or µm). - - `major_minor_axis_ratio`: major_axis_length / minor_axis_length. - - `num_vertices`: Number of vertices along exterior boundaries. - - `boundary_complexity`: num_vertices / perimeter. - → Normalized measure of boundary irregularity. - - `num_holes`: Count of internal holes. + - `major_axis_length`: Length of the longest side of the minimum rotated bounding box (px or µm). + - `minor_axis_length`: Length of the shortest side of the minimum rotated bounding box (px or µm). + - `major_minor_axis_ratio`: major_axis_length / minor_axis_length. + - `num_vertices`: Number of vertices along exterior boundaries. + - `boundary_complexity`: num_vertices / perimeter. + → Normalized measure of boundary irregularity. + - `num_holes`: Count of internal holes. - `hole_area`: Area covered by internal holes (px² or µm²) Parameters @@ -84,10 +89,10 @@ def morphological_features( the same name in `shape_names_column`. pixel_size_um Scale factor to convert geometric measurements from pixel to micron units. - - Applied to: + - Applied to: * "perimeter", "major_axis_length", and "minor_axis_length" → scaled by (pixel_size_um) - * "area" and "convex_area" → scaled by (pixel_size_um)² - - Dimensionless ratios (e.g., "circularity", "compactness", "solidity", "convexity", "centroid_dif", "major_minor_axis_ratio") + * "area" and "convex_area" → scaled by (pixel_size_um)² + - Dimensionless ratios (e.g., "circularity", "compactness", "solidity", "convexity", "centroid_dif", "major_minor_axis_ratio") are unaffected by scaling but are computed using the scaled geometric quantities. - XY-coordinates are kept in original units. Defaults to 1.0 (no scaling, i.e. units remain in pixels). @@ -101,19 +106,23 @@ def morphological_features( """ # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() - + # Filter out geometries that are not Polygon or MultiPolygon supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) unssuported = len(gdf) - supported_mask.sum() if unssuported > 0: - raise ValueError(f"Found {unssuported} non-polygon geometries in {shapes_layer}. Consider running `hp.sh.prep_region_annotations` first to clean up geometries.") - if unssuported == len(gdf): - log.warning(f"No supported geometries (Polygon, MultiPolygon) found. Skipping addition of shapes layer '{output_shapes_layer}' to sdata.") + raise ValueError( + f"Found {unssuported} non-polygon geometries in {shapes_layer}. Consider running `hp.sh.prep_region_annotations` first to clean up geometries." + ) + if unssuported == len(gdf): + log.warning( + f"No supported geometries (Polygon, MultiPolygon) found. Skipping addition of shapes layer '{output_shapes_layer}' to sdata." + ) return sdata gdf = gdf[supported_mask].copy() - + # Multipolygon check n_multipolygons = gdf.geometry.apply(lambda x: isinstance(x, MultiPolygon)).sum() if n_multipolygons > 0: @@ -121,7 +130,7 @@ def morphological_features( f"Detected {n_multipolygons} MultiPolygon geometries in '{shapes_layer}'. " "Consider running `hp.sh.prep_region_annotations` first to split MultiPolygons before feature extraction." ) - + # Dissolve polygons by name grouped_gdf = None if grouped_features: @@ -156,8 +165,7 @@ def _compute_features(gdf: gpd.GeoDataFrame, pixel_size_um, suffix: str = ""): gdf[f"centroid_y{suffix}"] = gdf.geometry.centroid.y hull_centroids = convex_hulls.centroid gdf[f"centroid_dif{suffix}"] = np.sqrt( - (gdf[f"centroid_x{suffix}"] - hull_centroids.x) ** 2 + - (gdf[f"centroid_y{suffix}"] - hull_centroids.y) ** 2 + (gdf[f"centroid_x{suffix}"] - hull_centroids.x) ** 2 + (gdf[f"centroid_y{suffix}"] - hull_centroids.y) ** 2 ) / np.sqrt(gdf[f"area{suffix}"]) # Major/minor axis (from rotated rectangle) @@ -165,7 +173,7 @@ def _rotated_rect_axes(geom): try: rect = geom.minimum_rotated_rectangle x, y = rect.exterior.coords.xy - edges = np.sqrt(np.diff(x)**2 + np.diff(y)**2) + edges = np.sqrt(np.diff(x) ** 2 + np.diff(y) ** 2) edges = np.sort(edges[:-1]) # remove duplicate closing edge minor, major = edges[0], edges[1] ratio = major / minor if minor > 0 else np.nan @@ -185,14 +193,12 @@ def _num_vertices(geom): return sum(len(line.coords) for line in exteriors) gdf[f"num_vertices{suffix}"] = gdf.geometry.apply(_num_vertices) - gdf[f"boundary_complexity{suffix}"] = ( - gdf[f"num_vertices{suffix}"] / gdf[f"perimeter{suffix}"] - ) + gdf[f"boundary_complexity{suffix}"] = gdf[f"num_vertices{suffix}"] / gdf[f"perimeter{suffix}"] # Hole topology def _num_holes(geom): return len(_extract_interior_lines(geom)) - + def _hole_area(g): interiors = _extract_interior_lines(g) if not interiors: @@ -206,11 +212,11 @@ def _hole_area(g): gdf[f"num_holes{suffix}"] = gdf.geometry.apply(_num_holes) gdf[f"hole_area{suffix}"] = gdf.geometry.apply(_hole_area) * pixel_size_um**2 - + return gdf gdf = _compute_features(gdf, pixel_size_um, "") - + if grouped_features and not grouped_gdf.empty: grouped_gdf = _compute_features(grouped_gdf, pixel_size_um, "-grouped") @@ -220,7 +226,6 @@ def _hole_area(g): for col in grouped_metrics: gdf[col] = gdf[shape_names_column].map(grouped_lookup[col]) - # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. assert get_transformation(gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) @@ -272,14 +277,10 @@ def filter_shapes_numerical( gdf = sdata.shapes[shapes_layer].copy() if numerical_column not in gdf.columns: - raise ValueError( - f"Column '{numerical_column}' not found in '{shapes_layer}.obs'. " - ) - + raise ValueError(f"Column '{numerical_column}' not found in '{shapes_layer}.obs'. ") + if not np.issubdtype(gdf[numerical_column].dtype, np.number): - raise ValueError( - f"Column '{numerical_column}' must be numeric, but dtype is {gdf[numerical_column].dtype}." - ) + raise ValueError(f"Column '{numerical_column}' must be numeric, but dtype is {gdf[numerical_column].dtype}.") # Filter cells based on min and max values start = len(gdf) @@ -297,7 +298,9 @@ def filter_shapes_numerical( filtered_gdf = gdf[mask].copy() kept = len(filtered_gdf) removed = start - len(filtered_gdf) - log.info(f"Removed {removed}/{start} polygons outside {min_value}–{max_value} for '{numerical_column}' (kept {kept}).") + log.info( + f"Removed {removed}/{start} polygons outside {min_value}–{max_value} for '{numerical_column}' (kept {kept})." + ) # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. assert get_transformation(filtered_gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) @@ -317,9 +320,9 @@ def filter_shapes_categorical( sdata: SpatialData, shapes_layer: str, output_shapes_layer: str, - categorical_column: Optional[str] = None, - include_values: Optional[Union[str, Sequence[str]]] = None, - exclude_values: Optional[Union[str, Sequence[str]]] = None, + categorical_column: str | None = None, + include_values: str | Sequence[str] | None = None, + exclude_values: str | Sequence[str] | None = None, overwrite: bool = False, ) -> SpatialData: """ @@ -353,7 +356,7 @@ def filter_shapes_categorical( raise ValueError("Specify only one of 'include_values' or 'exclude_values'.") gdf = sdata.shapes[shapes_layer].copy() - + if categorical_column is not None and categorical_column not in gdf.columns: raise ValueError(f"Column '{categorical_column}' not found in '{shapes_layer}'.") @@ -379,7 +382,7 @@ def filter_shapes_categorical( mask &= gdf.index.isin(include_values) elif exclude_values is not None: mask &= ~gdf.index.isin(exclude_values) - + filtered_gdf = gdf[mask].copy() kept = len(filtered_gdf) removed = start - kept @@ -406,15 +409,25 @@ def filter_shapes_by_shapes( target_shapes_layer: str, mask_shapes_layer: str, output_shapes_layer: str, - shape_names_column: Optional[str] = None, - shape_names: Optional[Union[str, List[str]]] = None, - mode: Literal["intersects", "within", "centroid_within", "touches", "disjoint", "overlap_fraction", "edge", "outer_edge", "inner_edge"] = "centroid_within", + shape_names_column: str | None = None, + shape_names: str | list[str] | None = None, + mode: Literal[ + "intersects", + "within", + "centroid_within", + "touches", + "disjoint", + "overlap_fraction", + "edge", + "outer_edge", + "inner_edge", + ] = "centroid_within", keep: bool = True, - threshold: Optional[float] = None, + threshold: float | None = None, overwrite: bool = False, ): """ - Filter polygons in a target shapes layer based on geometric relationships to polygons of a mask shapes layer. + Filter polygons in a target shapes layer based on geometric relationships to polygons of a mask shapes layer. A typical use-case would be to filter cell segmentations based on their location within annotated regions. Parameters @@ -449,15 +462,14 @@ def filter_shapes_by_shapes( Threshold to use for overlap_fraction. overwrite If True, overwrites the output shapes layer if it exists. - + Returns ------- SpatialData object with filtered shapes layer. """ - # Copy target layer target_gdf = sdata.shapes[target_shapes_layer].copy() - + # Copy mask layer mask_gdf = sdata.shapes[mask_shapes_layer].copy() @@ -470,7 +482,7 @@ def filter_shapes_by_shapes( mask_gdf = mask_gdf[mask_gdf[shape_names_column].isin(shape_names)].copy() if mask_gdf.empty: raise ValueError(f"No geometries found in '{mask_shapes_layer}' matching {shape_names}.") - + # Build mask union mask_union = mask_gdf.unary_union @@ -493,7 +505,7 @@ def filter_shapes_by_shapes( fraction = overlaps.area / intersecting.geometry.area condition = pd.Series(False, index=target_gdf.index) condition.loc[intersecting.index] = fraction > threshold - elif mode == "edge": + elif mode == "edge": condition = target_gdf.geometry.intersects(mask_union) & ~target_gdf.geometry.within(mask_union) elif mode in ["outer_edge", "inner_edge"]: outer_boundaries = [] @@ -507,13 +519,19 @@ def filter_shapes_by_shapes( outer_boundaries.extend(_extract_exterior_lines(mask_union)) inner_boundaries.extend(_extract_interior_lines(mask_union)) - outer_boundary_union = unary_union([LineString(boundary) for boundary in outer_boundaries]) if outer_boundaries else None - inner_boundary_union = unary_union([LineString(boundary) for boundary in inner_boundaries]) if inner_boundaries else None + outer_boundary_union = ( + unary_union([LineString(boundary) for boundary in outer_boundaries]) if outer_boundaries else None + ) + inner_boundary_union = ( + unary_union([LineString(boundary) for boundary in inner_boundaries]) if inner_boundaries else None + ) if mode == "outer_edge": condition = target_gdf.geometry.intersects(outer_boundary_union) elif mode == "inner_edge": - condition = target_gdf.geometry.intersects(inner_boundary_union) if inner_boundary_union is not None else False + condition = ( + target_gdf.geometry.intersects(inner_boundary_union) if inner_boundary_union is not None else False + ) else: raise ValueError(f"Unsupported mode '{mode}'.") @@ -526,9 +544,11 @@ def filter_shapes_by_shapes( log.info(f"Kept {len(filtered_gdf)} / {len(target_gdf)} geometries (removed {removed}).") else: log.info(f"Removed {removed} / {len(target_gdf)} geometries (kept {len(filtered_gdf)}).") - + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. - assert get_transformation(filtered_gdf, get_all=True) == get_transformation(sdata[target_shapes_layer], get_all=True) + assert get_transformation(filtered_gdf, get_all=True) == get_transformation( + sdata[target_shapes_layer], get_all=True + ) # Add to SpatialData sdata = add_shapes_layer( diff --git a/src/harpy/shape/_shape.py b/src/harpy/shape/_shape.py index e6a0cf75..52eabd24 100644 --- a/src/harpy/shape/_shape.py +++ b/src/harpy/shape/_shape.py @@ -1,20 +1,20 @@ from __future__ import annotations +import numpy as np from dask.array import Array from geopandas import GeoDataFrame -from shapely.geometry import GeometryCollection, MultiPolygon, Polygon, Point, MultiPoint, LineString, MultiLineString +from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon from spatialdata import SpatialData from spatialdata.models._utils import MappingToCoordinateSystem_t from spatialdata.transformations import get_transformation from harpy.image._image import _get_spatial_element from harpy.shape._manager import ShapesLayerManager - -import numpy as np - from harpy.utils.pylogger import get_pylogger + log = get_pylogger(__name__) + def vectorize( sdata, labels_layer: str, @@ -184,7 +184,7 @@ def intersect_rectangles(rect1: list[int | float], rect2: list[int | float]) -> else: return None - + def prep_region_annotations( sdata: SpatialData, shapes_layer: str, @@ -218,38 +218,37 @@ def prep_region_annotations( unnamed Name to be assigned to any unnamed geometries in `shape_names_column`. Defaults to 'unnamed'. unique_shape_names_column - Column name in which unique names will be created for single polygons by appending a counter to the original name in `shape_names_column` for polygons with the same name. Note - that multipolygons will be split in individual polygons and each will get a unique name based on the original name of the multipolygon. Unique names will be stored in + Column name in which unique names will be created for single polygons by appending a counter to the original name in `shape_names_column` for polygons with the same name. Note + that multipolygons will be split in individual polygons and each will get a unique name based on the original name of the multipolygon. Unique names will be stored in `{shape_names_column}-unique` in the updated shapes layer. erosion - Number of pixels to erode polygons by. This can avoid problems with overlapping edges of geometries when calculating distances. Default is 0.5 (i.e. erosion by 0.5 pixels). + Number of pixels to erode polygons by. This can avoid problems with overlapping edges of geometries when calculating distances. Default is 0.5 (i.e. erosion by 0.5 pixels). overwrite If True, overwrites the `output_shapes_layer` if it already exists in `sdata`. - + Returns ------- Modified `sdata` object with updated updated shapes layer. """ - # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() log.info(f"Found {len(gdf)} geometries in {shapes_layer}.") - + # Ensure shape_names_column exist if shape_names_column not in gdf.columns: gdf[shape_names_column] = unnamed - + gdf[shape_names_column] = gdf[shape_names_column].fillna("").astype(str) unnamed_mask = gdf[shape_names_column] == "" - for i, idx in enumerate(gdf[unnamed_mask].index): + for _i, idx in enumerate(gdf[unnamed_mask].index): gdf.at[idx, shape_names_column] = unnamed - + # Convert Points with a radius column to circular polygons def _point_to_circle(geom, radius=None): if isinstance(geom, Point) and radius is not None: return geom.buffer(radius, resolution=16) return geom - + if "radius" in gdf.columns: mask_points_with_radius = gdf.geometry.geom_type.eq("Point") & gdf["radius"].notna() n_converted = mask_points_with_radius.sum() @@ -257,44 +256,43 @@ def _point_to_circle(geom, radius=None): if n_converted > 0: log.info(f"Converting {n_converted} Point geometries with 'radius' to circular polygons.") gdf.loc[mask_points_with_radius, "geometry"] = gdf.loc[mask_points_with_radius].apply( - lambda row: _point_to_circle(row.geometry, getattr(row, "radius", None)), - axis=1 + lambda row: _point_to_circle(row.geometry, getattr(row, "radius", None)), axis=1 ) - + # Slightly erode all polygons (this avoids any shared borders between polygons) gdf["geometry"] = gdf.geometry.apply( - lambda geom: geom.buffer( - -erosion, - join_style=2, - resolution=16) - if geom.geom_type in ["Polygon", "MultiPolygon"] else geom + lambda geom: geom.buffer(-erosion, join_style=2, resolution=16) + if geom.geom_type in ["Polygon", "MultiPolygon"] + else geom ) - polygon_mask = ~gdf.geometry.is_empty - removed = len(gdf) - polygon_mask.sum() - gdf = gdf[polygon_mask] # drop any polygons that collapsed to empty - if removed > 0: + polygon_mask = ~gdf.geometry.is_empty + removed = len(gdf) - polygon_mask.sum() + gdf = gdf[polygon_mask] # drop any polygons that collapsed to empty + if removed > 0: log.info(f"Removed {removed} polygons that collapsed to empty after erosion.") - + # Explode multipolygons into single polygons (this allows us to treat multipolygons as unique polygons) n_multipolygons = gdf.geometry.apply(lambda g: g.geom_type == "MultiPolygon").sum() gdf = gdf.explode(index_parts=False).reset_index(drop=True) n_after = len(gdf) - log.info(f"Split {n_multipolygons} multipolygons into individual polygons. Total number of geometries after splitting multipolgons is {n_after}.") - + log.info( + f"Split {n_multipolygons} multipolygons into individual polygons. Total number of geometries after splitting multipolgons is {n_after}." + ) + # Create unique names sizes = gdf.groupby(shape_names_column)[shape_names_column].transform("size") counter = gdf.groupby(shape_names_column).cumcount() + 1 gdf[unique_shape_names_column] = np.where( - sizes.eq(1), - gdf[shape_names_column], - gdf[shape_names_column] + counter.astype(str) + sizes.eq(1), gdf[shape_names_column], gdf[shape_names_column] + counter.astype(str) ) - + # Avoid index 0 if 0 in gdf.index: - log.warning("Index 0 found in shapes layer — shifting all indices by +1 since index 0 is reserved for background in labels layers.") + log.warning( + "Index 0 found in shapes layer — shifting all indices by +1 since index 0 is reserved for background in labels layers." + ) gdf.index = gdf.index + 1 - + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. assert get_transformation(gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) @@ -307,4 +305,3 @@ def _point_to_circle(geom, radius=None): ) return sdata - diff --git a/src/harpy/table/__init__.pyi b/src/harpy/table/__init__.pyi index 6df1c2eb..a254ba3a 100644 --- a/src/harpy/table/__init__.pyi +++ b/src/harpy/table/__init__.pyi @@ -4,14 +4,14 @@ from ._annotation import cluster_cleanliness, score_genes, score_genes_iter from ._clustering import kmeans, leiden from ._enrichment import nhood_enrichment from ._preprocess import preprocess_proteomics, preprocess_transcriptomics +from ._region_annotations import assign_cells_to_shapes, compute_distance_to_shapes from ._regionprops import add_regionprop_features -from ._table import add_table_layer, correct_marker_genes, filter_on_size, filter_numerical, filter_categorical +from ._table import add_table_layer, correct_marker_genes, filter_categorical, filter_numerical, filter_on_size from .cell_clustering._clustering import flowsom from .cell_clustering._preprocess import cell_clustering_preprocess from .cell_clustering._weighted_channel_expression import weighted_channel_expression from .pixel_clustering._cluster_intensity import cluster_intensity from .pixel_clustering._neighbors import spatial_pixel_neighbors -from ._region_annotations import assign_cells_to_shapes, compute_distance_to_shapes __all__ = [ "add_table_layer", diff --git a/src/harpy/table/_region_annotations.py b/src/harpy/table/_region_annotations.py index 3b2171eb..78944fff 100755 --- a/src/harpy/table/_region_annotations.py +++ b/src/harpy/table/_region_annotations.py @@ -1,14 +1,17 @@ -from spatialdata import SpatialData +from typing import Literal + import geopandas as gpd import pandas as pd -from typing import Literal -from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString -from harpy.table._table import add_table_layer -from harpy.utils._keys import _REGION_KEY, _INSTANCE_KEY +from shapely.geometry import LineString, MultiLineString, MultiPolygon, Point, Polygon +from spatialdata import SpatialData +from harpy.table._table import add_table_layer +from harpy.utils._keys import _REGION_KEY from harpy.utils.pylogger import get_pylogger + log = get_pylogger(__name__) + def assign_cells_to_shapes( sdata: SpatialData, shapes_layer: str, @@ -20,7 +23,7 @@ def assign_cells_to_shapes( mode: Literal["original_names", "unique_names", "both"] = "original_names", create_column_per_shape: bool = False, overlap_tolerance: float = 0.1, - spatial_key: str = 'spatial', + spatial_key: str = "spatial", xy_columns: tuple = None, overwrite: bool = False, ): @@ -33,7 +36,7 @@ def assign_cells_to_shapes( The SpatialData object containing the input table layer and shapes layer. shapes_layer The shapes layer in `sdata.shapes` to use as input. - table_layer + table_layer The table layer in `sdata.tables` to use as input. output_table_layer The output table layer in `sdata.tables` to which the updated table layer will be written. @@ -57,25 +60,24 @@ def assign_cells_to_shapes( Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. xy_columns Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. - If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. overwrite If True, overwrites the `output_table_layer` and/or `output_shapes_layer` if it already exists in `sdata`. - + Notes - ------- - - Only `Polygon` and `MultiPolygon` geometries are supported. Non-polygon geometries (e.g., `Point`, `LineString`) are skipped. - + ----- + - Only `Polygon` and `MultiPolygon` geometries are supported. Non-polygon geometries (e.g., `Point`, `LineString`) are skipped. + Returns ------- Modified `sdata` object with updated table layer. """ - if not output_column and not create_column_per_shape: raise ValueError("Specify `output_column` or set `create_column_per_shape=True`.") - + # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() - + # Create copy of table layer adata = sdata.tables[table_layer].copy() @@ -84,19 +86,25 @@ def assign_cells_to_shapes( skipped = len(gdf) - supported_mask.sum() if skipped > 0: log.info(f"Skipped {skipped} non-polygon geometries in {shapes_layer}.") - if skipped == len(gdf): - log.warning(f"No supported geometries (Polygon, MultiPolygon) found. Skipping addition of table layer '{output_table_layer}' to sdata.") + if skipped == len(gdf): + log.warning( + f"No supported geometries (Polygon, MultiPolygon) found. Skipping addition of table layer '{output_table_layer}' to sdata." + ) return sdata - + gdf = gdf[supported_mask].copy().reset_index(drop=True) # Check for overlapping polygons total_area = gdf.geometry.area.sum() union_area = gdf.geometry.unary_union.area if total_area - union_area > overlap_tolerance and not create_column_per_shape: - raise ValueError(f"Overlapping polygons detected in {shapes_layer}. Correct polygons or use create_column_per_shape.") + raise ValueError( + f"Overlapping polygons detected in {shapes_layer}. Correct polygons or use create_column_per_shape." + ) elif total_area - union_area > 0 and not create_column_per_shape: - log.warning(f"Overlaps detected (Δ={total_area - union_area:.3f}), below tolerance threshold {overlap_tolerance}.") + log.warning( + f"Overlaps detected (Δ={total_area - union_area:.3f}), below tolerance threshold {overlap_tolerance}." + ) # Get cell coordinates if xy_columns is not None: @@ -104,8 +112,7 @@ def assign_cells_to_shapes( coords = adata.obs[[x_col, y_col]].to_numpy() else: if spatial_key not in adata.obsm: - raise KeyError( - f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + raise KeyError(f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") coords = adata.obsm[spatial_key] if coords.shape[1] != 2: raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") @@ -113,19 +120,19 @@ def assign_cells_to_shapes( # Function to assign cell to a polygon def _assign_region(x, y, gdf, column, sindex): point = Point(x, y) - + # Quick bounding box search to prefilter the cells candidate_idx = list(sindex.intersection(point.bounds)) if not candidate_idx: return None - + # Slower point‑in‑polygon search candidate_gdf = gdf.iloc[candidate_idx] match = candidate_gdf[candidate_gdf.contains(point)] - + if match.empty: return None - + return match[column].iloc[0] # Assign cells to polygons @@ -152,10 +159,8 @@ def _assign_region(x, y, gdf, column, sindex): if gdf_shape.empty: continue - sidx_shape = gdf_shape.sindex # Build spatial index - assigned = [ - _assign_region(x, y, gdf_shape, name_column, sidx_shape) for x, y in coords - ] + sidx_shape = gdf_shape.sindex # Build spatial index + assigned = [_assign_region(x, y, gdf_shape, name_column, sidx_shape) for x, y in coords] adata.obs[col_name] = assigned else: @@ -168,24 +173,22 @@ def _assign_region(x, y, gdf, column, sindex): for output_column_name, name_column in run_assign_dict.items(): # Assign one shape name per cell (to be avoided with overlapping regions) - sidx = gdf.sindex # Build spatial index - assigned = [ - _assign_region(x, y, gdf, name_column, sidx) - for x, y in coords - ] + sidx = gdf.sindex # Build spatial index + assigned = [_assign_region(x, y, gdf, name_column, sidx) for x, y in coords] adata.obs[output_column_name] = assigned - # Add table layer + # Add table layer sdata = add_table_layer( sdata, - adata = adata, - output_layer = output_table_layer, - region = adata.obs[_REGION_KEY].cat.categories.to_list(), - overwrite = overwrite, + adata=adata, + output_layer=output_table_layer, + region=adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite=overwrite, ) - + return sdata + def compute_distance_to_shapes( sdata: SpatialData, shapes_layer: str, @@ -195,14 +198,27 @@ def compute_distance_to_shapes( unique_shape_names_column: str = "name-unique", output_name: str = None, pixel_size_um: float = 1.0, - modes: list[Literal[ - "nearest_edge", "nearest_edge_grouped", "all_edges", - "nearest_outer_edge", "nearest_outer_edge_grouped", "all_outer_edges", - "nearest_inner_edge", "nearest_inner_edge_grouped", "nearest_inner_edge_grouped_unique", "all_inner_edges", - "nearest_centroid", "nearest_centroid_grouped", "all_centroids", - "nearest_point", "nearest_point_grouped", "all_points" - ]] = ["all_edges"], - spatial_key: str = 'spatial', + modes: list[ + Literal[ + "nearest_edge", + "nearest_edge_grouped", + "all_edges", + "nearest_outer_edge", + "nearest_outer_edge_grouped", + "all_outer_edges", + "nearest_inner_edge", + "nearest_inner_edge_grouped", + "nearest_inner_edge_grouped_unique", + "all_inner_edges", + "nearest_centroid", + "nearest_centroid_grouped", + "all_centroids", + "nearest_point", + "nearest_point_grouped", + "all_points", + ] + ] = None, + spatial_key: str = "spatial", xy_columns: tuple = None, overwrite: bool = False, ): @@ -216,7 +232,7 @@ def compute_distance_to_shapes( The SpatialData object containing the input table layer and shapes layer. shapes_layer The shapes layer in `sdata.shapes` to use as input. - table_layer + table_layer The table layer in `sdata.tables` to use as input. output_table_layer The output table layer in `sdata.tables` to which the updated table layer will be written. @@ -232,39 +248,39 @@ def compute_distance_to_shapes( Which distance features to calculate. Options include: Edge distances (For Polygon and MultiPolygon geometries) - - `"nearest_edge"`: Distance to the nearest edge (outer or inner) of any polygon. + - `"nearest_edge"`: Distance to the nearest edge (outer or inner) of any polygon. Creates `{output_name}-distance_to_nearest_edge` and `{output_name}-name_of_nearest_edge` columns. - - `"nearest_edge_grouped"`: For each group of shapes with the same name, compute the distance to the edge that is nearest. + - `"nearest_edge_grouped"`: For each group of shapes with the same name, compute the distance to the edge that is nearest. Creates `{output_name}-distance_to_nearest_edge_of_` and `{output_name}-name_of_nearest_edge_of_` columns. - - `"all_edges"`: For each individual polygon, compute the distance to its edge. + - `"all_edges"`: For each individual polygon, compute the distance to its edge. Creates `{output_name}-distance_to_edge_of_` column. - + Outer edge distances (For Polygon and MultiPolygon geometries) - - `"nearest_outer_edge"`: Distance to the nearest outer edge of any polygon. + - `"nearest_outer_edge"`: Distance to the nearest outer edge of any polygon. Creates `{output_name}-distance_to_nearest_outer_edge` and `{output_name}-name_of_nearest_outer_edge` columns. - - `"nearest_outer_edge_grouped"`: For each group of shapes with the same name, compute the distance to the outer edge that is nearest. + - `"nearest_outer_edge_grouped"`: For each group of shapes with the same name, compute the distance to the outer edge that is nearest. Creates `{output_name}-distance_to_nearest_outer_edge_of_` and `{output_name}-name_of_nearest_outer_edge_of_` columns. - - `"all_outer_edges"`: For each individual polygon, compute the distance to its outer edge. + - `"all_outer_edges"`: For each individual polygon, compute the distance to its outer edge. Creates `{output_name}-distance_to_outer_edge_of_` column. - + Inner edge (hole) distances (For Polygon and MultiPolygon geometries) - - `"nearest_inner_edge"`: Distance to the nearest interior edge (“hole”) of any polygon. + - `"nearest_inner_edge"`: Distance to the nearest interior edge (“hole”) of any polygon. Creates `{output_name}-distance_to_nearest_inner_edge` and `{output_name}-name_of_nearest_inner_edge` columns. - - `"nearest_inner_edge_grouped"`: For each group of shapes with the same name, compute the distance to the nearest inner edge of that group. + - `"nearest_inner_edge_grouped"`: For each group of shapes with the same name, compute the distance to the nearest inner edge of that group. Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. - - `"nearest_inner_edge_grouped_unique"`: For each individual polygon, compute the distance to the nearest inner edge of that polygon. + - `"nearest_inner_edge_grouped_unique"`: For each individual polygon, compute the distance to the nearest inner edge of that polygon. Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. - - `"all_inner_edges"`: For all holes, compute the distance to each inner edge. + - `"all_inner_edges"`: For all holes, compute the distance to each inner edge. Creates `{output_name}-distance_to_inner_edge_of_-hole` column. - + Centroid distances (For Polygon and MultiPolygon geometries) - - `"nearest_centroid"`: Distance to the nearest centroid of any polygon. + - `"nearest_centroid"`: Distance to the nearest centroid of any polygon. Creates `{output_name}-distance_to_nearest_centroid` and `{output_name}-name_of_nearest_centroid` columns. - - `"nearest_centroid_grouped"`: For each group of shapes with the same name, compute the distance to the centroid that is nearest. + - `"nearest_centroid_grouped"`: For each group of shapes with the same name, compute the distance to the centroid that is nearest. Creates `{output_name}-distance_to_nearest_centroid_of_` and `{output_name}-name_of_nearest_centroid_of_` columns. - - `"all_centroids"`: For each individual polygon, compute the distance to its centroid. + - `"all_centroids"`: For each individual polygon, compute the distance to its centroid. Creates `{output_name}-distance_to_centroid_of_` column. - + Point distances (For Point geometries) - `"nearest_point"`: Distance to the nearest point. Creates `{output_name}-distance_to_nearest_point` and `{output_name}-name_of_nearest_point` columns. @@ -272,72 +288,71 @@ def compute_distance_to_shapes( Creates `{output_name}-distance_to_nearest_point_of_` and `{output_name}-name_of_nearest_point_of_` columns. - `"all_points"`: For each individual Point, compute the distance to point coordinates. Creates `{output_name}-distance_to_point_` column. - + spatial_key Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. xy_columns Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. - If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. overwrite If True, overwrites the `output_table_layer` if it already exists in `sdata`. - + Notes - ------- + ----- - Only `Polygon`, `MultiPolygon` and `Point` geometries are supported. Other geometries (e.g., `LineString`, `MultiPoint`) are skipped. - + Returns ------- Modified `sdata` object with updated table layer. """ - + if modes is None: + modes = ["all_edges"] if output_name is not None: output_name = f"{output_name}-" elif output_name is None: output_name = "" - + # Create copy of shapes layer gdf = sdata.shapes[shapes_layer].copy() - + # Create copy of table layer adata = sdata.tables[table_layer].copy() - + # Filter out geometries that are not Polygon, MultiPolygon or Point supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon, Point))) skipped = len(gdf) - supported_mask.sum() if skipped > 0: log.info(f"Skipped {skipped} geometries in {shapes_layer} that are not Polygon, MultiPolygon or Point.") - + gdf = gdf[supported_mask].copy().reset_index(drop=True) - + # Separate gdf by type gdf_polygons = gdf[gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])].copy() gdf_points = gdf[gdf.geometry.geom_type == "Point"].copy() has_polygons = not gdf_polygons.empty has_points = not gdf_points.empty - + if not has_polygons and not has_points: - log.warning(f"No supported geometries (Polygon, MultiPolygon, Point) found. Skipping addition of table layer '{output_table_layer}' to sdata.") + log.warning( + f"No supported geometries (Polygon, MultiPolygon, Point) found. Skipping addition of table layer '{output_table_layer}' to sdata." + ) return sdata - + # Get cell coordinates if xy_columns is not None: x_col, y_col = xy_columns coords = adata.obs[[x_col, y_col]].to_numpy() else: if spatial_key not in adata.obsm: - raise KeyError( - f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + raise KeyError(f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") coords = adata.obsm[spatial_key] if coords.shape[1] != 2: raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") - + # Build point GeoDataFrame - pts = gpd.GeoSeries( - [Point(x, y) for x, y in coords], - crs=gdf.crs - ) - + pts = gpd.GeoSeries([Point(x, y) for x, y in coords], crs=gdf.crs) + pts_gdf = gpd.GeoDataFrame({"geometry": pts}, crs=gdf.crs).set_index(adata.obs.index) # Polygon and MultiPolygon @@ -367,21 +382,12 @@ def _extract_interior_lines(geom): ext_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=exteriors, crs=gdf.crs) int_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=interiors, crs=gdf.crs) - # Edges (outer and inner) + # Edges (outer and inner) if "nearest_edge" in modes: log.info("Calculating 'nearest_edge' distances'") - all_edges_gdf = gpd.GeoDataFrame( - pd.concat([ext_gdf, int_gdf], ignore_index=True), - crs=gdf.crs - ) - joined = gpd.sjoin_nearest( - pts_gdf, - all_edges_gdf, - how="left", distance_col="dist" - ) - adata.obs[f"{output_name}distance_to_nearest_edge"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + all_edges_gdf = gpd.GeoDataFrame(pd.concat([ext_gdf, int_gdf], ignore_index=True), crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, all_edges_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_edge"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_edge"] = joined[unique_shape_names_column] if "nearest_edge_grouped" in modes: @@ -389,7 +395,7 @@ def _extract_interior_lines(geom): for name, group in gdf_polygons.groupby(shape_names_column): edges = [] labels = [] - for idx, row in group.iterrows(): + for _idx, row in group.iterrows(): edge_lines = _extract_exterior_lines(row.geometry) + _extract_interior_lines(row.geometry) edges.extend(edge_lines) labels.extend([row[unique_shape_names_column]] * len(edge_lines)) @@ -397,33 +403,31 @@ def _extract_interior_lines(geom): edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}distance_to_nearest_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_edge_of_{name}"] = joined[unique_shape_names_column] - + if "all_edges" in modes: log.info("Calculating 'all_edges' distances'") for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_edge_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.boundary).to_numpy() * pixel_size_um + adata.obs[f"{output_name}distance_to_edge_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.boundary).to_numpy() * pixel_size_um + ) # Outer edges if "nearest_outer_edge" in modes: log.info("Calculating 'nearest_outer_edge' distances'") - joined = gpd.sjoin_nearest( - pts_gdf, - ext_gdf, - how="left", distance_col="dist" - ) - adata.obs[f"{output_name}distance_to_nearest_outer_edge"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + joined = gpd.sjoin_nearest(pts_gdf, ext_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_outer_edge"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_outer_edge"] = joined[unique_shape_names_column] - + if "nearest_outer_edge_grouped" in modes: log.info("Calculating 'nearest_outer_edge_grouped' distances'") for name, group in gdf_polygons.groupby(shape_names_column): edges = [] labels = [] - for idx, row in group.iterrows(): + for _idx, row in group.iterrows(): edge_lines = _extract_exterior_lines(row.geometry) edges.extend(edge_lines) labels.extend([row[unique_shape_names_column]] * len(edge_lines)) @@ -431,13 +435,17 @@ def _extract_interior_lines(geom): edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_outer_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}distance_to_nearest_outer_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_outer_edge_of_{name}"] = joined[unique_shape_names_column] if "all_outer_edges" in modes: log.info("Calculating 'all_outer_edges' distances'") for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_outer_edge_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.exterior).to_numpy() * pixel_size_um + adata.obs[f"{output_name}distance_to_outer_edge_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.exterior).to_numpy() * pixel_size_um + ) # Inner edges hole_geoms = [] @@ -446,8 +454,8 @@ def _extract_interior_lines(geom): for _, row in gdf_polygons.reset_index(drop=True).iterrows(): base = row[unique_shape_names_column] geom = row.geometry - - holes =_extract_interior_lines(geom) + + holes = _extract_interior_lines(geom) if len(holes) == 0: continue @@ -456,29 +464,18 @@ def _extract_interior_lines(geom): hole = holes[0] hole_geoms.append(LineString(hole.coords)) hole_names.append(base) - + else: for i, hole in enumerate(holes, start=1): hole_geoms.append(LineString(hole.coords)) hole_names.append(f"{base}-hole{i}") - holes_gdf = gpd.GeoDataFrame( - {unique_shape_names_column: hole_names}, - geometry=hole_geoms, - crs=gdf.crs - ) + holes_gdf = gpd.GeoDataFrame({unique_shape_names_column: hole_names}, geometry=hole_geoms, crs=gdf.crs) if "nearest_inner_edge" in modes and not holes_gdf.empty: log.info("Calculating 'nearest_inner_edge' distances'") - joined = gpd.sjoin_nearest( - pts_gdf, - holes_gdf, - how="left", - distance_col="dist" - ) - adata.obs[f"{output_name}distance_to_nearest_inner_edge"] = ( - joined["dist"].to_numpy() * pixel_size_um - ) + joined = gpd.sjoin_nearest(pts_gdf, holes_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_inner_edge"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_inner_edge"] = joined[unique_shape_names_column] if "nearest_inner_edge_grouped" in modes: @@ -487,7 +484,7 @@ def _extract_interior_lines(geom): hole_lines = [] hole_labels = [] - for idx, row in group.iterrows(): + for _idx, row in group.iterrows(): base = row[unique_shape_names_column] for i, hole in enumerate(_extract_interior_lines(row.geometry), start=1): hole_lines.append(LineString(hole.coords)) @@ -496,19 +493,18 @@ def _extract_interior_lines(geom): if not hole_lines: continue - hole_gdf = gpd.GeoDataFrame({ - "geometry": hole_lines, - "hole_name": hole_labels - }, crs=gdf.crs) + hole_gdf = gpd.GeoDataFrame({"geometry": hole_lines, "hole_name": hole_labels}, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{name}"] = joined["hole_name"] if "nearest_inner_edge_grouped_unique" in modes: log.info("Calculating 'nearest_inner_edge_grouped_unique' distances'") - for idx, row in gdf_polygons.iterrows(): + for _idx, row in gdf_polygons.iterrows(): base = row[unique_shape_names_column] holes = _extract_interior_lines(row.geometry) @@ -516,16 +512,15 @@ def _extract_interior_lines(geom): continue hole_geoms = [LineString(hole.coords) for hole in holes] - hole_names = [f"{base}-hole{i+1}" for i in range(len(holes))] + hole_names = [f"{base}-hole{i + 1}" for i in range(len(holes))] - hole_gdf = gpd.GeoDataFrame({ - "geometry": hole_geoms, - "hole_name": hole_names - }, crs=gdf.crs) + hole_gdf = gpd.GeoDataFrame({"geometry": hole_geoms, "hole_name": hole_names}, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{base}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{base}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{base}"] = joined["hole_name"] if "all_inner_edges" in modes and not holes_gdf.empty: @@ -535,29 +530,21 @@ def _extract_interior_lines(geom): col = f"{output_name}distance_to_inner_edge_of_{hole_name}" adata.obs[col] = pts.distance(hole.geometry).to_numpy() * pixel_size_um - # Centroids + # Centroids centroids_df = gdf_polygons.reset_index(drop=True)[[unique_shape_names_column]].copy() centroids_df["geometry"] = gdf_polygons.geometry.centroid.reset_index(drop=True) - centroids_gdf = gpd.GeoDataFrame( - centroids_df, - geometry="geometry", - crs=gdf.crs - ) + centroids_gdf = gpd.GeoDataFrame(centroids_df, geometry="geometry", crs=gdf.crs) if "nearest_centroid" in modes: log.info("Calculating 'nearest_centroid' distances'") - joined = gpd.sjoin_nearest( - pts_gdf, centroids_gdf, - how="left", - distance_col="dist" - ) + joined = gpd.sjoin_nearest(pts_gdf, centroids_gdf, how="left", distance_col="dist") adata.obs.loc[joined.index, f"{output_name}distance_to_nearest_centroid"] = ( joined["dist"].to_numpy() * pixel_size_um ) - adata.obs.loc[joined.index, f"{output_name}name_of_nearest_centroid"] = ( - joined[unique_shape_names_column].to_numpy() - ) + adata.obs.loc[joined.index, f"{output_name}name_of_nearest_centroid"] = joined[ + unique_shape_names_column + ].to_numpy() if "nearest_centroid_grouped" in modes: log.info("Calculating 'nearest_centroid_grouped' distances'") @@ -565,28 +552,27 @@ def _extract_interior_lines(geom): centroids = group.geometry.centroid labels = group[unique_shape_names_column].tolist() - centroid_gdf = gpd.GeoDataFrame({ - "geometry": centroids, - unique_shape_names_column: labels - }, crs=gdf.crs) + centroid_gdf = gpd.GeoDataFrame({"geometry": centroids, unique_shape_names_column: labels}, crs=gdf.crs) joined = gpd.sjoin_nearest(pts_gdf, centroid_gdf, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_centroid_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}distance_to_nearest_centroid_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_centroid_of_{name}"] = joined[unique_shape_names_column] if "all_centroids" in modes: log.info("Calculating 'all_centroids' distances'") - for idx, feat in gdf_polygons.reset_index(drop=True).iterrows(): - adata.obs[f"{output_name}distance_to_centroid_of_{feat[unique_shape_names_column]}"] = pts.distance(feat.geometry.centroid).to_numpy() * pixel_size_um + for _idx, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_centroid_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.centroid).to_numpy() * pixel_size_um + ) # Point if has_points: if "nearest_point" in modes: log.info("Calculating 'nearest_point' distances'") - joined = gpd.sjoin_nearest( - pts_gdf, gdf_points, how="left", distance_col="dist" - ) + joined = gpd.sjoin_nearest(pts_gdf, gdf_points, how="left", distance_col="dist") adata.obs[f"{output_name}distance_to_nearest_point"] = joined["dist"].to_numpy() * pixel_size_um adata.obs[f"{output_name}name_of_nearest_point"] = joined[f"{shape_names_column}-unique"] @@ -594,22 +580,25 @@ def _extract_interior_lines(geom): log.info("Calculating 'nearest_point_grouped' distances'") for name, group in gdf_points.groupby(shape_names_column): joined = gpd.sjoin_nearest(pts_gdf, group, how="left", distance_col="dist") - adata.obs[f"{output_name}distance_to_nearest_point_of_{name}"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}distance_to_nearest_point_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) adata.obs[f"{output_name}name_of_nearest_point_of_{name}"] = joined[f"{shape_names_column}-unique"] if "all_points" in modes: log.info("Calculating 'all_points' distances'") for _, row in gdf_points.iterrows(): - adata.obs[f"{output_name}distance_to_point_{row[unique_shape_names_column]}"] = pts.distance(row.geometry).to_numpy() * pixel_size_um - + adata.obs[f"{output_name}distance_to_point_{row[unique_shape_names_column]}"] = ( + pts.distance(row.geometry).to_numpy() * pixel_size_um + ) - # Add table layer + # Add table layer sdata = add_table_layer( sdata, - adata = adata, - output_layer = output_table_layer, - region = adata.obs[_REGION_KEY].cat.categories.to_list(), - overwrite = overwrite, + adata=adata, + output_layer=output_table_layer, + region=adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite=overwrite, ) - - return sdata \ No newline at end of file + + return sdata diff --git a/src/harpy/table/_table.py b/src/harpy/table/_table.py index fb00d872..69f1dbe4 100644 --- a/src/harpy/table/_table.py +++ b/src/harpy/table/_table.py @@ -1,13 +1,13 @@ from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Iterable, Sequence +from typing import Literal import numpy as np import pandas as pd from anndata import AnnData from spatialdata import SpatialData from spatialdata.models import TableModel -from typing import Optional, Union, Sequence, Literal from harpy.shape._shape import filter_shapes_layer from harpy.table._manager import TableLayerManager @@ -269,19 +269,18 @@ def filter_on_size( ------- The updated SpatialData object. """ - sdata = filter_numerical( - sdata = sdata, - labels_layer = labels_layer, - table_layer = table_layer, - output_layer = output_layer, - numerical_column = cellsize_key, - min_value = min_size, - max_value = max_size, - target = "obs", - update_shapes_layers = update_shapes_layers, - prefix_filtered_shapes_layer = prefix_filtered_shapes_layer, - overwrite = overwrite, + sdata=sdata, + labels_layer=labels_layer, + table_layer=table_layer, + output_layer=output_layer, + numerical_column=cellsize_key, + min_value=min_size, + max_value=max_size, + target="obs", + update_shapes_layers=update_shapes_layers, + prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, + overwrite=overwrite, ) return sdata @@ -304,7 +303,7 @@ def filter_numerical( All observations (obs) or features (var) with values outside of the min and max size range are removed using the `numerical_column` in `.obs` or `.var`. observations/features are kept if `min_value` ≤ `numerical_column` ≤ `max_value`. - + Parameters ---------- sdata @@ -344,27 +343,23 @@ def filter_numerical( """ if target not in {"obs", "var"}: raise ValueError("target must be either 'obs' or 'var'") - + process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) adata = process_table_instance._get_adata().copy() start = adata.shape[0] if target == "obs" else adata.shape[1] - + df = getattr(adata, target) label = "observations" if target == "obs" else "features" - + if numerical_column not in df.columns: - raise ValueError( - f"Column '{numerical_column}' not found in '{table_layer}.{target}'. " - ) - + raise ValueError(f"Column '{numerical_column}' not found in '{table_layer}.{target}'. ") + if not np.issubdtype(df[numerical_column].dtype, np.number): - raise ValueError( - f"Column '{numerical_column}' must be numeric, but dtype is {df[numerical_column].dtype}." - ) + raise ValueError(f"Column '{numerical_column}' must be numeric, but dtype is {df[numerical_column].dtype}.") # Filter observations based on min and max values mask = pd.Series(True, index=df.index) - + if min_value is not None: below = (df[numerical_column] < min_value).sum() log.info(f"Removed {below} {label} below {min_value}.") @@ -373,7 +368,7 @@ def filter_numerical( above = (df[numerical_column] > max_value).sum() log.info(f"Removed {above} {label} above {max_value}.") mask &= df[numerical_column] <= max_value - + if target == "obs": adata = adata[mask, :].copy() else: @@ -410,16 +405,16 @@ def filter_categorical( labels_layer: list[str], table_layer: str, output_layer: str, - categorical_column: Optional[str] = None, - include_values: Optional[Union[str, Sequence[str]]] = None, - exclude_values: Optional[Union[str, Sequence[str]]] = None, + categorical_column: str | None = None, + include_values: str | Sequence[str] | None = None, + exclude_values: str | Sequence[str] | None = None, target: Literal["obs", "var"] = "obs", update_shapes_layers: bool = True, prefix_filtered_shapes_layer: str = "filtered", overwrite: bool = False, ) -> SpatialData: """ - Removes or keeps observations (obs) or features (var) based on specific values in a categorical column of + Removes or keeps observations (obs) or features (var) based on specific values in a categorical column of `.obs` or `.var`. or based on their index. Parameters @@ -471,10 +466,10 @@ def filter_categorical( process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) adata = process_table_instance._get_adata().copy() start = adata.shape[0] if target == "obs" else adata.shape[1] - + df = getattr(adata, target) label = "observations" if target == "obs" else "features" - + if categorical_column is not None and categorical_column not in df.columns: raise ValueError(f"Column '{categorical_column}' not found in '{table_layer}.{target}'.") @@ -486,7 +481,7 @@ def filter_categorical( # Filter mask = pd.Series(True, index=df.index) - + if categorical_column is not None: # Filtering by column if include_values is not None: @@ -522,7 +517,7 @@ def filter_categorical( prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, ) - kept = (adata.shape[0] if target == "obs" else adata.shape[1]) + kept = adata.shape[0] if target == "obs" else adata.shape[1] removed = start - kept if categorical_column is None: categorical_column = "index"