diff --git a/src/anomalib/data/utils/path.py b/src/anomalib/data/utils/path.py index 80c73a0f68..7bc61b27fe 100644 --- a/src/anomalib/data/utils/path.py +++ b/src/anomalib/data/utils/path.py @@ -146,7 +146,7 @@ def validate_path( path: str | Path, base_dir: str | Path | None = None, should_exist: bool = True, - accepted_extensions: tuple[str, ...] | None = None, + extensions: tuple[str, ...] | None = None, ) -> Path: """Validate the path. @@ -154,7 +154,7 @@ def validate_path( path (str | Path): Path to validate. base_dir (str | Path): Base directory to restrict file access. should_exist (bool): If True, do not raise an exception if the path does not exist. - accepted_extensions (tuple[str, ...] | None): Accepted extensions for the path. An exception is raised if the + extensions (tuple[str, ...] | None): Accepted extensions for the path. An exception is raised if the path does not have one of the accepted extensions. If None, no check is performed. Defaults to None. Returns: @@ -221,8 +221,8 @@ def validate_path( raise PermissionError(msg) # Check if the path has one of the accepted extensions - if accepted_extensions is not None and path.suffix not in accepted_extensions: - msg = f"Path extension is not accepted. Accepted extensions: {accepted_extensions}. Path: {path}" + if extensions is not None and path.suffix not in extensions: + msg = f"Path extension is not accepted. Accepted extensions: {extensions}. Path: {path}" raise ValueError(msg) return path diff --git a/src/anomalib/metrics/__init__.py b/src/anomalib/metrics/__init__.py index 1ecfd1cb5e..1b85b941d1 100644 --- a/src/anomalib/metrics/__init__.py +++ b/src/anomalib/metrics/__init__.py @@ -11,7 +11,7 @@ import torchmetrics from omegaconf import DictConfig, ListConfig -from . import per_image +from . import pimo from .anomaly_score_distribution import AnomalyScoreDistribution from .aupr import AUPR from .aupro import AUPRO @@ -20,7 +20,7 @@ from .f1_max import F1Max from .f1_score import F1Score from .min_max import MinMax -from .per_image import AUPIMO, PIMO +from .pimo import AUPIMO, PIMO from .precision_recall_curve import BinaryPrecisionRecallCurve from .pro import PRO from .threshold import F1AdaptiveThreshold, ManualThreshold @@ -37,7 +37,7 @@ "ManualThreshold", "MinMax", "PRO", - "per_image", + "pimo", "PIMO", "AUPIMO", ] diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py deleted file mode 100644 index 4300bb23f2..0000000000 --- a/src/anomalib/metrics/per_image/utils.py +++ /dev/null @@ -1,483 +0,0 @@ -"""Torch-oriented interfaces for `utils.py`.""" - -# Original Code -# https://github.com/jpcbertoldo/aupimo -# -# Modified -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import itertools -import logging -from collections import OrderedDict -from copy import deepcopy -from typing import TYPE_CHECKING - -import matplotlib as mpl -import pandas as pd -import scipy -import scipy.stats -import torch -from pandas import DataFrame -from torch import Tensor - -from . 
import _validate -from .enums import StatsAlternativeHypothesis, StatsOutliersPolicy, StatsRepeatedPolicy - -if TYPE_CHECKING: - from .pimo import AUPIMOResult - - -logger = logging.getLogger(__name__) - - -def per_image_scores_stats( - per_image_scores: Tensor, - images_classes: Tensor | None = None, - only_class: int | None = None, - outliers_policy: str | StatsOutliersPolicy = StatsOutliersPolicy.NONE.value, - repeated_policy: str | StatsRepeatedPolicy = StatsRepeatedPolicy.AVOID.value, - repeated_replacement_atol: float = 1e-2, -) -> list[dict[str, str | int | float]]: - """Compute statistics of per-image scores (based on a boxplot's statistics). - - ***Torch-oriented interface for `.utils_numpy.per_image_scores_stats`*** - - For a single per-image metric collection (1 model, 1 dataset), compute statistics (based on a boxplot) - and find the closest image to each statistic. - - This function uses `matplotlib.cbook.boxplot_stats`, which is the same function used by `matplotlib.pyplot.boxplot`. - - ** OUTLIERS ** - Outliers are defined as in a boxplot, i.e. values that are more than 1.5 times the interquartile range (IQR) away - from the Q1 and Q3 quartiles (respectively low and high outliers). The IQR is the difference between Q3 and Q1. - - Outliers are handled according to `outliers_policy`: - - None | "none": do not include outliers. - - "high": only include high outliers. - - "low": only include low outliers. - - "both": include both high and low outliers. - - ** IMAGE INDEX ** - Each statistic is associated with the image whose score is the closest to the statistic's value. - - ** REPEATED VALUES ** - It is possible that two stats have the same value (e.g. the median and the 25th percentile can be the same). - Such cases are handled according to `repeated_policy`: - - None | "none": do not address the issue, so several stats can have the same value and image index. - - "avoid": avoid repeated values by iterativealy looking for other images with similar score, whose score - must be within `repeated_replacement_atol` (absolute tolerance) of the repeated value. - - Args: - per_image_scores (Tensor): 1D Tensor of per-image scores. - images_classes (Tensor | None): - Used to filter statistics to only one class. If None, all images are considered. - If given, 1D Tensor of binary image classes (0 for 'normal', 1 for 'anomalous'). Defaults to None. - only_class (int | None): - Only used if `images_classes` is not None. - If not None, only compute statistics for images of the given class. - `None` means both image classes are used. - Defaults to None. - outliers_policy (str | None): How to handle outliers stats (use them?). See `OutliersPolicy`. Defaults to None. - repeated_policy (str | None): How to handle repeated values in boxplot stats (two stats with same value). - See `RepeatedPolicy`. Defaults to None. - repeated_replacement_atol (float): Absolute tolerance used to replace repeated values. Only used if - `repeated_policy` is not None (or 'none'). Defaults to 1e-2 (1%). - - Returns: - list[dict[str, str | int | float]]: List of boxplot statistics. - - Each dictionary has the following keys: - - 'stat_name': Name of the statistic. Possible values: - - 'mean': Mean of the scores. - - 'med': Median of the scores. - - 'q1': 25th percentile of the scores. - - 'q3': 75th percentile of the scores. - - 'whishi': Upper whisker value. - - 'whislo': Lower whisker value. - - 'outlo_i': low outlier value; `i` is a unique index for each low outlier. 
- - 'outhi_j': high outlier value; `j` is a unique index for each high outlier. - - 'stat_value': Value of the statistic (same units as `values`). - - 'image_idx': Index of the image in `per_image_scores` whose score is the closest to the statistic's value. - - 'score': The score of the image at index `image_idx` (not necessarily the same as `stat_value`). - - The list is sorted by increasing `stat_value`. - """ - # other validations happen inside `utils_numpy.per_image_scores_stats` - - outliers_policy = StatsOutliersPolicy(outliers_policy) - repeated_policy = StatsRepeatedPolicy(repeated_policy) - _validate.is_per_image_scores(per_image_scores) - - # restrain the images to the class `only_class` if given, else use all images - if images_classes is None: - images_selection_mask = torch.ones_like(per_image_scores, dtype=bool) - - elif only_class is not None: - _validate.is_images_classes(images_classes) - _validate.is_same_shape(per_image_scores, images_classes) - _validate.is_image_class(only_class) - images_selection_mask = images_classes == only_class - - else: - images_selection_mask = torch.ones_like(per_image_scores, dtype=bool) - - # indexes in `per_image_scores` are referred to as `candidate_idx` - # while the indexes in the original array are referred to as `image_idx` - # - `candidate_idx` works for `per_image_scores` and `candidate2image_idx` (see below) - # - `image_idx` works for `images_classes` and `images_idxs_selected` - per_image_scores = per_image_scores[images_selection_mask] - # converts `candidate_idx` to `image_idx` - candidate2image_idx = torch.nonzero(images_selection_mask, as_tuple=True)[0] - - # function used in `matplotlib.boxplot` - boxplot_stats = mpl.cbook.boxplot_stats(per_image_scores)[0] # [0] is for the only boxplot - - # remove unnecessary keys - boxplot_stats = {name: value for name, value in boxplot_stats.items() if name not in {"iqr", "cilo", "cihi"}} - - # unroll `fliers` (outliers), remove unnecessary ones according to `outliers_policy`, - # then add them to `boxplot_stats` with unique keys - outliers = boxplot_stats.pop("fliers") - outliers_lo = outliers[outliers < boxplot_stats["med"]] - outliers_hi = outliers[outliers > boxplot_stats["med"]] - - if outliers_policy in {StatsOutliersPolicy.HIGH, StatsOutliersPolicy.BOTH}: - boxplot_stats = { - **boxplot_stats, - **{f"outhi_{idx:06}": value for idx, value in enumerate(outliers_hi)}, - } - - if outliers_policy in {StatsOutliersPolicy.LOW, StatsOutliersPolicy.BOTH}: - boxplot_stats = { - **boxplot_stats, - **{f"outlo_{idx:06}": value for idx, value in enumerate(outliers_lo)}, - } - - # state variables for the stateful function `append_record` below - images_idxs_selected: set[int] = set() - records: list[dict[str, str | int | float]] = [] - - def append_record(stat_name: str, stat_value: float) -> None: - candidates_sorted = torch.abs(per_image_scores - stat_value).argsort() - candidate_idx = candidates_sorted[0] - image_idx = candidate2image_idx[candidate_idx] - - # handle repeated values - if image_idx not in images_idxs_selected or repeated_policy == StatsRepeatedPolicy.NONE: - pass - - elif repeated_policy == StatsRepeatedPolicy.AVOID: - for other_candidate_idx in candidates_sorted: - other_candidate_image_idx = candidate2image_idx[other_candidate_idx] - if other_candidate_image_idx in images_idxs_selected: - continue - # if the code reaches here, it means that `other_candidate_image_idx` is not in `images_idxs_selected` - # i.e. 
this image has not been selected yet, so it can be used - other_candidate_score = per_image_scores[other_candidate_idx] - # if the other candidate is not too far from the value, use it - # note that the first choice has not changed, so if no other is selected in the loop - # it will be the first choice - if torch.isclose(other_candidate_score, stat_value, atol=repeated_replacement_atol): - candidate_idx = other_candidate_idx - image_idx = other_candidate_image_idx - break - - images_idxs_selected.add(image_idx) - records.append( - { - "stat_name": stat_name, - "stat_value": float(stat_value), - "image_idx": int(image_idx), - "score": float(per_image_scores[candidate_idx]), - }, - ) - - # loop over the stats from the lowest to the highest value - for stat, val in sorted(boxplot_stats.items(), key=lambda x: x[1]): - append_record(stat, val) - return sorted(records, key=lambda r: r["score"]) - - -def compare_models_pairwise_ttest_rel( - scores_per_model: dict[str, Tensor] - | OrderedDict[str, Tensor] - | dict[str, "AUPIMOResult"] - | OrderedDict[str, "AUPIMOResult"], - alternative: str, - higher_is_better: bool, -) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: - """Compare all pairs of models using the paired t-test on two related samples (parametric). - - ***Torch-oriented interface for `.numpy_utils.compare_models_pairwise_ttest_rel`*** - - This is a test for the null hypothesis that two repeated samples have identical average (expected) values. - In fact, it tests whether the average of the differences between the two samples is significantly different from 0. - - Refs: - - `scipy.stats.ttest_rel`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html - - Wikipedia page: https://en.wikipedia.org/wiki/Student's_t-test#Dependent_t-test_for_paired_samples - - === - - If an ordered dictionary is given, the models are sorted by the order of the dictionary. - Otherwise, the models are sorted by average SCORE. - - Args: - scores_per_model: Dictionary of `n` models and their per-image scores. - key: model name - value: tensor of shape (num_images,). All `nan` values must be at the same positions. - higher_is_better: Whether higher values of score are better or worse. Defaults to True. - alternative: Alternative hypothesis for the statistical tests. See `confidences` in "Returns" section. - Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`. - - Returns: - (models_ordered, test_results): - - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). - - Automatic sorting is by average score from best to worst model. - Depending on `higher_is_better`, this corresponds to: - - `higher_is_better=True` ==> descending score order - - `higher_is_better=False` ==> ascending score order - along the indices from 0 to `n-1`. - - - confidences: Dictionary of confidence values for each pair of models. - - For all pairs of indices i and j from 0 to `n-1` such that i != j: - - key: (models_ordered[i], models_ordered[j]) - - value: confidence on the alternative hypothesis. - - For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: - - if `less`: model[i] < model[j] - - if `greater`: model[i] > model[j] - - if `two-sided`: model[i] != model[j] - in termos of average score. 
- """ - _validate.is_scores_per_model(scores_per_model) - scores_per_model_items = [ - ( - model_name, - (scores if isinstance(scores, Tensor) else scores.aupimos), - ) - for model_name, scores in scores_per_model.items() - ] - cls = OrderedDict if isinstance(scores_per_model, OrderedDict) else dict - scores_per_model_with_arrays = cls(scores_per_model_items) - - _validate.is_scores_per_model(scores_per_model_with_arrays) - StatsAlternativeHypothesis(alternative) - - # remove nan values; list of items keeps the order of the OrderedDict - scores_per_model_nonan_items = [ - (model_name, scores[~torch.isnan(scores)]) for model_name, scores in scores_per_model_with_arrays.items() - ] - - # sort models by average value if not an ordered dictionary - # position 0 is assumed the best model - if isinstance(scores_per_model_with_arrays, OrderedDict): - scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items) - else: - scores_per_model_nonan = OrderedDict( - sorted(scores_per_model_nonan_items, key=lambda kv: kv[1].mean(), reverse=higher_is_better), - ) - - models_ordered = tuple(scores_per_model_nonan.keys()) - models_pairs = list(itertools.permutations(models_ordered, 2)) - confidences: dict[tuple[str, str], float] = {} - for model_i, model_j in models_pairs: - values_i = scores_per_model_nonan[model_i] - values_j = scores_per_model_nonan[model_j] - pvalue = scipy.stats.ttest_rel( - values_i, - values_j, - alternative=alternative, - ).pvalue - confidences[model_i, model_j] = 1.0 - float(pvalue) - - return models_ordered, confidences - - -def compare_models_pairwise_wilcoxon( - scores_per_model: dict[str, Tensor] - | OrderedDict[str, Tensor] - | dict[str, "AUPIMOResult"] - | OrderedDict[str, "AUPIMOResult"], - alternative: str, - higher_is_better: bool, - atol: float | None = 1e-3, -) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: - """Compare all pairs of models using the Wilcoxon signed-rank test (non-parametric). - - ***Torch-oriented interface for `.numpy_utils.compare_models_pairwise_wilcoxon`*** - - Each comparison of two models is a Wilcoxon signed-rank test (null hypothesis is that they are equal). - - It tests whether the distribution of the differences of scores is symmetric about zero in a non-parametric way. - This is like the non-parametric version of the paired t-test. - - Refs: - - `scipy.stats.wilcoxon`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html#scipy.stats.wilcoxon - - Wikipedia page: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test - - === - - If an ordered dictionary is given, the models are sorted by the order of the dictionary. - Otherwise, the models are sorted by average RANK. - - Args: - scores_per_model: Dictionary of `n` models and their per-image scores. - key: model name - value: tensor of shape (num_images,). All `nan` values must be at the same positions. - higher_is_better: Whether higher values of score are better or worse. Defaults to True. - alternative: Alternative hypothesis for the statistical tests. See `confidences` in "Returns" section. - Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`. - atol: Absolute tolerance used to consider two scores as equal. Defaults to 1e-3 (0.1%). - When doing a paired test, if the difference between two scores is below `atol`, the difference is - truncated to 0. If `atol` is None, no truncation is done. - - Returns: - (models_ordered, test_results): - - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). 
- - Automatic sorting is from "best to worst" model, which corresponds to ascending average rank - along the indices from 0 to `n-1`. - - - confidences: Dictionary of confidence values for each pair of models. - - For all pairs of indices i and j from 0 to `n-1` such that i != j: - - key: (models_ordered[i], models_ordered[j]) - - value: confidence on the alternative hypothesis. - - For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: - - if `less`: model[i] < model[j] - - if `greater`: model[i] > model[j] - - if `two-sided`: model[i] != model[j] - in terms of average ranks (not scores!). - """ - _validate.is_scores_per_model(scores_per_model) - scores_per_model_items = [ - ( - model_name, - (scores if isinstance(scores, Tensor) else scores.aupimos), - ) - for model_name, scores in scores_per_model.items() - ] - cls = OrderedDict if isinstance(scores_per_model, OrderedDict) else dict - scores_per_model_with_arrays = cls(scores_per_model_items) - - _validate.is_scores_per_model(scores_per_model_with_arrays) - StatsAlternativeHypothesis(alternative) - - # remove nan values; list of items keeps the order of the OrderedDict - scores_per_model_nonan_items = [ - (model_name, scores[~torch.isnan(scores)]) for model_name, scores in scores_per_model_with_arrays.items() - ] - - # sort models by average value if not an ordered dictionary - # position 0 is assumed the best model - if isinstance(scores_per_model_with_arrays, OrderedDict): - scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items) - else: - # these average ranks will NOT consider `atol` because we want to rank the models anyway - scores_nonan = torch.stack([v for _, v in scores_per_model_nonan_items], axis=0) - avg_ranks = scipy.stats.rankdata( - -scores_nonan if higher_is_better else scores_nonan, - method="average", - axis=0, - ).mean(axis=1) - # ascending order, lower score is better --> best to worst model - argsort_avg_ranks = avg_ranks.argsort() - scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items[idx] for idx in argsort_avg_ranks) - - models_ordered = tuple(scores_per_model_nonan.keys()) - models_pairs = list(itertools.permutations(models_ordered, 2)) - confidences: dict[tuple[str, str], float] = {} - for model_i, model_j in models_pairs: - values_i = scores_per_model_nonan[model_i] - values_j = scores_per_model_nonan[model_j] - diff = values_i - values_j - - if atol is not None: - # make the difference null if below the tolerance - diff[torch.abs(diff) <= atol] = 0.0 - - # extreme case - if (diff == 0).all(): # noqa: SIM108 - pvalue = 1.0 - else: - pvalue = scipy.stats.wilcoxon(diff, alternative=alternative).pvalue - confidences[model_i, model_j] = 1.0 - float(pvalue) - - return models_ordered, confidences - - -def format_pairwise_tests_results( - models_ordered: tuple[str, ...], - confidences: dict[tuple[str, str], float], - model1_as_column: bool = True, - left_to_right: bool = False, - top_to_bottom: bool = False, -) -> DataFrame: - """Format the results of pairwise tests into a square dataframe. - - The confidence values refer to the confidence level (in [0, 1]) on the alternative hypothesis, - which is formulated as "`model1` `model2`", where `` can be '<', '>', or '!='. - - HOW TO READ THE DATAFRAME - ========================= - There are 6 possible ways to read the dataframe, depending on the values of `model1_as_column` and `alternative` - (from the pairwise test function that generated `confidences`). 
- - *column* and *row* below refer to a generic column and row value (model names) in the dataframe. - - if ( - model1_as_column == True and alternative == 'less' - or model1_as_column == False and alternative == 'greater' - ) - read: "column < row" - equivalently: "row > column" - - elif ( - model1_as_column == True and alternative == 'greater' - or model1_as_column == False and alternative == 'less' - ) - read: "column > row" - equivalently: "row < column" - - else: # alternative == 'two-sided' - read: "column != row" - equivalently: "row != column" - - Args: - models_ordered: The models ordered in a meaningful way, generally from best to worst when automatically ordered. - confidences: The confidence on the alternative hypothesis, as returned by the pairwise test function. - model1_as_column: Whether to put `model1` as column or row in the dataframe. - left_to_right: Whether to order the columns from best to worst model as left to right. - top_to_bottom: Whether to order the rows from best to worst model as top to bottom. - Default column/row ordering is from worst to best model (left to right, top to bottom), - so the upper left corner is the worst model compared to itself, and the bottom right corner is the best - model compared to itself. - - """ - _validate.is_models_ordered(models_ordered) - _validate.is_confidences(confidences) - _validate.joint_validate_models_ordered_and_confidences(models_ordered, confidences) - confidences = deepcopy(confidences) - confidences.update({(model, model): torch.nan for model in models_ordered}) - # `df` stands for `dataframe` - confdf = pd.DataFrame(confidences, index=["confidence"]).T - confdf.index.names = ["model1", "model2"] - confdf = confdf.reset_index() - confdf["model1"] = pd.Categorical(confdf["model1"], categories=models_ordered, ordered=True) - confdf["model2"] = pd.Categorical(confdf["model2"], categories=models_ordered, ordered=True) - # df at this point: 3 columns: model1, model2, confidence - index_model, column_model = ("model2", "model1") if model1_as_column else ("model1", "model2") - confdf = confdf.pivot_table(index=index_model, columns=column_model, values="confidence", dropna=False, sort=False) - # now it is a square dataframe with models as index and columns, and confidence as values - confdf = confdf.sort_index(axis=0, ascending=top_to_bottom) - return confdf.sort_index(axis=1, ascending=left_to_right) - - -def images_classes_from_masks(masks: torch.Tensor) -> torch.Tensor: - """Deduce the image classes from the masks.""" - return (masks == 1).any(axis=(1, 2)).to(torch.int32) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/pimo/__init__.py similarity index 62% rename from src/anomalib/metrics/per_image/__init__.py rename to src/anomalib/metrics/pimo/__init__.py index b678c3dc40..c131b616c1 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/pimo/__init__.py @@ -9,12 +9,6 @@ from .enums import StatsOutliersPolicy, StatsRepeatedPolicy, ThresholdMethod from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult -from .utils import ( - compare_models_pairwise_ttest_rel, - compare_models_pairwise_wilcoxon, - format_pairwise_tests_results, - per_image_scores_stats, -) __all__ = [ # constants @@ -27,9 +21,4 @@ # torchmetrics interfaces "PIMO", "AUPIMO", - # utils - "compare_models_pairwise_ttest_rel", - "compare_models_pairwise_wilcoxon", - "format_pairwise_tests_results", - "per_image_scores_stats", ] diff --git a/src/anomalib/metrics/per_image/_validate.py 
b/src/anomalib/metrics/pimo/_validate.py similarity index 66% rename from src/anomalib/metrics/per_image/_validate.py rename to src/anomalib/metrics/pimo/_validate.py index e0f8f41d91..c758c6f6ab 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/pimo/_validate.py @@ -12,17 +12,12 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from collections import OrderedDict -from typing import TYPE_CHECKING import torch from torch import Tensor from .utils import images_classes_from_masks -if TYPE_CHECKING: - from .pimo import AUPIMOResult - logger = logging.getLogger(__name__) @@ -139,10 +134,6 @@ def is_thresh_bounds(thresh_bounds: tuple[float, float]) -> None: def is_anomaly_maps(anomaly_maps: Tensor) -> None: - if not isinstance(anomaly_maps, Tensor): - msg = f"Expected anomaly maps to be an Tensor, but got {type(anomaly_maps)}" - raise TypeError(msg) - if anomaly_maps.ndim != 3: msg = f"Expected anomaly maps have 3 dimensions (N, H, W), but got {anomaly_maps.ndim} dimensions" raise ValueError(msg) @@ -156,10 +147,6 @@ def is_anomaly_maps(anomaly_maps: Tensor) -> None: def is_masks(masks: Tensor) -> None: - if not isinstance(masks, Tensor): - msg = f"Expected masks to be an Tensor, but got {type(masks)}" - raise TypeError(msg) - if masks.ndim != 3: msg = f"Expected masks have 3 dimensions (N, H, W), but got {masks.ndim} dimensions" raise ValueError(msg) @@ -185,10 +172,6 @@ def is_masks(masks: Tensor) -> None: def is_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None) -> None: - if not isinstance(binclf_curves, Tensor): - msg = f"Expected binclf curves to be an Tensor, but got {type(binclf_curves)}" - raise TypeError(msg) - if binclf_curves.ndim != 4: msg = f"Expected binclf curves to be 4D, but got {binclf_curves.ndim}D" raise ValueError(msg) @@ -229,10 +212,6 @@ def is_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None) -> Non def is_images_classes(images_classes: Tensor) -> None: - if not isinstance(images_classes, Tensor): - msg = f"Expected image classes to be an Tensor, but got {type(images_classes)}." - raise TypeError(msg) - if images_classes.ndim != 1: msg = f"Expected image classes to be 1D, but got {images_classes.ndim}D." raise ValueError(msg) @@ -258,10 +237,6 @@ def is_images_classes(images_classes: Tensor) -> None: def is_rates(rates: Tensor, nan_allowed: bool) -> None: - if not isinstance(rates, Tensor): - msg = f"Expected rates to be an Tensor, but got {type(rates)}." - raise TypeError(msg) - if rates.ndim != 1: msg = f"Expected rates to be 1D, but got {rates.ndim}D." raise ValueError(msg) @@ -307,10 +282,6 @@ def is_rate_curve(rate_curve: Tensor, nan_allowed: bool, decreasing: bool) -> No def is_per_image_rate_curves(rate_curves: Tensor, nan_allowed: bool, decreasing: bool | None) -> None: - if not isinstance(rate_curves, Tensor): - msg = f"Expected per-image rate curves to be an Tensor, but got {type(rate_curves)}." - raise TypeError(msg) - if rate_curves.ndim != 2: msg = f"Expected per-image rate curves to be 2D, but got {rate_curves.ndim}D." raise ValueError(msg) @@ -454,177 +425,3 @@ def is_image_class(image_class: int) -> None: if image_class not in {0, 1}: msg = f"Expected image class to be either 0 for 'normal' or 1 for 'anomalous', but got {image_class}." raise ValueError(msg) - - -def is_models_ordered(models_ordered: tuple[str, ...]) -> None: - if not isinstance(models_ordered, tuple): - msg = f"Expected models ordered to be a tuple, but got {type(models_ordered)}." 
- raise TypeError(msg) - - if len(models_ordered) < 2: - msg = f"Expected models ordered to have at least 2 models, but got {len(models_ordered)}." - raise ValueError(msg) - - for model_name in models_ordered: - if not isinstance(model_name, str): - msg = f"Expected model name to be a string, but got {type(model_name)} for model {model_name}." - raise TypeError(msg) - - if model_name == "": - msg = "Expected model name to be non-empty, but got empty string." - raise ValueError(msg) - - num_redundant_models = len(models_ordered) - len(set(models_ordered)) - if num_redundant_models > 0: - msg = f"Expected models ordered to have unique models, but got {num_redundant_models} redundant models." - raise ValueError(msg) - - -def is_confidences(confidences: dict[tuple[str, str], float]) -> None: - if not isinstance(confidences, dict): - msg = f"Expected confidences to be a dict, but got {type(confidences)}." - raise TypeError(msg) - - for (model1, model2), confidence in confidences.items(): - if not isinstance(model1, str): - msg = f"Expected model name to be a string, but got {type(model1)} for model {model1}." - raise TypeError(msg) - - if not isinstance(model2, str): - msg = f"Expected model name to be a string, but got {type(model2)} for model {model2}." - raise TypeError(msg) - - if not isinstance(confidence, float): - msg = f"Expected confidence to be a float, but got {type(confidence)} for models {model1} and {model2}." - raise TypeError(msg) - - if not (0 <= confidence <= 1): - msg = f"Expected confidence to be between 0 and 1, but got {confidence} for models {model1} and {model2}." - raise ValueError(msg) - - -def joint_validate_models_ordered_and_confidences( - models_ordered: tuple[str, ...], - confidences: dict[tuple[str, str], float], -) -> None: - num_models = len(models_ordered) - expected_num_pairs = num_models * (num_models - 1) - - if len(confidences) != expected_num_pairs: - msg = f"Expected {expected_num_pairs} pairs of models, but got {len(confidences)} pairs of models." - raise ValueError(msg) - - models_in_confidences = {model for pair_models in confidences for model in pair_models} - - diff = set(models_ordered).symmetric_difference(models_in_confidences) - if len(diff) > 0: - msg = ( - "Expected models in confidences to be the same as models ordered, but got models missing in one" - f"of them: {diff}." - ) - raise ValueError(msg) - - -def is_scores_per_model_tensor(scores_per_model: dict[str, Tensor] | OrderedDict[str, Tensor]) -> None: - first_key_value = None - - for model_name, scores in scores_per_model.items(): - if scores.ndim != 1: - msg = f"Expected scores to be 1D, but got {scores.ndim}D for model {model_name}." - raise ValueError(msg) - - num_valid_scores = scores[~torch.isnan(scores)].numel() - - if num_valid_scores < 1: - msg = f"Expected at least 1 non-nan score, but got {num_valid_scores} for model {model_name}." - raise ValueError(msg) - - if first_key_value is None: - first_key_value = (model_name, scores) - continue - - first_model_name, first_scores = first_key_value - - # same shape - if scores.shape[0] != first_scores.shape[0]: - msg = ( - "Expected scores to have the same number of scores, " - f"but got ({model_name}) {scores.shape[0]} != {first_scores.shape[0]} ({first_model_name})." 
- ) - raise ValueError(msg) - - # `nan` at the same indices - if (torch.isnan(scores) != torch.isnan(first_scores)).any(): - msg = ( - "Expected `nan` values, if any, to be at the same indices, " - f"but there are differences between models {model_name} and {first_model_name}." - ) - raise ValueError(msg) - - -def is_scores_per_model_aupimoresult( - scores_per_model: dict[str, "AUPIMOResult"] | OrderedDict[str, "AUPIMOResult"], -) -> None: - first_key_value = None - - for model_name, aupimoresult in scores_per_model.items(): - if first_key_value is None: - first_key_value = (model_name, aupimoresult) - continue - - first_model_name, first_aupimoresult = first_key_value - - if aupimoresult.fpr_bounds != first_aupimoresult.fpr_bounds: - msg = ( - "Expected AUPIMOResult objects in scores per model to have the same FPR bounds, " - f"but got ({model_name}) {aupimoresult.fpr_bounds} != " - f"{first_aupimoresult.fpr_bounds} ({first_model_name})." - ) - raise ValueError(msg) - - -def is_scores_per_model( - scores_per_model: dict[str, Tensor] - | OrderedDict[str, Tensor] - | dict[str, "AUPIMOResult"] - | OrderedDict[str, "AUPIMOResult"], -) -> None: - # it has to be imported here to avoid circular imports - from .pimo import AUPIMOResult - - if not isinstance(scores_per_model, dict | OrderedDict): - msg = f"Expected scores per model to be a dictionary or ordered dictionary, but got {type(scores_per_model)}." - raise TypeError(msg) - - if len(scores_per_model) < 2: - msg = f"Expected scores per model to have at least 2 models, but got {len(scores_per_model)}." - raise ValueError(msg) - - if not all(isinstance(model_name, str) for model_name in scores_per_model): - msg = "Expected scores per model to have model names (strings) as keys." - raise TypeError(msg) - - first_instance = next(iter(scores_per_model.values())) - - if ( - isinstance(first_instance, Tensor) - and any(not isinstance(scores, Tensor) for scores in scores_per_model.values()) - ) or ( - isinstance(first_instance, AUPIMOResult) - and any(not isinstance(scores, AUPIMOResult) for scores in scores_per_model.values()) - ): - msg = ( - "Values in the scores per model dict must have the same type for values (Tensor or AUPIMOResult), " - "but more than one type was found." - ) - raise TypeError(msg) - - if isinstance(first_instance, Tensor): - is_scores_per_model_tensor(scores_per_model) - return - - is_scores_per_model_tensor( - {model_name: scores.aupimos for model_name, scores in scores_per_model.items()}, - ) - - is_scores_per_model_aupimoresult(scores_per_model) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/pimo/binary_classification_curve.py similarity index 90% rename from src/anomalib/metrics/per_image/binclf_curve.py rename to src/anomalib/metrics/pimo/binary_classification_curve.py index 7013eb08a7..325897c701 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/pimo/binary_classification_curve.py @@ -18,7 +18,6 @@ import numpy as np import torch -from numpy import ndarray from . import _validate from .enums import ThresholdMethod @@ -26,8 +25,8 @@ logger = logging.getLogger(__name__) -def _binclf_one_curve(scores: ndarray, gts: ndarray, threshs: ndarray) -> ndarray: - """One binary classification matrix at each threshold (PYTHON implementation). +def _binary_classification_curve(scores: np.ndarray, gts: np.ndarray, threshs: np.ndarray) -> np.ndarray: + """One binary classification matrix at each threshold. In the case where the thresholds are given (i.e. 
not considering all possible thresholds based on the scores), this weird-looking function is faster than the two options in `torchmetrics` on the CPU: @@ -37,12 +36,12 @@ def _binclf_one_curve(scores: ndarray, gts: ndarray, threshs: ndarray) -> ndarra Note: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function. Args: - scores (ndarray): Anomaly scores (D,). - gts (ndarray): Binary (bool) ground truth of shape (D,). - threshs (ndarray): Sequence of thresholds in ascending order (K,). + scores (np.ndarray): Anomaly scores (D,). + gts (np.ndarray): Binary (bool) ground truth of shape (D,). + threshs (np.ndarray): Sequence of thresholds in ascending order (K,). Returns: - ndarray: Binary classification matrix curve (K, 2, 2) + np.ndarray: Binary classification matrix curve (K, 2, 2) Details: `anomalib.metrics.per_image.binclf_curve_numpy.binclf_multiple_curves`. """ num_th = len(threshs) @@ -95,14 +94,14 @@ def score_less_than_thresh(score: float, thresh: float) -> bool: ).transpose(0, 2, 1) -def binclf_multiple_curves( +def binary_classification_curve( scores_batch: torch.Tensor, gts_batch: torch.Tensor, threshs: torch.Tensor, ) -> torch.Tensor: - """Multiple binary classification matrix (per-instance scope) at each threshold (shared). + """Returns a binary classification matrix at each threshold for each image in the batch. - This is a wrapper around `_binclf_multiple_curves_python` and `_binclf_multiple_curves_numba`. + This is a wrapper around `_binary_classification_curve`. Validation of the arguments is done here (not in the actual implementation functions). Note: predicted as positive condition is `score >= thresh`. @@ -143,7 +142,7 @@ def binclf_multiple_curves( _validate.is_threshs(threshs) # TODO(ashwinvaidya17): this is kept as numpy for now because it is much faster. # TEMP-0 - result = np.vectorize(_binclf_one_curve, signature="(n),(n),(k)->(k,2,2)")( + result = np.vectorize(_binary_classification_curve, signature="(n),(n),(k)->(k,2,2)")( scores_batch.detach().cpu().numpy(), gts_batch.detach().cpu().numpy(), threshs.detach().cpu().numpy(), @@ -151,12 +150,9 @@ def binclf_multiple_curves( return torch.from_numpy(result).to(scores_batch.device) -# ========================================= PER-IMAGE BINCLF CURVE ========================================= - - -def _get_threshs_minmax_linspace(anomaly_maps: torch.Tensor, num_threshs: int) -> torch.Tensor: +def _get_threshs_minmax_linspace(anomaly_maps: torch.Tensor, num_thresholds: int) -> torch.Tensor: """Get thresholds linearly spaced between the min and max of the anomaly maps.""" - _validate.is_num_threshs_gte2(num_threshs) + _validate.is_num_threshs_gte2(num_thresholds) # this operation can be a bit expensive thresh_low, thresh_high = thresh_bounds = (anomaly_maps.min().item(), anomaly_maps.max().item()) try: @@ -164,17 +160,17 @@ def _get_threshs_minmax_linspace(anomaly_maps: torch.Tensor, num_threshs: int) - except ValueError as ex: msg = f"Invalid threshold bounds computed from the given anomaly maps. 
Cause: {ex}" raise ValueError(msg) from ex - return torch.linspace(thresh_low, thresh_high, num_threshs, dtype=anomaly_maps.dtype) + return torch.linspace(thresh_low, thresh_high, num_thresholds, dtype=anomaly_maps.dtype) -def per_image_binclf_curve( +def threshold_and_binary_classification_curve( anomaly_maps: torch.Tensor, masks: torch.Tensor, threshs_choice: ThresholdMethod | str = ThresholdMethod.MINMAX_LINSPACE.value, threshs_given: torch.Tensor | None = None, num_threshs: int | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: - """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared). + """Return thresholds and binary classification matrix at each threshold for each image in the batch. Args: anomaly_maps (torch.Tensor): Anomaly score maps of shape (N, H, W) @@ -259,7 +255,7 @@ def per_image_binclf_curve( scores_batch = anomaly_maps.reshape(anomaly_maps.shape[0], -1) gts_batch = masks.reshape(masks.shape[0], -1).to(bool) # make sure it is boolean - binclf_curves = binclf_multiple_curves(scores_batch, gts_batch, threshs) + binclf_curves = binary_classification_curve(scores_batch, gts_batch, threshs) num_images = anomaly_maps.shape[0] diff --git a/src/anomalib/metrics/per_image/dataclasses.py b/src/anomalib/metrics/pimo/dataclasses.py similarity index 100% rename from src/anomalib/metrics/per_image/dataclasses.py rename to src/anomalib/metrics/pimo/dataclasses.py diff --git a/src/anomalib/metrics/per_image/enums.py b/src/anomalib/metrics/pimo/enums.py similarity index 100% rename from src/anomalib/metrics/per_image/enums.py rename to src/anomalib/metrics/pimo/enums.py diff --git a/src/anomalib/metrics/per_image/functional.py b/src/anomalib/metrics/pimo/functional.py similarity index 96% rename from src/anomalib/metrics/per_image/functional.py rename to src/anomalib/metrics/pimo/functional.py index a217333110..ea5d9690f3 100644 --- a/src/anomalib/metrics/per_image/functional.py +++ b/src/anomalib/metrics/pimo/functional.py @@ -15,7 +15,13 @@ import numpy as np import torch -from . import _validate, binclf_curve +from . 
import _validate +from .binary_classification_curve import ( + _get_threshs_minmax_linspace, + per_image_fpr, + per_image_tpr, + threshold_and_binary_classification_curve, +) from .enums import ThresholdMethod from .utils import images_classes_from_masks @@ -67,14 +73,14 @@ def pimo_curves( # therefore getting a better resolution in terms of FPR quantization # otherwise the function `binclf_curve_numpy.per_image_binclf_curve` would have the range of thresholds # computed from all the images (normal + anomalous) - threshs = binclf_curve._get_threshs_minmax_linspace( # noqa: SLF001 + threshs = _get_threshs_minmax_linspace( anomaly_maps[image_classes == 0], num_threshs, ) # N: number of images, K: number of thresholds # shapes are (K,) and (N, K, 2, 2) - threshs, binclf_curves = binclf_curve.per_image_binclf_curve( + threshs, binclf_curves = threshold_and_binary_classification_curve( anomaly_maps=anomaly_maps, masks=masks, threshs_choice=ThresholdMethod.GIVEN.value, @@ -85,7 +91,7 @@ def pimo_curves( shared_fpr: torch.Tensor # mean-per-image-fpr on normal images # shape -> (N, K) - per_image_fprs_normals = binclf_curve.per_image_fpr(binclf_curves[image_classes == 0]) + per_image_fprs_normals = per_image_fpr(binclf_curves[image_classes == 0]) try: _validate.is_per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True) except ValueError as ex: @@ -98,7 +104,7 @@ def pimo_curves( shared_fpr = per_image_fprs_normals.mean(axis=0) # shape -> (N, K) - per_image_tprs = binclf_curve.per_image_tpr(binclf_curves) + per_image_tprs = per_image_tpr(binclf_curves) return threshs, shared_fpr, per_image_tprs, image_classes diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/pimo/pimo.py similarity index 100% rename from src/anomalib/metrics/per_image/pimo.py rename to src/anomalib/metrics/pimo/pimo.py diff --git a/src/anomalib/metrics/pimo/utils.py b/src/anomalib/metrics/pimo/utils.py new file mode 100644 index 0000000000..f0cac45657 --- /dev/null +++ b/src/anomalib/metrics/pimo/utils.py @@ -0,0 +1,19 @@ +"""Torch-oriented interfaces for `utils.py`.""" + +# Original Code +# https://github.com/jpcbertoldo/aupimo +# +# Modified +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging + +import torch + +logger = logging.getLogger(__name__) + + +def images_classes_from_masks(masks: torch.Tensor) -> torch.Tensor: + """Deduce the image classes from the masks.""" + return (masks == 1).any(axis=(1, 2)).to(torch.int32) diff --git a/tests/unit/data/utils/test_path.py b/tests/unit/data/utils/test_path.py index f1764b7373..09f88496ad 100644 --- a/tests/unit/data/utils/test_path.py +++ b/tests/unit/data/utils/test_path.py @@ -81,4 +81,4 @@ def test_no_read_execute_permission() -> None: def test_file_wrongsuffix() -> None: """Test ``validate_path`` raises ValueError for a file with wrong suffix.""" with pytest.raises(ValueError, match="Path extension is not accepted."): - validate_path("file.png", should_exist=False, accepted_extensions=(".json", ".txt")) + validate_path("file.png", should_exist=False, extensions=(".json", ".txt")) diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py deleted file mode 100644 index f08bdd56b9..0000000000 --- a/tests/unit/metrics/per_image/test_utils.py +++ /dev/null @@ -1,307 +0,0 @@ -"""Test `utils.py`.""" - -# Original Code -# https://github.com/jpcbertoldo/aupimo -# -# Modified -# Copyright (C) 2024 Intel Corporation 
-# SPDX-License-Identifier: Apache-2.0 - -from collections import OrderedDict - -import numpy as np -import pytest -import torch -from torch import Tensor - -from anomalib.metrics.per_image import ( - AUPIMOResult, - StatsOutliersPolicy, - StatsRepeatedPolicy, - compare_models_pairwise_ttest_rel, - compare_models_pairwise_wilcoxon, - format_pairwise_tests_results, - per_image_scores_stats, -) - - -def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: - """Generate test cases.""" - num_images = 100 - # avg is 0.8 - aucs1 = 0.8 * torch.ones(num_images) - # avg ~ 0.7 - aucs2 = torch.linspace(0.6, 0.8, num_images) - # avg ~ 0.6 - aucs3 = torch.sin(torch.linspace(0, torch.pi, num_images)).clip(0, 1) - - mock_aupimoresult_stuff = { - "fpr_lower_bound": 1e-5, - "fpr_upper_bound": 1e-4, - "num_threshs": 1_000, - "thresh_lower_bound": 1.0, - "thresh_upper_bound": 2.0, - } - scores_per_model_dicts = [ - ({"a": aucs1, "b": aucs2},), - ({"a": aucs1, "b": aucs2, "c": aucs3},), - (OrderedDict([("c", aucs1), ("b", aucs2), ("a", aucs3)]),), - ( - { - "a": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs1}), - "b": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs2}), - "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3}), - }, - ), - ( - { - "a": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs1}), - "b": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs2}), - "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3}), - }, - ), - ] - - if ( - metafunc.function is test_compare_models_pairwise_ttest - or metafunc.function is test_compare_models_pairwise_wilcoxon - ): - metafunc.parametrize(("scores_per_model",), scores_per_model_dicts) - metafunc.parametrize( - ("alternative", "higher_is_better"), - [ - ("two-sided", True), - ("two-sided", False), - ("less", False), - ("greater", True), - # not considering the case (less, true) and (greater, false) because it will break - # some assumptions in the assertions but they are possible - ], - ) - - if metafunc.function is test_format_pairwise_tests_results: - metafunc.parametrize(("scores_per_model",), scores_per_model_dicts[:3]) - - -def assert_statsdict_stuff(statdic: dict, max_image_idx: int) -> None: - """Assert stuff about a `statdic`.""" - assert "stat_name" in statdic - stat_name = statdic["stat_name"] - assert stat_name in {"mean", "med", "q1", "q3", "whishi", "whislo"} or stat_name.startswith( - ("outlo_", "outhi_"), - ) - assert "stat_value" in statdic - assert "image_idx" in statdic - image_idx = statdic["image_idx"] - assert 0 <= image_idx <= max_image_idx - - -def test_per_image_scores_stats() -> None: - """Test `per_image_scores_boxplot_stats`.""" - gen = torch.Generator().manual_seed(42) - num_scores = 201 - scores = torch.randn(num_scores, generator=gen) - - stats = per_image_scores_stats(scores) - assert len(stats) == 6 - for statdic in stats: - assert_statsdict_stuff(statdic, num_scores - 1) - - classes = (torch.arange(num_scores) % 3 == 0).to(torch.long) - stats = per_image_scores_stats(scores, classes, only_class=None) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, classes, only_class=0) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, classes, only_class=1) - assert len(stats) == 6 - - stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.BOTH) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.LOW) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, 
outliers_policy=StatsOutliersPolicy.HIGH) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.NONE) - assert len(stats) == 6 - - # force repeated values - scores = torch.round(scores * 10) / 10 - stats = per_image_scores_stats(scores, repeated_policy=StatsRepeatedPolicy.AVOID) - assert len(stats) == 6 - stats = per_image_scores_stats( - scores, - classes, - repeated_policy=StatsRepeatedPolicy.AVOID, - repeated_replacement_atol=1e-1, - ) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, repeated_policy=StatsRepeatedPolicy.NONE) - assert len(stats) == 6 - - -def test_per_image_scores_stats_specific_values() -> None: - """Test `per_image_scores_boxplot_stats` with specific values.""" - scores = torch.concatenate( - [ - # whislo = min value is 0.0 - torch.tensor([0.0]), - torch.zeros(98), - # q1 value is 0.0 - torch.tensor([0.0]), - torch.linspace(0.01, 0.29, 98), - # med value is 0.3 - torch.tensor([0.3]), - torch.linspace(0.31, 0.69, 99), - # q3 value is 0.7 - torch.tensor([0.7]), - torch.linspace(0.71, 0.99, 99), - # whishi = max value is 1.0 - torch.tensor([1.0]), - ], - ) - - stats = per_image_scores_stats(scores) - assert len(stats) == 6 - - statdict_whislo = stats[0] - statdict_q1 = stats[1] - statdict_med = stats[2] - statdict_mean = stats[3] - statdict_q3 = stats[4] - statdict_whishi = stats[5] - - assert statdict_whislo["stat_name"] == "whislo" - assert np.isclose(statdict_whislo["stat_value"], 0.0) - - assert statdict_q1["stat_name"] == "q1" - assert np.isclose(statdict_q1["stat_value"], 0.0, atol=1e-2) - - assert statdict_med["stat_name"] == "med" - assert np.isclose(statdict_med["stat_value"], 0.3, atol=1e-2) - - assert statdict_mean["stat_name"] == "mean" - assert np.isclose(statdict_mean["stat_value"], 0.3762, atol=1e-2) - - assert statdict_q3["stat_name"] == "q3" - assert np.isclose(statdict_q3["stat_value"], 0.7, atol=1e-2) - - assert statdict_whishi["stat_name"] == "whishi" - assert statdict_whishi["stat_value"] == 1.0 - - -def test_compare_models_pairwise_ttest(scores_per_model: dict, alternative: str, higher_is_better: bool) -> None: - """Test `compare_models_pairwise_ttest`.""" - models_ordered, confidences = compare_models_pairwise_ttest_rel( - scores_per_model, - alternative=alternative, - higher_is_better=higher_is_better, - ) - assert len(confidences) == (len(models_ordered) * (len(models_ordered) - 1)) - - diff = set(scores_per_model.keys()).symmetric_difference(set(models_ordered)) - assert len(diff) == 0 - - if isinstance(scores_per_model, OrderedDict): - assert models_ordered == tuple(scores_per_model.keys()) - - elif len(scores_per_model) == 2: - assert models_ordered == (("a", "b") if higher_is_better else ("b", "a")) - - elif len(scores_per_model) == 3: - assert models_ordered == (("a", "b", "c") if higher_is_better else ("c", "b", "a")) - - if isinstance(next(iter(scores_per_model.values())), AUPIMOResult): - return - - def copy_and_add_nan(scores: Tensor) -> Tensor: - scores = scores.clone() - scores[5:] = torch.nan - return scores - - # removing samples should reduce the confidences - scores_per_model["a"] = copy_and_add_nan(scores_per_model["a"]) - scores_per_model["b"] = copy_and_add_nan(scores_per_model["b"]) - if "c" in scores_per_model: - scores_per_model["c"] = copy_and_add_nan(scores_per_model["c"]) - - compare_models_pairwise_ttest_rel( - scores_per_model, - alternative=alternative, - higher_is_better=higher_is_better, - ) - - -def test_compare_models_pairwise_wilcoxon(scores_per_model: 
dict, alternative: str, higher_is_better: bool) -> None: - """Test `compare_models_pairwise_wilcoxon`.""" - models_ordered, confidences = compare_models_pairwise_wilcoxon( - scores_per_model, - alternative=alternative, - higher_is_better=higher_is_better, - ) - assert len(confidences) == (len(models_ordered) * (len(models_ordered) - 1)) - - diff = set(scores_per_model.keys()).symmetric_difference(set(models_ordered)) - assert len(diff) == 0 - - if isinstance(scores_per_model, OrderedDict): - assert models_ordered == tuple(scores_per_model.keys()) - - elif len(scores_per_model) == 2: - assert models_ordered == (("a", "b") if higher_is_better else ("b", "a")) - - elif len(scores_per_model) == 3: - # this one is not trivial without looking at the data, so no assertions - pass - - if isinstance(next(iter(scores_per_model.values())), AUPIMOResult): - return - - def copy_and_add_nan(scores: Tensor) -> Tensor: - scores = scores.clone() - scores[5:] = torch.nan - return scores - - # removing samples should reduce the confidences - scores_per_model["a"] = copy_and_add_nan(scores_per_model["a"]) - scores_per_model["b"] = copy_and_add_nan(scores_per_model["b"]) - if "c" in scores_per_model: - scores_per_model["c"] = copy_and_add_nan(scores_per_model["c"]) - - compare_models_pairwise_wilcoxon( - scores_per_model, - alternative=alternative, - higher_is_better=higher_is_better, - ) - - -def test_format_pairwise_tests_results(scores_per_model: dict) -> None: - """Test `format_pairwise_tests_results`.""" - models_ordered, confidences = compare_models_pairwise_wilcoxon( - scores_per_model, - alternative="greater", - higher_is_better=True, - ) - confidence_df = format_pairwise_tests_results( - models_ordered, - confidences, - model1_as_column=True, - left_to_right=True, - top_to_bottom=True, - ) - assert tuple(confidence_df.columns.tolist()) == models_ordered - assert tuple(confidence_df.index.tolist()) == models_ordered - - models_ordered, confidences = compare_models_pairwise_ttest_rel( - scores_per_model, - alternative="greater", - higher_is_better=True, - ) - confidence_df = format_pairwise_tests_results( - models_ordered, - confidences, - model1_as_column=True, - left_to_right=True, - top_to_bottom=True, - ) - assert tuple(confidence_df.columns.tolist()) == models_ordered - assert tuple(confidence_df.index.tolist()) == models_ordered diff --git a/tests/unit/metrics/per_image/__init__.py b/tests/unit/metrics/pimo/__init__.py similarity index 100% rename from tests/unit/metrics/per_image/__init__.py rename to tests/unit/metrics/pimo/__init__.py diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/pimo/test_binclf_curve.py similarity index 93% rename from tests/unit/metrics/per_image/test_binclf_curve.py rename to tests/unit/metrics/pimo/test_binclf_curve.py index eed53f3248..660421bdba 100644 --- a/tests/unit/metrics/per_image/test_binclf_curve.py +++ b/tests/unit/metrics/pimo/test_binclf_curve.py @@ -12,7 +12,13 @@ import pytest import torch -from anomalib.metrics.per_image import binclf_curve +from anomalib.metrics.pimo.binary_classification_curve import ( + _binary_classification_curve, + binary_classification_curve, + per_image_fpr, + per_image_tpr, + threshold_and_binary_classification_curve, +) def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: @@ -294,7 +300,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: def test__binclf_one_curve(pred: torch.Tensor, gt: torch.Tensor, threshs: torch.Tensor, expected: torch.Tensor) -> None: """Test if 
`_binclf_one_curve()` returns the expected values.""" - computed = binclf_curve._binclf_one_curve(pred, gt, threshs) + computed = _binary_classification_curve(pred, gt, threshs) assert computed.shape == (threshs.numel(), 2, 2) assert (computed == expected.numpy()).all() @@ -306,7 +312,7 @@ def test__binclf_multiple_curves( expecteds: torch.Tensor, ) -> None: """Test if `_binclf_multiple_curves()` returns the expected values.""" - computed = binclf_curve.binclf_multiple_curves(preds, gts, threshs) + computed = binary_classification_curve(preds, gts, threshs) assert computed.shape == (preds.shape[0], threshs.numel(), 2, 2) assert (computed == expecteds).all() @@ -322,7 +328,7 @@ def test_binclf_multiple_curves( expected_binclf_curves: torch.Tensor, ) -> None: """Test if `binclf_multiple_curves()` returns the expected values.""" - computed = binclf_curve.binclf_multiple_curves( + computed = binary_classification_curve( preds, gts, threshs, @@ -331,26 +337,26 @@ def test_binclf_multiple_curves( assert (computed == expected_binclf_curves).all() # it's ok to have the threhsholds beyond the range of the preds - binclf_curve.binclf_multiple_curves(preds, gts, 2 * threshs) + binary_classification_curve(preds, gts, 2 * threshs) # or inside the bounds without reaching them - binclf_curve.binclf_multiple_curves(preds, gts, 0.5 * threshs) + binary_classification_curve(preds, gts, 0.5 * threshs) # it's also ok to have more threshs than unique values in the preds # add the values in between the threshs threshs_unncessary = 0.5 * (threshs[:-1] + threshs[1:]) threshs_unncessary = torch.concatenate([threshs_unncessary, threshs]) threshs_unncessary = torch.sort(threshs_unncessary)[0] - binclf_curve.binclf_multiple_curves(preds, gts, threshs_unncessary) + binary_classification_curve(preds, gts, threshs_unncessary) # or less - binclf_curve.binclf_multiple_curves(preds, gts, threshs[1:3]) + binary_classification_curve(preds, gts, threshs[1:3]) def test_binclf_multiple_curves_validations(args: list, kwargs: dict, exception: Exception) -> None: """Test if `_binclf_multiple_curves_python()` raises the expected errors.""" with pytest.raises(exception): - binclf_curve.binclf_multiple_curves(*args, **kwargs) + binary_classification_curve(*args, **kwargs) def test_per_image_binclf_curve( @@ -363,7 +369,7 @@ def test_per_image_binclf_curve( expected_binclf_curves: torch.Tensor, ) -> None: """Test if `per_image_binclf_curve()` returns the expected values.""" - computed_threshs, computed_binclf_curves = binclf_curve.per_image_binclf_curve( + computed_threshs, computed_binclf_curves = threshold_and_binary_classification_curve( anomaly_maps, masks, threshs_choice=threshs_choice, @@ -385,7 +391,7 @@ def test_per_image_binclf_curve( def test_per_image_binclf_curve_validations(args: list, kwargs: dict, exception: Exception) -> None: """Test if `per_image_binclf_curve()` raises the expected errors.""" with pytest.raises(exception): - binclf_curve.per_image_binclf_curve(*args, **kwargs) + threshold_and_binary_classification_curve(*args, **kwargs) def test_per_image_binclf_curve_validations_alt(args: list, kwargs: dict, exception: Exception) -> None: @@ -399,8 +405,8 @@ def test_rate_metrics( expected_tprs: torch.Tensor, ) -> None: """Test if rate metrics are computed correctly.""" - tprs = binclf_curve.per_image_tpr(binclf_curves) - fprs = binclf_curve.per_image_fpr(binclf_curves) + tprs = per_image_tpr(binclf_curves) + fprs = per_image_fpr(binclf_curves) assert tprs.shape == expected_tprs.shape assert fprs.shape == 
expected_fprs.shape diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/pimo/test_pimo.py similarity index 98% rename from tests/unit/metrics/per_image/test_pimo.py rename to tests/unit/metrics/pimo/test_pimo.py index a678bda430..dc40abbb5f 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/pimo/test_pimo.py @@ -13,8 +13,7 @@ import torch from torch import Tensor -from anomalib.metrics.per_image import functional, pimo -from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult +from anomalib.metrics.pimo import AUPIMOResult, PIMOResult, functional, pimo def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
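
Below is a minimal sketch of how calling code might exercise the renamed APIs from this diff: the `extensions` keyword of `validate_path` (formerly `accepted_extensions`) and the `anomalib.metrics.pimo` package (formerly `anomalib.metrics.per_image`), whose `threshold_and_binary_classification_curve`, `per_image_fpr`, `per_image_tpr`, and `images_classes_from_masks` replace the old `binclf_curve` entry points; `AUPIMO` and `PIMO` remain re-exported from `anomalib.metrics` via the renamed subpackage. The synthetic tensors, the `num_threshs=100` value, and the file name are illustrative assumptions, not part of the change; shapes follow the (N, H, W) convention documented above.

    import torch

    from anomalib.data.utils.path import validate_path
    from anomalib.metrics.pimo.binary_classification_curve import (
        per_image_fpr,
        per_image_tpr,
        threshold_and_binary_classification_curve,
    )
    from anomalib.metrics.pimo.utils import images_classes_from_masks

    # Path validation: the keyword is now `extensions` instead of `accepted_extensions`;
    # a mismatched suffix raises ValueError, as exercised in the updated test above.
    validated = validate_path("file.json", should_exist=False, extensions=(".json", ".txt"))

    # Synthetic (N, H, W) anomaly score maps and binary ground-truth masks (illustrative only).
    anomaly_maps = torch.rand(4, 32, 32)
    masks = torch.rand(4, 32, 32) > 0.95

    # Formerly `binclf_curve.per_image_binclf_curve`: returns the shared thresholds (K,)
    # and one binary classification matrix per image per threshold (N, K, 2, 2).
    threshs, binclf_curves = threshold_and_binary_classification_curve(
        anomaly_maps,
        masks,
        num_threshs=100,
    )
    assert threshs.shape == (100,)
    assert binclf_curves.shape == (4, 100, 2, 2)

    # Per-image rate curves of shape (N, K), and image-level classes deduced from the masks.
    tprs = per_image_tpr(binclf_curves)
    fprs = per_image_fpr(binclf_curves)
    image_classes = images_classes_from_masks(masks)
    assert tprs.shape == fprs.shape == (4, 100)
    assert image_classes.shape == (4,)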