diff --git a/src/anomalib/data/utils/path.py b/src/anomalib/data/utils/path.py index 80c73a0f68..7bc61b27fe 100644 --- a/src/anomalib/data/utils/path.py +++ b/src/anomalib/data/utils/path.py @@ -146,7 +146,7 @@ def validate_path( path: str | Path, base_dir: str | Path | None = None, should_exist: bool = True, - accepted_extensions: tuple[str, ...] | None = None, + extensions: tuple[str, ...] | None = None, ) -> Path: """Validate the path. @@ -154,7 +154,7 @@ def validate_path( path (str | Path): Path to validate. base_dir (str | Path): Base directory to restrict file access. should_exist (bool): If True, do not raise an exception if the path does not exist. - accepted_extensions (tuple[str, ...] | None): Accepted extensions for the path. An exception is raised if the + extensions (tuple[str, ...] | None): Accepted extensions for the path. An exception is raised if the path does not have one of the accepted extensions. If None, no check is performed. Defaults to None. Returns: @@ -221,8 +221,8 @@ def validate_path( raise PermissionError(msg) # Check if the path has one of the accepted extensions - if accepted_extensions is not None and path.suffix not in accepted_extensions: - msg = f"Path extension is not accepted. Accepted extensions: {accepted_extensions}. Path: {path}" + if extensions is not None and path.suffix not in extensions: + msg = f"Path extension is not accepted. Accepted extensions: {extensions}. Path: {path}" raise ValueError(msg) return path diff --git a/src/anomalib/metrics/__init__.py b/src/anomalib/metrics/__init__.py index 1ecfd1cb5e..1b85b941d1 100644 --- a/src/anomalib/metrics/__init__.py +++ b/src/anomalib/metrics/__init__.py @@ -11,7 +11,7 @@ import torchmetrics from omegaconf import DictConfig, ListConfig -from . import per_image +from . import pimo from .anomaly_score_distribution import AnomalyScoreDistribution from .aupr import AUPR from .aupro import AUPRO @@ -20,7 +20,7 @@ from .f1_max import F1Max from .f1_score import F1Score from .min_max import MinMax -from .per_image import AUPIMO, PIMO +from .pimo import AUPIMO, PIMO from .precision_recall_curve import BinaryPrecisionRecallCurve from .pro import PRO from .threshold import F1AdaptiveThreshold, ManualThreshold @@ -37,7 +37,7 @@ "ManualThreshold", "MinMax", "PRO", - "per_image", + "pimo", "PIMO", "AUPIMO", ] diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py deleted file mode 100644 index 4300bb23f2..0000000000 --- a/src/anomalib/metrics/per_image/utils.py +++ /dev/null @@ -1,483 +0,0 @@ -"""Torch-oriented interfaces for `utils.py`.""" - -# Original Code -# https://github.com/jpcbertoldo/aupimo -# -# Modified -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import itertools -import logging -from collections import OrderedDict -from copy import deepcopy -from typing import TYPE_CHECKING - -import matplotlib as mpl -import pandas as pd -import scipy -import scipy.stats -import torch -from pandas import DataFrame -from torch import Tensor - -from . 
import _validate -from .enums import StatsAlternativeHypothesis, StatsOutliersPolicy, StatsRepeatedPolicy - -if TYPE_CHECKING: - from .pimo import AUPIMOResult - - -logger = logging.getLogger(__name__) - - -def per_image_scores_stats( - per_image_scores: Tensor, - images_classes: Tensor | None = None, - only_class: int | None = None, - outliers_policy: str | StatsOutliersPolicy = StatsOutliersPolicy.NONE.value, - repeated_policy: str | StatsRepeatedPolicy = StatsRepeatedPolicy.AVOID.value, - repeated_replacement_atol: float = 1e-2, -) -> list[dict[str, str | int | float]]: - """Compute statistics of per-image scores (based on a boxplot's statistics). - - ***Torch-oriented interface for `.utils_numpy.per_image_scores_stats`*** - - For a single per-image metric collection (1 model, 1 dataset), compute statistics (based on a boxplot) - and find the closest image to each statistic. - - This function uses `matplotlib.cbook.boxplot_stats`, which is the same function used by `matplotlib.pyplot.boxplot`. - - ** OUTLIERS ** - Outliers are defined as in a boxplot, i.e. values that are more than 1.5 times the interquartile range (IQR) away - from the Q1 and Q3 quartiles (respectively low and high outliers). The IQR is the difference between Q3 and Q1. - - Outliers are handled according to `outliers_policy`: - - None | "none": do not include outliers. - - "high": only include high outliers. - - "low": only include low outliers. - - "both": include both high and low outliers. - - ** IMAGE INDEX ** - Each statistic is associated with the image whose score is the closest to the statistic's value. - - ** REPEATED VALUES ** - It is possible that two stats have the same value (e.g. the median and the 25th percentile can be the same). - Such cases are handled according to `repeated_policy`: - - None | "none": do not address the issue, so several stats can have the same value and image index. - - "avoid": avoid repeated values by iterativealy looking for other images with similar score, whose score - must be within `repeated_replacement_atol` (absolute tolerance) of the repeated value. - - Args: - per_image_scores (Tensor): 1D Tensor of per-image scores. - images_classes (Tensor | None): - Used to filter statistics to only one class. If None, all images are considered. - If given, 1D Tensor of binary image classes (0 for 'normal', 1 for 'anomalous'). Defaults to None. - only_class (int | None): - Only used if `images_classes` is not None. - If not None, only compute statistics for images of the given class. - `None` means both image classes are used. - Defaults to None. - outliers_policy (str | None): How to handle outliers stats (use them?). See `OutliersPolicy`. Defaults to None. - repeated_policy (str | None): How to handle repeated values in boxplot stats (two stats with same value). - See `RepeatedPolicy`. Defaults to None. - repeated_replacement_atol (float): Absolute tolerance used to replace repeated values. Only used if - `repeated_policy` is not None (or 'none'). Defaults to 1e-2 (1%). - - Returns: - list[dict[str, str | int | float]]: List of boxplot statistics. - - Each dictionary has the following keys: - - 'stat_name': Name of the statistic. Possible values: - - 'mean': Mean of the scores. - - 'med': Median of the scores. - - 'q1': 25th percentile of the scores. - - 'q3': 75th percentile of the scores. - - 'whishi': Upper whisker value. - - 'whislo': Lower whisker value. - - 'outlo_i': low outlier value; `i` is a unique index for each low outlier. 
- - 'outhi_j': high outlier value; `j` is a unique index for each high outlier. - - 'stat_value': Value of the statistic (same units as `values`). - - 'image_idx': Index of the image in `per_image_scores` whose score is the closest to the statistic's value. - - 'score': The score of the image at index `image_idx` (not necessarily the same as `stat_value`). - - The list is sorted by increasing `stat_value`. - """ - # other validations happen inside `utils_numpy.per_image_scores_stats` - - outliers_policy = StatsOutliersPolicy(outliers_policy) - repeated_policy = StatsRepeatedPolicy(repeated_policy) - _validate.is_per_image_scores(per_image_scores) - - # restrain the images to the class `only_class` if given, else use all images - if images_classes is None: - images_selection_mask = torch.ones_like(per_image_scores, dtype=bool) - - elif only_class is not None: - _validate.is_images_classes(images_classes) - _validate.is_same_shape(per_image_scores, images_classes) - _validate.is_image_class(only_class) - images_selection_mask = images_classes == only_class - - else: - images_selection_mask = torch.ones_like(per_image_scores, dtype=bool) - - # indexes in `per_image_scores` are referred to as `candidate_idx` - # while the indexes in the original array are referred to as `image_idx` - # - `candidate_idx` works for `per_image_scores` and `candidate2image_idx` (see below) - # - `image_idx` works for `images_classes` and `images_idxs_selected` - per_image_scores = per_image_scores[images_selection_mask] - # converts `candidate_idx` to `image_idx` - candidate2image_idx = torch.nonzero(images_selection_mask, as_tuple=True)[0] - - # function used in `matplotlib.boxplot` - boxplot_stats = mpl.cbook.boxplot_stats(per_image_scores)[0] # [0] is for the only boxplot - - # remove unnecessary keys - boxplot_stats = {name: value for name, value in boxplot_stats.items() if name not in {"iqr", "cilo", "cihi"}} - - # unroll `fliers` (outliers), remove unnecessary ones according to `outliers_policy`, - # then add them to `boxplot_stats` with unique keys - outliers = boxplot_stats.pop("fliers") - outliers_lo = outliers[outliers < boxplot_stats["med"]] - outliers_hi = outliers[outliers > boxplot_stats["med"]] - - if outliers_policy in {StatsOutliersPolicy.HIGH, StatsOutliersPolicy.BOTH}: - boxplot_stats = { - **boxplot_stats, - **{f"outhi_{idx:06}": value for idx, value in enumerate(outliers_hi)}, - } - - if outliers_policy in {StatsOutliersPolicy.LOW, StatsOutliersPolicy.BOTH}: - boxplot_stats = { - **boxplot_stats, - **{f"outlo_{idx:06}": value for idx, value in enumerate(outliers_lo)}, - } - - # state variables for the stateful function `append_record` below - images_idxs_selected: set[int] = set() - records: list[dict[str, str | int | float]] = [] - - def append_record(stat_name: str, stat_value: float) -> None: - candidates_sorted = torch.abs(per_image_scores - stat_value).argsort() - candidate_idx = candidates_sorted[0] - image_idx = candidate2image_idx[candidate_idx] - - # handle repeated values - if image_idx not in images_idxs_selected or repeated_policy == StatsRepeatedPolicy.NONE: - pass - - elif repeated_policy == StatsRepeatedPolicy.AVOID: - for other_candidate_idx in candidates_sorted: - other_candidate_image_idx = candidate2image_idx[other_candidate_idx] - if other_candidate_image_idx in images_idxs_selected: - continue - # if the code reaches here, it means that `other_candidate_image_idx` is not in `images_idxs_selected` - # i.e. 
this image has not been selected yet, so it can be used - other_candidate_score = per_image_scores[other_candidate_idx] - # if the other candidate is not too far from the value, use it - # note that the first choice has not changed, so if no other is selected in the loop - # it will be the first choice - if torch.isclose(other_candidate_score, stat_value, atol=repeated_replacement_atol): - candidate_idx = other_candidate_idx - image_idx = other_candidate_image_idx - break - - images_idxs_selected.add(image_idx) - records.append( - { - "stat_name": stat_name, - "stat_value": float(stat_value), - "image_idx": int(image_idx), - "score": float(per_image_scores[candidate_idx]), - }, - ) - - # loop over the stats from the lowest to the highest value - for stat, val in sorted(boxplot_stats.items(), key=lambda x: x[1]): - append_record(stat, val) - return sorted(records, key=lambda r: r["score"]) - - -def compare_models_pairwise_ttest_rel( - scores_per_model: dict[str, Tensor] - | OrderedDict[str, Tensor] - | dict[str, "AUPIMOResult"] - | OrderedDict[str, "AUPIMOResult"], - alternative: str, - higher_is_better: bool, -) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: - """Compare all pairs of models using the paired t-test on two related samples (parametric). - - ***Torch-oriented interface for `.numpy_utils.compare_models_pairwise_ttest_rel`*** - - This is a test for the null hypothesis that two repeated samples have identical average (expected) values. - In fact, it tests whether the average of the differences between the two samples is significantly different from 0. - - Refs: - - `scipy.stats.ttest_rel`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html - - Wikipedia page: https://en.wikipedia.org/wiki/Student's_t-test#Dependent_t-test_for_paired_samples - - === - - If an ordered dictionary is given, the models are sorted by the order of the dictionary. - Otherwise, the models are sorted by average SCORE. - - Args: - scores_per_model: Dictionary of `n` models and their per-image scores. - key: model name - value: tensor of shape (num_images,). All `nan` values must be at the same positions. - higher_is_better: Whether higher values of score are better or worse. Defaults to True. - alternative: Alternative hypothesis for the statistical tests. See `confidences` in "Returns" section. - Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`. - - Returns: - (models_ordered, test_results): - - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). - - Automatic sorting is by average score from best to worst model. - Depending on `higher_is_better`, this corresponds to: - - `higher_is_better=True` ==> descending score order - - `higher_is_better=False` ==> ascending score order - along the indices from 0 to `n-1`. - - - confidences: Dictionary of confidence values for each pair of models. - - For all pairs of indices i and j from 0 to `n-1` such that i != j: - - key: (models_ordered[i], models_ordered[j]) - - value: confidence on the alternative hypothesis. - - For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: - - if `less`: model[i] < model[j] - - if `greater`: model[i] > model[j] - - if `two-sided`: model[i] != model[j] - in termos of average score. 
- """ - _validate.is_scores_per_model(scores_per_model) - scores_per_model_items = [ - ( - model_name, - (scores if isinstance(scores, Tensor) else scores.aupimos), - ) - for model_name, scores in scores_per_model.items() - ] - cls = OrderedDict if isinstance(scores_per_model, OrderedDict) else dict - scores_per_model_with_arrays = cls(scores_per_model_items) - - _validate.is_scores_per_model(scores_per_model_with_arrays) - StatsAlternativeHypothesis(alternative) - - # remove nan values; list of items keeps the order of the OrderedDict - scores_per_model_nonan_items = [ - (model_name, scores[~torch.isnan(scores)]) for model_name, scores in scores_per_model_with_arrays.items() - ] - - # sort models by average value if not an ordered dictionary - # position 0 is assumed the best model - if isinstance(scores_per_model_with_arrays, OrderedDict): - scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items) - else: - scores_per_model_nonan = OrderedDict( - sorted(scores_per_model_nonan_items, key=lambda kv: kv[1].mean(), reverse=higher_is_better), - ) - - models_ordered = tuple(scores_per_model_nonan.keys()) - models_pairs = list(itertools.permutations(models_ordered, 2)) - confidences: dict[tuple[str, str], float] = {} - for model_i, model_j in models_pairs: - values_i = scores_per_model_nonan[model_i] - values_j = scores_per_model_nonan[model_j] - pvalue = scipy.stats.ttest_rel( - values_i, - values_j, - alternative=alternative, - ).pvalue - confidences[model_i, model_j] = 1.0 - float(pvalue) - - return models_ordered, confidences - - -def compare_models_pairwise_wilcoxon( - scores_per_model: dict[str, Tensor] - | OrderedDict[str, Tensor] - | dict[str, "AUPIMOResult"] - | OrderedDict[str, "AUPIMOResult"], - alternative: str, - higher_is_better: bool, - atol: float | None = 1e-3, -) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: - """Compare all pairs of models using the Wilcoxon signed-rank test (non-parametric). - - ***Torch-oriented interface for `.numpy_utils.compare_models_pairwise_wilcoxon`*** - - Each comparison of two models is a Wilcoxon signed-rank test (null hypothesis is that they are equal). - - It tests whether the distribution of the differences of scores is symmetric about zero in a non-parametric way. - This is like the non-parametric version of the paired t-test. - - Refs: - - `scipy.stats.wilcoxon`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html#scipy.stats.wilcoxon - - Wikipedia page: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test - - === - - If an ordered dictionary is given, the models are sorted by the order of the dictionary. - Otherwise, the models are sorted by average RANK. - - Args: - scores_per_model: Dictionary of `n` models and their per-image scores. - key: model name - value: tensor of shape (num_images,). All `nan` values must be at the same positions. - higher_is_better: Whether higher values of score are better or worse. Defaults to True. - alternative: Alternative hypothesis for the statistical tests. See `confidences` in "Returns" section. - Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`. - atol: Absolute tolerance used to consider two scores as equal. Defaults to 1e-3 (0.1%). - When doing a paired test, if the difference between two scores is below `atol`, the difference is - truncated to 0. If `atol` is None, no truncation is done. - - Returns: - (models_ordered, test_results): - - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). 
- - Automatic sorting is from "best to worst" model, which corresponds to ascending average rank - along the indices from 0 to `n-1`. - - - confidences: Dictionary of confidence values for each pair of models. - - For all pairs of indices i and j from 0 to `n-1` such that i != j: - - key: (models_ordered[i], models_ordered[j]) - - value: confidence on the alternative hypothesis. - - For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: - - if `less`: model[i] < model[j] - - if `greater`: model[i] > model[j] - - if `two-sided`: model[i] != model[j] - in terms of average ranks (not scores!). - """ - _validate.is_scores_per_model(scores_per_model) - scores_per_model_items = [ - ( - model_name, - (scores if isinstance(scores, Tensor) else scores.aupimos), - ) - for model_name, scores in scores_per_model.items() - ] - cls = OrderedDict if isinstance(scores_per_model, OrderedDict) else dict - scores_per_model_with_arrays = cls(scores_per_model_items) - - _validate.is_scores_per_model(scores_per_model_with_arrays) - StatsAlternativeHypothesis(alternative) - - # remove nan values; list of items keeps the order of the OrderedDict - scores_per_model_nonan_items = [ - (model_name, scores[~torch.isnan(scores)]) for model_name, scores in scores_per_model_with_arrays.items() - ] - - # sort models by average value if not an ordered dictionary - # position 0 is assumed the best model - if isinstance(scores_per_model_with_arrays, OrderedDict): - scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items) - else: - # these average ranks will NOT consider `atol` because we want to rank the models anyway - scores_nonan = torch.stack([v for _, v in scores_per_model_nonan_items], axis=0) - avg_ranks = scipy.stats.rankdata( - -scores_nonan if higher_is_better else scores_nonan, - method="average", - axis=0, - ).mean(axis=1) - # ascending order, lower score is better --> best to worst model - argsort_avg_ranks = avg_ranks.argsort() - scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items[idx] for idx in argsort_avg_ranks) - - models_ordered = tuple(scores_per_model_nonan.keys()) - models_pairs = list(itertools.permutations(models_ordered, 2)) - confidences: dict[tuple[str, str], float] = {} - for model_i, model_j in models_pairs: - values_i = scores_per_model_nonan[model_i] - values_j = scores_per_model_nonan[model_j] - diff = values_i - values_j - - if atol is not None: - # make the difference null if below the tolerance - diff[torch.abs(diff) <= atol] = 0.0 - - # extreme case - if (diff == 0).all(): # noqa: SIM108 - pvalue = 1.0 - else: - pvalue = scipy.stats.wilcoxon(diff, alternative=alternative).pvalue - confidences[model_i, model_j] = 1.0 - float(pvalue) - - return models_ordered, confidences - - -def format_pairwise_tests_results( - models_ordered: tuple[str, ...], - confidences: dict[tuple[str, str], float], - model1_as_column: bool = True, - left_to_right: bool = False, - top_to_bottom: bool = False, -) -> DataFrame: - """Format the results of pairwise tests into a square dataframe. - - The confidence values refer to the confidence level (in [0, 1]) on the alternative hypothesis, - which is formulated as "`model1` `model2`", where `` can be '<', '>', or '!='. - - HOW TO READ THE DATAFRAME - ========================= - There are 6 possible ways to read the dataframe, depending on the values of `model1_as_column` and `alternative` - (from the pairwise test function that generated `confidences`). 
- - *column* and *row* below refer to a generic column and row value (model names) in the dataframe. - - if ( - model1_as_column == True and alternative == 'less' - or model1_as_column == False and alternative == 'greater' - ) - read: "column < row" - equivalently: "row > column" - - elif ( - model1_as_column == True and alternative == 'greater' - or model1_as_column == False and alternative == 'less' - ) - read: "column > row" - equivalently: "row < column" - - else: # alternative == 'two-sided' - read: "column != row" - equivalently: "row != column" - - Args: - models_ordered: The models ordered in a meaningful way, generally from best to worst when automatically ordered. - confidences: The confidence on the alternative hypothesis, as returned by the pairwise test function. - model1_as_column: Whether to put `model1` as column or row in the dataframe. - left_to_right: Whether to order the columns from best to worst model as left to right. - top_to_bottom: Whether to order the rows from best to worst model as top to bottom. - Default column/row ordering is from worst to best model (left to right, top to bottom), - so the upper left corner is the worst model compared to itself, and the bottom right corner is the best - model compared to itself. - - """ - _validate.is_models_ordered(models_ordered) - _validate.is_confidences(confidences) - _validate.joint_validate_models_ordered_and_confidences(models_ordered, confidences) - confidences = deepcopy(confidences) - confidences.update({(model, model): torch.nan for model in models_ordered}) - # `df` stands for `dataframe` - confdf = pd.DataFrame(confidences, index=["confidence"]).T - confdf.index.names = ["model1", "model2"] - confdf = confdf.reset_index() - confdf["model1"] = pd.Categorical(confdf["model1"], categories=models_ordered, ordered=True) - confdf["model2"] = pd.Categorical(confdf["model2"], categories=models_ordered, ordered=True) - # df at this point: 3 columns: model1, model2, confidence - index_model, column_model = ("model2", "model1") if model1_as_column else ("model1", "model2") - confdf = confdf.pivot_table(index=index_model, columns=column_model, values="confidence", dropna=False, sort=False) - # now it is a square dataframe with models as index and columns, and confidence as values - confdf = confdf.sort_index(axis=0, ascending=top_to_bottom) - return confdf.sort_index(axis=1, ascending=left_to_right) - - -def images_classes_from_masks(masks: torch.Tensor) -> torch.Tensor: - """Deduce the image classes from the masks.""" - return (masks == 1).any(axis=(1, 2)).to(torch.int32) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/pimo/__init__.py similarity index 62% rename from src/anomalib/metrics/per_image/__init__.py rename to src/anomalib/metrics/pimo/__init__.py index b678c3dc40..c131b616c1 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/pimo/__init__.py @@ -9,12 +9,6 @@ from .enums import StatsOutliersPolicy, StatsRepeatedPolicy, ThresholdMethod from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult -from .utils import ( - compare_models_pairwise_ttest_rel, - compare_models_pairwise_wilcoxon, - format_pairwise_tests_results, - per_image_scores_stats, -) __all__ = [ # constants @@ -27,9 +21,4 @@ # torchmetrics interfaces "PIMO", "AUPIMO", - # utils - "compare_models_pairwise_ttest_rel", - "compare_models_pairwise_wilcoxon", - "format_pairwise_tests_results", - "per_image_scores_stats", ] diff --git a/src/anomalib/metrics/per_image/_validate.py 
b/src/anomalib/metrics/pimo/_validate.py similarity index 66% rename from src/anomalib/metrics/per_image/_validate.py rename to src/anomalib/metrics/pimo/_validate.py index e0f8f41d91..c758c6f6ab 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/pimo/_validate.py @@ -12,17 +12,12 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from collections import OrderedDict -from typing import TYPE_CHECKING import torch from torch import Tensor from .utils import images_classes_from_masks -if TYPE_CHECKING: - from .pimo import AUPIMOResult - logger = logging.getLogger(__name__) @@ -139,10 +134,6 @@ def is_thresh_bounds(thresh_bounds: tuple[float, float]) -> None: def is_anomaly_maps(anomaly_maps: Tensor) -> None: - if not isinstance(anomaly_maps, Tensor): - msg = f"Expected anomaly maps to be an Tensor, but got {type(anomaly_maps)}" - raise TypeError(msg) - if anomaly_maps.ndim != 3: msg = f"Expected anomaly maps have 3 dimensions (N, H, W), but got {anomaly_maps.ndim} dimensions" raise ValueError(msg) @@ -156,10 +147,6 @@ def is_anomaly_maps(anomaly_maps: Tensor) -> None: def is_masks(masks: Tensor) -> None: - if not isinstance(masks, Tensor): - msg = f"Expected masks to be an Tensor, but got {type(masks)}" - raise TypeError(msg) - if masks.ndim != 3: msg = f"Expected masks have 3 dimensions (N, H, W), but got {masks.ndim} dimensions" raise ValueError(msg) @@ -185,10 +172,6 @@ def is_masks(masks: Tensor) -> None: def is_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None) -> None: - if not isinstance(binclf_curves, Tensor): - msg = f"Expected binclf curves to be an Tensor, but got {type(binclf_curves)}" - raise TypeError(msg) - if binclf_curves.ndim != 4: msg = f"Expected binclf curves to be 4D, but got {binclf_curves.ndim}D" raise ValueError(msg) @@ -229,10 +212,6 @@ def is_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None) -> Non def is_images_classes(images_classes: Tensor) -> None: - if not isinstance(images_classes, Tensor): - msg = f"Expected image classes to be an Tensor, but got {type(images_classes)}." - raise TypeError(msg) - if images_classes.ndim != 1: msg = f"Expected image classes to be 1D, but got {images_classes.ndim}D." raise ValueError(msg) @@ -258,10 +237,6 @@ def is_images_classes(images_classes: Tensor) -> None: def is_rates(rates: Tensor, nan_allowed: bool) -> None: - if not isinstance(rates, Tensor): - msg = f"Expected rates to be an Tensor, but got {type(rates)}." - raise TypeError(msg) - if rates.ndim != 1: msg = f"Expected rates to be 1D, but got {rates.ndim}D." raise ValueError(msg) @@ -307,10 +282,6 @@ def is_rate_curve(rate_curve: Tensor, nan_allowed: bool, decreasing: bool) -> No def is_per_image_rate_curves(rate_curves: Tensor, nan_allowed: bool, decreasing: bool | None) -> None: - if not isinstance(rate_curves, Tensor): - msg = f"Expected per-image rate curves to be an Tensor, but got {type(rate_curves)}." - raise TypeError(msg) - if rate_curves.ndim != 2: msg = f"Expected per-image rate curves to be 2D, but got {rate_curves.ndim}D." raise ValueError(msg) @@ -454,177 +425,3 @@ def is_image_class(image_class: int) -> None: if image_class not in {0, 1}: msg = f"Expected image class to be either 0 for 'normal' or 1 for 'anomalous', but got {image_class}." raise ValueError(msg) - - -def is_models_ordered(models_ordered: tuple[str, ...]) -> None: - if not isinstance(models_ordered, tuple): - msg = f"Expected models ordered to be a tuple, but got {type(models_ordered)}." 
- raise TypeError(msg) - - if len(models_ordered) < 2: - msg = f"Expected models ordered to have at least 2 models, but got {len(models_ordered)}." - raise ValueError(msg) - - for model_name in models_ordered: - if not isinstance(model_name, str): - msg = f"Expected model name to be a string, but got {type(model_name)} for model {model_name}." - raise TypeError(msg) - - if model_name == "": - msg = "Expected model name to be non-empty, but got empty string." - raise ValueError(msg) - - num_redundant_models = len(models_ordered) - len(set(models_ordered)) - if num_redundant_models > 0: - msg = f"Expected models ordered to have unique models, but got {num_redundant_models} redundant models." - raise ValueError(msg) - - -def is_confidences(confidences: dict[tuple[str, str], float]) -> None: - if not isinstance(confidences, dict): - msg = f"Expected confidences to be a dict, but got {type(confidences)}." - raise TypeError(msg) - - for (model1, model2), confidence in confidences.items(): - if not isinstance(model1, str): - msg = f"Expected model name to be a string, but got {type(model1)} for model {model1}." - raise TypeError(msg) - - if not isinstance(model2, str): - msg = f"Expected model name to be a string, but got {type(model2)} for model {model2}." - raise TypeError(msg) - - if not isinstance(confidence, float): - msg = f"Expected confidence to be a float, but got {type(confidence)} for models {model1} and {model2}." - raise TypeError(msg) - - if not (0 <= confidence <= 1): - msg = f"Expected confidence to be between 0 and 1, but got {confidence} for models {model1} and {model2}." - raise ValueError(msg) - - -def joint_validate_models_ordered_and_confidences( - models_ordered: tuple[str, ...], - confidences: dict[tuple[str, str], float], -) -> None: - num_models = len(models_ordered) - expected_num_pairs = num_models * (num_models - 1) - - if len(confidences) != expected_num_pairs: - msg = f"Expected {expected_num_pairs} pairs of models, but got {len(confidences)} pairs of models." - raise ValueError(msg) - - models_in_confidences = {model for pair_models in confidences for model in pair_models} - - diff = set(models_ordered).symmetric_difference(models_in_confidences) - if len(diff) > 0: - msg = ( - "Expected models in confidences to be the same as models ordered, but got models missing in one" - f"of them: {diff}." - ) - raise ValueError(msg) - - -def is_scores_per_model_tensor(scores_per_model: dict[str, Tensor] | OrderedDict[str, Tensor]) -> None: - first_key_value = None - - for model_name, scores in scores_per_model.items(): - if scores.ndim != 1: - msg = f"Expected scores to be 1D, but got {scores.ndim}D for model {model_name}." - raise ValueError(msg) - - num_valid_scores = scores[~torch.isnan(scores)].numel() - - if num_valid_scores < 1: - msg = f"Expected at least 1 non-nan score, but got {num_valid_scores} for model {model_name}." - raise ValueError(msg) - - if first_key_value is None: - first_key_value = (model_name, scores) - continue - - first_model_name, first_scores = first_key_value - - # same shape - if scores.shape[0] != first_scores.shape[0]: - msg = ( - "Expected scores to have the same number of scores, " - f"but got ({model_name}) {scores.shape[0]} != {first_scores.shape[0]} ({first_model_name})." 
- ) - raise ValueError(msg) - - # `nan` at the same indices - if (torch.isnan(scores) != torch.isnan(first_scores)).any(): - msg = ( - "Expected `nan` values, if any, to be at the same indices, " - f"but there are differences between models {model_name} and {first_model_name}." - ) - raise ValueError(msg) - - -def is_scores_per_model_aupimoresult( - scores_per_model: dict[str, "AUPIMOResult"] | OrderedDict[str, "AUPIMOResult"], -) -> None: - first_key_value = None - - for model_name, aupimoresult in scores_per_model.items(): - if first_key_value is None: - first_key_value = (model_name, aupimoresult) - continue - - first_model_name, first_aupimoresult = first_key_value - - if aupimoresult.fpr_bounds != first_aupimoresult.fpr_bounds: - msg = ( - "Expected AUPIMOResult objects in scores per model to have the same FPR bounds, " - f"but got ({model_name}) {aupimoresult.fpr_bounds} != " - f"{first_aupimoresult.fpr_bounds} ({first_model_name})." - ) - raise ValueError(msg) - - -def is_scores_per_model( - scores_per_model: dict[str, Tensor] - | OrderedDict[str, Tensor] - | dict[str, "AUPIMOResult"] - | OrderedDict[str, "AUPIMOResult"], -) -> None: - # it has to be imported here to avoid circular imports - from .pimo import AUPIMOResult - - if not isinstance(scores_per_model, dict | OrderedDict): - msg = f"Expected scores per model to be a dictionary or ordered dictionary, but got {type(scores_per_model)}." - raise TypeError(msg) - - if len(scores_per_model) < 2: - msg = f"Expected scores per model to have at least 2 models, but got {len(scores_per_model)}." - raise ValueError(msg) - - if not all(isinstance(model_name, str) for model_name in scores_per_model): - msg = "Expected scores per model to have model names (strings) as keys." - raise TypeError(msg) - - first_instance = next(iter(scores_per_model.values())) - - if ( - isinstance(first_instance, Tensor) - and any(not isinstance(scores, Tensor) for scores in scores_per_model.values()) - ) or ( - isinstance(first_instance, AUPIMOResult) - and any(not isinstance(scores, AUPIMOResult) for scores in scores_per_model.values()) - ): - msg = ( - "Values in the scores per model dict must have the same type for values (Tensor or AUPIMOResult), " - "but more than one type was found." - ) - raise TypeError(msg) - - if isinstance(first_instance, Tensor): - is_scores_per_model_tensor(scores_per_model) - return - - is_scores_per_model_tensor( - {model_name: scores.aupimos for model_name, scores in scores_per_model.items()}, - ) - - is_scores_per_model_aupimoresult(scores_per_model) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/pimo/binary_classification_curve.py similarity index 90% rename from src/anomalib/metrics/per_image/binclf_curve.py rename to src/anomalib/metrics/pimo/binary_classification_curve.py index 7013eb08a7..325897c701 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/pimo/binary_classification_curve.py @@ -18,7 +18,6 @@ import numpy as np import torch -from numpy import ndarray from . import _validate from .enums import ThresholdMethod @@ -26,8 +25,8 @@ logger = logging.getLogger(__name__) -def _binclf_one_curve(scores: ndarray, gts: ndarray, threshs: ndarray) -> ndarray: - """One binary classification matrix at each threshold (PYTHON implementation). +def _binary_classification_curve(scores: np.ndarray, gts: np.ndarray, threshs: np.ndarray) -> np.ndarray: + """One binary classification matrix at each threshold. In the case where the thresholds are given (i.e. 
not considering all possible thresholds based on the scores), this weird-looking function is faster than the two options in `torchmetrics` on the CPU: @@ -37,12 +36,12 @@ def _binclf_one_curve(scores: ndarray, gts: ndarray, threshs: ndarray) -> ndarra Note: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function. Args: - scores (ndarray): Anomaly scores (D,). - gts (ndarray): Binary (bool) ground truth of shape (D,). - threshs (ndarray): Sequence of thresholds in ascending order (K,). + scores (np.ndarray): Anomaly scores (D,). + gts (np.ndarray): Binary (bool) ground truth of shape (D,). + threshs (np.ndarray): Sequence of thresholds in ascending order (K,). Returns: - ndarray: Binary classification matrix curve (K, 2, 2) + np.ndarray: Binary classification matrix curve (K, 2, 2) Details: `anomalib.metrics.per_image.binclf_curve_numpy.binclf_multiple_curves`. """ num_th = len(threshs) @@ -95,14 +94,14 @@ def score_less_than_thresh(score: float, thresh: float) -> bool: ).transpose(0, 2, 1) -def binclf_multiple_curves( +def binary_classification_curve( scores_batch: torch.Tensor, gts_batch: torch.Tensor, threshs: torch.Tensor, ) -> torch.Tensor: - """Multiple binary classification matrix (per-instance scope) at each threshold (shared). + """Returns a binary classification matrix at each threshold for each image in the batch. - This is a wrapper around `_binclf_multiple_curves_python` and `_binclf_multiple_curves_numba`. + This is a wrapper around `_binary_classification_curve`. Validation of the arguments is done here (not in the actual implementation functions). Note: predicted as positive condition is `score >= thresh`. @@ -143,7 +142,7 @@ def binclf_multiple_curves( _validate.is_threshs(threshs) # TODO(ashwinvaidya17): this is kept as numpy for now because it is much faster. # TEMP-0 - result = np.vectorize(_binclf_one_curve, signature="(n),(n),(k)->(k,2,2)")( + result = np.vectorize(_binary_classification_curve, signature="(n),(n),(k)->(k,2,2)")( scores_batch.detach().cpu().numpy(), gts_batch.detach().cpu().numpy(), threshs.detach().cpu().numpy(), @@ -151,12 +150,9 @@ def binclf_multiple_curves( return torch.from_numpy(result).to(scores_batch.device) -# ========================================= PER-IMAGE BINCLF CURVE ========================================= - - -def _get_threshs_minmax_linspace(anomaly_maps: torch.Tensor, num_threshs: int) -> torch.Tensor: +def _get_threshs_minmax_linspace(anomaly_maps: torch.Tensor, num_thresholds: int) -> torch.Tensor: """Get thresholds linearly spaced between the min and max of the anomaly maps.""" - _validate.is_num_threshs_gte2(num_threshs) + _validate.is_num_threshs_gte2(num_thresholds) # this operation can be a bit expensive thresh_low, thresh_high = thresh_bounds = (anomaly_maps.min().item(), anomaly_maps.max().item()) try: @@ -164,17 +160,17 @@ def _get_threshs_minmax_linspace(anomaly_maps: torch.Tensor, num_threshs: int) - except ValueError as ex: msg = f"Invalid threshold bounds computed from the given anomaly maps. 
Cause: {ex}" raise ValueError(msg) from ex - return torch.linspace(thresh_low, thresh_high, num_threshs, dtype=anomaly_maps.dtype) + return torch.linspace(thresh_low, thresh_high, num_thresholds, dtype=anomaly_maps.dtype) -def per_image_binclf_curve( +def threshold_and_binary_classification_curve( anomaly_maps: torch.Tensor, masks: torch.Tensor, threshs_choice: ThresholdMethod | str = ThresholdMethod.MINMAX_LINSPACE.value, threshs_given: torch.Tensor | None = None, num_threshs: int | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: - """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared). + """Return thresholds and binary classification matrix at each threshold for each image in the batch. Args: anomaly_maps (torch.Tensor): Anomaly score maps of shape (N, H, W) @@ -259,7 +255,7 @@ def per_image_binclf_curve( scores_batch = anomaly_maps.reshape(anomaly_maps.shape[0], -1) gts_batch = masks.reshape(masks.shape[0], -1).to(bool) # make sure it is boolean - binclf_curves = binclf_multiple_curves(scores_batch, gts_batch, threshs) + binclf_curves = binary_classification_curve(scores_batch, gts_batch, threshs) num_images = anomaly_maps.shape[0] diff --git a/src/anomalib/metrics/per_image/dataclasses.py b/src/anomalib/metrics/pimo/dataclasses.py similarity index 100% rename from src/anomalib/metrics/per_image/dataclasses.py rename to src/anomalib/metrics/pimo/dataclasses.py diff --git a/src/anomalib/metrics/per_image/enums.py b/src/anomalib/metrics/pimo/enums.py similarity index 100% rename from src/anomalib/metrics/per_image/enums.py rename to src/anomalib/metrics/pimo/enums.py diff --git a/src/anomalib/metrics/per_image/functional.py b/src/anomalib/metrics/pimo/functional.py similarity index 96% rename from src/anomalib/metrics/per_image/functional.py rename to src/anomalib/metrics/pimo/functional.py index a217333110..ea5d9690f3 100644 --- a/src/anomalib/metrics/per_image/functional.py +++ b/src/anomalib/metrics/pimo/functional.py @@ -15,7 +15,13 @@ import numpy as np import torch -from . import _validate, binclf_curve +from . 
import _validate +from .binary_classification_curve import ( + _get_threshs_minmax_linspace, + per_image_fpr, + per_image_tpr, + threshold_and_binary_classification_curve, +) from .enums import ThresholdMethod from .utils import images_classes_from_masks @@ -67,14 +73,14 @@ def pimo_curves( # therefore getting a better resolution in terms of FPR quantization # otherwise the function `binclf_curve_numpy.per_image_binclf_curve` would have the range of thresholds # computed from all the images (normal + anomalous) - threshs = binclf_curve._get_threshs_minmax_linspace( # noqa: SLF001 + threshs = _get_threshs_minmax_linspace( anomaly_maps[image_classes == 0], num_threshs, ) # N: number of images, K: number of thresholds # shapes are (K,) and (N, K, 2, 2) - threshs, binclf_curves = binclf_curve.per_image_binclf_curve( + threshs, binclf_curves = threshold_and_binary_classification_curve( anomaly_maps=anomaly_maps, masks=masks, threshs_choice=ThresholdMethod.GIVEN.value, @@ -85,7 +91,7 @@ def pimo_curves( shared_fpr: torch.Tensor # mean-per-image-fpr on normal images # shape -> (N, K) - per_image_fprs_normals = binclf_curve.per_image_fpr(binclf_curves[image_classes == 0]) + per_image_fprs_normals = per_image_fpr(binclf_curves[image_classes == 0]) try: _validate.is_per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True) except ValueError as ex: @@ -98,7 +104,7 @@ def pimo_curves( shared_fpr = per_image_fprs_normals.mean(axis=0) # shape -> (N, K) - per_image_tprs = binclf_curve.per_image_tpr(binclf_curves) + per_image_tprs = per_image_tpr(binclf_curves) return threshs, shared_fpr, per_image_tprs, image_classes diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/pimo/pimo.py similarity index 100% rename from src/anomalib/metrics/per_image/pimo.py rename to src/anomalib/metrics/pimo/pimo.py diff --git a/src/anomalib/metrics/pimo/utils.py b/src/anomalib/metrics/pimo/utils.py new file mode 100644 index 0000000000..f0cac45657 --- /dev/null +++ b/src/anomalib/metrics/pimo/utils.py @@ -0,0 +1,19 @@ +"""Torch-oriented interfaces for `utils.py`.""" + +# Original Code +# https://github.com/jpcbertoldo/aupimo +# +# Modified +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging + +import torch + +logger = logging.getLogger(__name__) + + +def images_classes_from_masks(masks: torch.Tensor) -> torch.Tensor: + """Deduce the image classes from the masks.""" + return (masks == 1).any(axis=(1, 2)).to(torch.int32) diff --git a/tests/unit/data/utils/test_path.py b/tests/unit/data/utils/test_path.py index f1764b7373..09f88496ad 100644 --- a/tests/unit/data/utils/test_path.py +++ b/tests/unit/data/utils/test_path.py @@ -81,4 +81,4 @@ def test_no_read_execute_permission() -> None: def test_file_wrongsuffix() -> None: """Test ``validate_path`` raises ValueError for a file with wrong suffix.""" with pytest.raises(ValueError, match="Path extension is not accepted."): - validate_path("file.png", should_exist=False, accepted_extensions=(".json", ".txt")) + validate_path("file.png", should_exist=False, extensions=(".json", ".txt")) diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py deleted file mode 100644 index f08bdd56b9..0000000000 --- a/tests/unit/metrics/per_image/test_utils.py +++ /dev/null @@ -1,307 +0,0 @@ -"""Test `utils.py`.""" - -# Original Code -# https://github.com/jpcbertoldo/aupimo -# -# Modified -# Copyright (C) 2024 Intel Corporation 
-# SPDX-License-Identifier: Apache-2.0 - -from collections import OrderedDict - -import numpy as np -import pytest -import torch -from torch import Tensor - -from anomalib.metrics.per_image import ( - AUPIMOResult, - StatsOutliersPolicy, - StatsRepeatedPolicy, - compare_models_pairwise_ttest_rel, - compare_models_pairwise_wilcoxon, - format_pairwise_tests_results, - per_image_scores_stats, -) - - -def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: - """Generate test cases.""" - num_images = 100 - # avg is 0.8 - aucs1 = 0.8 * torch.ones(num_images) - # avg ~ 0.7 - aucs2 = torch.linspace(0.6, 0.8, num_images) - # avg ~ 0.6 - aucs3 = torch.sin(torch.linspace(0, torch.pi, num_images)).clip(0, 1) - - mock_aupimoresult_stuff = { - "fpr_lower_bound": 1e-5, - "fpr_upper_bound": 1e-4, - "num_threshs": 1_000, - "thresh_lower_bound": 1.0, - "thresh_upper_bound": 2.0, - } - scores_per_model_dicts = [ - ({"a": aucs1, "b": aucs2},), - ({"a": aucs1, "b": aucs2, "c": aucs3},), - (OrderedDict([("c", aucs1), ("b", aucs2), ("a", aucs3)]),), - ( - { - "a": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs1}), - "b": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs2}), - "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3}), - }, - ), - ( - { - "a": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs1}), - "b": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs2}), - "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3}), - }, - ), - ] - - if ( - metafunc.function is test_compare_models_pairwise_ttest - or metafunc.function is test_compare_models_pairwise_wilcoxon - ): - metafunc.parametrize(("scores_per_model",), scores_per_model_dicts) - metafunc.parametrize( - ("alternative", "higher_is_better"), - [ - ("two-sided", True), - ("two-sided", False), - ("less", False), - ("greater", True), - # not considering the case (less, true) and (greater, false) because it will break - # some assumptions in the assertions but they are possible - ], - ) - - if metafunc.function is test_format_pairwise_tests_results: - metafunc.parametrize(("scores_per_model",), scores_per_model_dicts[:3]) - - -def assert_statsdict_stuff(statdic: dict, max_image_idx: int) -> None: - """Assert stuff about a `statdic`.""" - assert "stat_name" in statdic - stat_name = statdic["stat_name"] - assert stat_name in {"mean", "med", "q1", "q3", "whishi", "whislo"} or stat_name.startswith( - ("outlo_", "outhi_"), - ) - assert "stat_value" in statdic - assert "image_idx" in statdic - image_idx = statdic["image_idx"] - assert 0 <= image_idx <= max_image_idx - - -def test_per_image_scores_stats() -> None: - """Test `per_image_scores_boxplot_stats`.""" - gen = torch.Generator().manual_seed(42) - num_scores = 201 - scores = torch.randn(num_scores, generator=gen) - - stats = per_image_scores_stats(scores) - assert len(stats) == 6 - for statdic in stats: - assert_statsdict_stuff(statdic, num_scores - 1) - - classes = (torch.arange(num_scores) % 3 == 0).to(torch.long) - stats = per_image_scores_stats(scores, classes, only_class=None) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, classes, only_class=0) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, classes, only_class=1) - assert len(stats) == 6 - - stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.BOTH) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.LOW) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, 
outliers_policy=StatsOutliersPolicy.HIGH) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.NONE) - assert len(stats) == 6 - - # force repeated values - scores = torch.round(scores * 10) / 10 - stats = per_image_scores_stats(scores, repeated_policy=StatsRepeatedPolicy.AVOID) - assert len(stats) == 6 - stats = per_image_scores_stats( - scores, - classes, - repeated_policy=StatsRepeatedPolicy.AVOID, - repeated_replacement_atol=1e-1, - ) - assert len(stats) == 6 - stats = per_image_scores_stats(scores, repeated_policy=StatsRepeatedPolicy.NONE) - assert len(stats) == 6 - - -def test_per_image_scores_stats_specific_values() -> None: - """Test `per_image_scores_boxplot_stats` with specific values.""" - scores = torch.concatenate( - [ - # whislo = min value is 0.0 - torch.tensor([0.0]), - torch.zeros(98), - # q1 value is 0.0 - torch.tensor([0.0]), - torch.linspace(0.01, 0.29, 98), - # med value is 0.3 - torch.tensor([0.3]), - torch.linspace(0.31, 0.69, 99), - # q3 value is 0.7 - torch.tensor([0.7]), - torch.linspace(0.71, 0.99, 99), - # whishi = max value is 1.0 - torch.tensor([1.0]), - ], - ) - - stats = per_image_scores_stats(scores) - assert len(stats) == 6 - - statdict_whislo = stats[0] - statdict_q1 = stats[1] - statdict_med = stats[2] - statdict_mean = stats[3] - statdict_q3 = stats[4] - statdict_whishi = stats[5] - - assert statdict_whislo["stat_name"] == "whislo" - assert np.isclose(statdict_whislo["stat_value"], 0.0) - - assert statdict_q1["stat_name"] == "q1" - assert np.isclose(statdict_q1["stat_value"], 0.0, atol=1e-2) - - assert statdict_med["stat_name"] == "med" - assert np.isclose(statdict_med["stat_value"], 0.3, atol=1e-2) - - assert statdict_mean["stat_name"] == "mean" - assert np.isclose(statdict_mean["stat_value"], 0.3762, atol=1e-2) - - assert statdict_q3["stat_name"] == "q3" - assert np.isclose(statdict_q3["stat_value"], 0.7, atol=1e-2) - - assert statdict_whishi["stat_name"] == "whishi" - assert statdict_whishi["stat_value"] == 1.0 - - -def test_compare_models_pairwise_ttest(scores_per_model: dict, alternative: str, higher_is_better: bool) -> None: - """Test `compare_models_pairwise_ttest`.""" - models_ordered, confidences = compare_models_pairwise_ttest_rel( - scores_per_model, - alternative=alternative, - higher_is_better=higher_is_better, - ) - assert len(confidences) == (len(models_ordered) * (len(models_ordered) - 1)) - - diff = set(scores_per_model.keys()).symmetric_difference(set(models_ordered)) - assert len(diff) == 0 - - if isinstance(scores_per_model, OrderedDict): - assert models_ordered == tuple(scores_per_model.keys()) - - elif len(scores_per_model) == 2: - assert models_ordered == (("a", "b") if higher_is_better else ("b", "a")) - - elif len(scores_per_model) == 3: - assert models_ordered == (("a", "b", "c") if higher_is_better else ("c", "b", "a")) - - if isinstance(next(iter(scores_per_model.values())), AUPIMOResult): - return - - def copy_and_add_nan(scores: Tensor) -> Tensor: - scores = scores.clone() - scores[5:] = torch.nan - return scores - - # removing samples should reduce the confidences - scores_per_model["a"] = copy_and_add_nan(scores_per_model["a"]) - scores_per_model["b"] = copy_and_add_nan(scores_per_model["b"]) - if "c" in scores_per_model: - scores_per_model["c"] = copy_and_add_nan(scores_per_model["c"]) - - compare_models_pairwise_ttest_rel( - scores_per_model, - alternative=alternative, - higher_is_better=higher_is_better, - ) - - -def test_compare_models_pairwise_wilcoxon(scores_per_model: 
dict, alternative: str, higher_is_better: bool) -> None: - """Test `compare_models_pairwise_wilcoxon`.""" - models_ordered, confidences = compare_models_pairwise_wilcoxon( - scores_per_model, - alternative=alternative, - higher_is_better=higher_is_better, - ) - assert len(confidences) == (len(models_ordered) * (len(models_ordered) - 1)) - - diff = set(scores_per_model.keys()).symmetric_difference(set(models_ordered)) - assert len(diff) == 0 - - if isinstance(scores_per_model, OrderedDict): - assert models_ordered == tuple(scores_per_model.keys()) - - elif len(scores_per_model) == 2: - assert models_ordered == (("a", "b") if higher_is_better else ("b", "a")) - - elif len(scores_per_model) == 3: - # this one is not trivial without looking at the data, so no assertions - pass - - if isinstance(next(iter(scores_per_model.values())), AUPIMOResult): - return - - def copy_and_add_nan(scores: Tensor) -> Tensor: - scores = scores.clone() - scores[5:] = torch.nan - return scores - - # removing samples should reduce the confidences - scores_per_model["a"] = copy_and_add_nan(scores_per_model["a"]) - scores_per_model["b"] = copy_and_add_nan(scores_per_model["b"]) - if "c" in scores_per_model: - scores_per_model["c"] = copy_and_add_nan(scores_per_model["c"]) - - compare_models_pairwise_wilcoxon( - scores_per_model, - alternative=alternative, - higher_is_better=higher_is_better, - ) - - -def test_format_pairwise_tests_results(scores_per_model: dict) -> None: - """Test `format_pairwise_tests_results`.""" - models_ordered, confidences = compare_models_pairwise_wilcoxon( - scores_per_model, - alternative="greater", - higher_is_better=True, - ) - confidence_df = format_pairwise_tests_results( - models_ordered, - confidences, - model1_as_column=True, - left_to_right=True, - top_to_bottom=True, - ) - assert tuple(confidence_df.columns.tolist()) == models_ordered - assert tuple(confidence_df.index.tolist()) == models_ordered - - models_ordered, confidences = compare_models_pairwise_ttest_rel( - scores_per_model, - alternative="greater", - higher_is_better=True, - ) - confidence_df = format_pairwise_tests_results( - models_ordered, - confidences, - model1_as_column=True, - left_to_right=True, - top_to_bottom=True, - ) - assert tuple(confidence_df.columns.tolist()) == models_ordered - assert tuple(confidence_df.index.tolist()) == models_ordered diff --git a/tests/unit/metrics/per_image/__init__.py b/tests/unit/metrics/pimo/__init__.py similarity index 100% rename from tests/unit/metrics/per_image/__init__.py rename to tests/unit/metrics/pimo/__init__.py diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/pimo/test_binclf_curve.py similarity index 93% rename from tests/unit/metrics/per_image/test_binclf_curve.py rename to tests/unit/metrics/pimo/test_binclf_curve.py index eed53f3248..660421bdba 100644 --- a/tests/unit/metrics/per_image/test_binclf_curve.py +++ b/tests/unit/metrics/pimo/test_binclf_curve.py @@ -12,7 +12,13 @@ import pytest import torch -from anomalib.metrics.per_image import binclf_curve +from anomalib.metrics.pimo.binary_classification_curve import ( + _binary_classification_curve, + binary_classification_curve, + per_image_fpr, + per_image_tpr, + threshold_and_binary_classification_curve, +) def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: @@ -294,7 +300,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: def test__binclf_one_curve(pred: torch.Tensor, gt: torch.Tensor, threshs: torch.Tensor, expected: torch.Tensor) -> None: """Test if 
`_binclf_one_curve()` returns the expected values.""" - computed = binclf_curve._binclf_one_curve(pred, gt, threshs) + computed = _binary_classification_curve(pred, gt, threshs) assert computed.shape == (threshs.numel(), 2, 2) assert (computed == expected.numpy()).all() @@ -306,7 +312,7 @@ def test__binclf_multiple_curves( expecteds: torch.Tensor, ) -> None: """Test if `_binclf_multiple_curves()` returns the expected values.""" - computed = binclf_curve.binclf_multiple_curves(preds, gts, threshs) + computed = binary_classification_curve(preds, gts, threshs) assert computed.shape == (preds.shape[0], threshs.numel(), 2, 2) assert (computed == expecteds).all() @@ -322,7 +328,7 @@ def test_binclf_multiple_curves( expected_binclf_curves: torch.Tensor, ) -> None: """Test if `binclf_multiple_curves()` returns the expected values.""" - computed = binclf_curve.binclf_multiple_curves( + computed = binary_classification_curve( preds, gts, threshs, @@ -331,26 +337,26 @@ def test_binclf_multiple_curves( assert (computed == expected_binclf_curves).all() # it's ok to have the threhsholds beyond the range of the preds - binclf_curve.binclf_multiple_curves(preds, gts, 2 * threshs) + binary_classification_curve(preds, gts, 2 * threshs) # or inside the bounds without reaching them - binclf_curve.binclf_multiple_curves(preds, gts, 0.5 * threshs) + binary_classification_curve(preds, gts, 0.5 * threshs) # it's also ok to have more threshs than unique values in the preds # add the values in between the threshs threshs_unncessary = 0.5 * (threshs[:-1] + threshs[1:]) threshs_unncessary = torch.concatenate([threshs_unncessary, threshs]) threshs_unncessary = torch.sort(threshs_unncessary)[0] - binclf_curve.binclf_multiple_curves(preds, gts, threshs_unncessary) + binary_classification_curve(preds, gts, threshs_unncessary) # or less - binclf_curve.binclf_multiple_curves(preds, gts, threshs[1:3]) + binary_classification_curve(preds, gts, threshs[1:3]) def test_binclf_multiple_curves_validations(args: list, kwargs: dict, exception: Exception) -> None: """Test if `_binclf_multiple_curves_python()` raises the expected errors.""" with pytest.raises(exception): - binclf_curve.binclf_multiple_curves(*args, **kwargs) + binary_classification_curve(*args, **kwargs) def test_per_image_binclf_curve( @@ -363,7 +369,7 @@ def test_per_image_binclf_curve( expected_binclf_curves: torch.Tensor, ) -> None: """Test if `per_image_binclf_curve()` returns the expected values.""" - computed_threshs, computed_binclf_curves = binclf_curve.per_image_binclf_curve( + computed_threshs, computed_binclf_curves = threshold_and_binary_classification_curve( anomaly_maps, masks, threshs_choice=threshs_choice, @@ -385,7 +391,7 @@ def test_per_image_binclf_curve( def test_per_image_binclf_curve_validations(args: list, kwargs: dict, exception: Exception) -> None: """Test if `per_image_binclf_curve()` raises the expected errors.""" with pytest.raises(exception): - binclf_curve.per_image_binclf_curve(*args, **kwargs) + threshold_and_binary_classification_curve(*args, **kwargs) def test_per_image_binclf_curve_validations_alt(args: list, kwargs: dict, exception: Exception) -> None: @@ -399,8 +405,8 @@ def test_rate_metrics( expected_tprs: torch.Tensor, ) -> None: """Test if rate metrics are computed correctly.""" - tprs = binclf_curve.per_image_tpr(binclf_curves) - fprs = binclf_curve.per_image_fpr(binclf_curves) + tprs = per_image_tpr(binclf_curves) + fprs = per_image_fpr(binclf_curves) assert tprs.shape == expected_tprs.shape assert fprs.shape == 
expected_fprs.shape diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/pimo/test_pimo.py similarity index 98% rename from tests/unit/metrics/per_image/test_pimo.py rename to tests/unit/metrics/pimo/test_pimo.py index a678bda430..dc40abbb5f 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/pimo/test_pimo.py @@ -13,8 +13,7 @@ import torch from torch import Tensor -from anomalib.metrics.per_image import functional, pimo -from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult +from anomalib.metrics.pimo import AUPIMOResult, PIMOResult, functional, pimo def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
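
Below is a minimal sketch of how calling code might exercise the renamed APIs from this diff: the `extensions` keyword of `validate_path` (formerly `accepted_extensions`) and the `anomalib.metrics.pimo` package (formerly `anomalib.metrics.per_image`), whose `threshold_and_binary_classification_curve`, `per_image_fpr`, `per_image_tpr`, and `images_classes_from_masks` replace the old `binclf_curve` entry points; `AUPIMO` and `PIMO` remain re-exported from `anomalib.metrics` via the renamed subpackage. The synthetic tensors, the `num_threshs=100` value, and the file name are illustrative assumptions, not part of the change; shapes follow the (N, H, W) convention documented above.

    import torch

    from anomalib.data.utils.path import validate_path
    from anomalib.metrics.pimo.binary_classification_curve import (
        per_image_fpr,
        per_image_tpr,
        threshold_and_binary_classification_curve,
    )
    from anomalib.metrics.pimo.utils import images_classes_from_masks

    # Path validation: the keyword is now `extensions` instead of `accepted_extensions`;
    # a mismatched suffix raises ValueError, as exercised in the updated test above.
    validated = validate_path("file.json", should_exist=False, extensions=(".json", ".txt"))

    # Synthetic (N, H, W) anomaly score maps and binary ground-truth masks (illustrative only).
    anomaly_maps = torch.rand(4, 32, 32)
    masks = torch.rand(4, 32, 32) > 0.95

    # Formerly `binclf_curve.per_image_binclf_curve`: returns the shared thresholds (K,)
    # and one binary classification matrix per image per threshold (N, K, 2, 2).
    threshs, binclf_curves = threshold_and_binary_classification_curve(
        anomaly_maps,
        masks,
        num_threshs=100,
    )
    assert threshs.shape == (100,)
    assert binclf_curves.shape == (4, 100, 2, 2)

    # Per-image rate curves of shape (N, K), and image-level classes deduced from the masks.
    tprs = per_image_tpr(binclf_curves)
    fprs = per_image_fpr(binclf_curves)
    image_classes = images_classes_from_masks(masks)
    assert tprs.shape == fprs.shape == (4, 100)
    assert image_classes.shape == (4,)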