feat: add % of Stores (numeric distribution) metric

mvanwyk · claude · mvanwyk · commit 0f00d570105a · 2026-04-05T16:39:17.000+02:00
Add PctOfStores class that computes the percentage of stores selling
each product. Includes ratio_metric utility in metrics/base.py for
safe division, returning NULL (NaN in pandas) on a zero denominator.

Also refactors ACV to use the updated conventions: group_by → group_col,
keyword-only params, unconditional validate_columns, and input handling
before parameter validation. Updates docs, CLAUDE.md, and ColumnHelper;
drops numpy/pandas from utils/date.py, where find_overlapping_periods now
raises TypeError on mismatched timezone awareness; and consolidates
duplicate tests across date and options modules.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -98,6 +98,8 @@ fails at runtime, and only "green" when it passes at runtime.
 - Include boundary/edge case tests for threshold values, limits, and special cases
 - When testing against expected values (colors, formats, etc.), reference package constants rather than hardcoding
   values in tests
+- Use `ColumnHelper` for column names in test DataFrames (e.g., `cols.store_id`, `cols.customer_id`) instead of
+  hardcoding string literals like `"store_id"`. This keeps tests decoupled from the current option defaults.
 - Use pytest fixtures for shared test data setup to improve readability and reduce duplication
 
 ### Anti-Patterns to Avoid
diff --git a/docs/api/metrics/distribution.md b/docs/api/metrics/distribution.md
@@ -1,3 +1,5 @@
 # Distribution Metrics
 
 ::: pyretailscience.metrics.distribution.acv
+
+::: pyretailscience.metrics.distribution.pct_of_stores
diff --git a/docs/metrics.md b/docs/metrics.md
@@ -29,10 +29,39 @@ df = pd.DataFrame({
     "unit_spend": [400_000, 600_000, 300_000, 200_000, 500_000],
 })
 
-acv = Acv(df, group_by="store_id")
+acv = Acv(df, group_col="store_id")
 print(acv.df)
 #    store_id  acv
 # 0       101  1.0
 # 1       102  0.5
 # 2       103  0.5
 ```
+
+### % of Stores (Numeric Distribution)
+
+% of Stores measures the share of total stores in the dataset that sell a given product. Every store counts equally
+regardless of its sales volume. It answers the question: "What fraction of stores carry this product?"
+
+$$
+\%\ \text{of Stores} = \frac{\text{COUNT(DISTINCT stores selling product)}}{\text{COUNT(DISTINCT all stores)}} \times 100
+$$
+
+Example:
+
+```python
+import pandas as pd
+from pyretailscience.metrics.distribution.pct_of_stores import PctOfStores
+
+df = pd.DataFrame({
+    "store_id": [10, 20, 20, 30, 40],
+    "product_id": [501, 501, 502, 502, 503],
+    "unit_spend": [5.99, 3.49, 4.00, 6.00, 2.50],
+})
+
+pct = PctOfStores(df)
+print(pct.df)
+#    product_id  stores  stores_pct
+# 0         501       2        50.0
+# 1         502       2        50.0
+# 2         503       1        25.0
+```
diff --git a/pyretailscience/metrics/base.py b/pyretailscience/metrics/base.py
@@ -0,0 +1,29 @@
+"""Shared ibis expression helpers for metric calculations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import ibis.expr.types as ir
+
+PERCENTAGE_SCALE = 100
+
+
+def ratio_metric(
+    numerator: ir.NumericValue,
+    denominator: ir.NumericValue,
+    scale: float = PERCENTAGE_SCALE,
+) -> ir.FloatingValue:
+    """Computes a scaled ratio, returning NULL on zero denominator.
+
+    Args:
+        numerator (ir.NumericValue): The numerator ibis expression.
+        denominator (ir.NumericValue): The denominator ibis expression.
+        scale (float, optional): Multiplicative scale factor. Defaults to 100 for percentages.
+
+    Returns:
+        ir.FloatingValue: The scaled ratio expression. Returns NULL (NaN in pandas)
+            when denominator is zero.
+    """
+    return numerator / denominator.nullif(0) * scale
diff --git a/pyretailscience/metrics/distribution/acv.py b/pyretailscience/metrics/distribution/acv.py
@@ -25,7 +25,7 @@ class Acv:
 
     Args:
         df (pd.DataFrame | ibis.Table): Transaction data containing at least a unit_spend column.
-        group_by (str | list[str] | None, optional): Optional column(s) to group the ACV calculation by
+        group_col (str | list[str] | None, optional): Optional column(s) to group the ACV calculation by
             (e.g., store_id). Defaults to None for total ACV.
         acv_scale_factor (float, optional): Factor to scale the ACV result (default is 1,000,000 for $MM).
 
@@ -37,33 +37,34 @@ class Acv:
     def __init__(
         self,
         df: pd.DataFrame | ibis.Table,
-        group_by: str | list[str] | None = None,
+        *,
+        group_col: str | list[str] | None = None,
         acv_scale_factor: float = 1_000_000,
     ) -> None:
         """Initializes the ACV calculation."""
         self._df: pd.DataFrame | None = None
         self.table: ibis.Table
 
-        if acv_scale_factor <= 0:
-            raise ValueError("acv_scale_factor must be positive.")
-
         if isinstance(df, pd.DataFrame):
             df = ibis.memtable(df)
         elif not isinstance(df, ibis.Table):
             raise TypeError("df must be either a pandas DataFrame or an Ibis Table.")
 
+        if acv_scale_factor <= 0:
+            raise ValueError("acv_scale_factor must be positive.")
+
         unit_spend_col = get_option("column.unit_spend")
 
-        if isinstance(group_by, str):
-            group_by = [group_by]
+        if isinstance(group_col, str):
+            group_col = [group_col]
 
         required_cols = [unit_spend_col]
-        if group_by is not None:
-            required_cols.extend(group_by)
-            validate_columns(df, required_cols)
-            df = df.group_by(group_by)
-        else:
-            validate_columns(df, required_cols)
+        if group_col is not None:
+            required_cols.extend(group_col)
+        validate_columns(df, required_cols)
+
+        if group_col is not None:
+            df = df.group_by(group_col)
 
         self.table = df.aggregate(acv=_[unit_spend_col].sum() / acv_scale_factor)
 
diff --git a/pyretailscience/metrics/distribution/pct_of_stores.py b/pyretailscience/metrics/distribution/pct_of_stores.py
@@ -0,0 +1,114 @@
+"""% of Stores (Numeric Distribution) metric.
+
+% of Stores measures the share of total stores in the dataset that sell a given product.
+Every store counts equally regardless of its sales volume.
+"""
+
+from __future__ import annotations
+
+import ibis
+import pandas as pd
+from ibis import _
+
+from pyretailscience.metrics.base import ratio_metric
+from pyretailscience.options import ColumnHelper, get_option
+from pyretailscience.utils.validation import validate_columns
+
+_TEMP_TOTAL_STORES = "__prs_temp_total_stores__"
+
+
+class PctOfStores:
+    """Calculates the percentage of stores selling each product.
+
+    This is the simplest, unweighted distribution metric (numeric distribution).
+    It answers the question: "What fraction of stores carry this product?"
+
+    Results are accessible via the ``table`` attribute (ibis Table) or the ``df`` property
+    (materialized pandas DataFrame).
+
+    Args:
+        df (pd.DataFrame | ibis.Table): Transaction-level data containing at least
+            store_id and product_id columns.
+        product_col (str | None, optional): Column defining product granularity.
+            Defaults to ``get_option("column.product_id")``.
+        group_col (str | list[str] | None, optional): Additional grouping dimensions
+            (e.g., ``"category_0_name"``). Defaults to None.
+        within_group (bool, optional): Controls the denominator when ``group_col`` is specified.
+            When ``False`` (default), the percentage is relative to all stores in the dataset.
+            When ``True``, the percentage is relative to stores within each group independently.
+            Has no effect when ``group_col`` is None. Defaults to False.
+
+    Raises:
+        TypeError: If df is not a pandas DataFrame or an Ibis Table.
+        ValueError: If required columns are missing from the data, or if product_col
+            appears in group_col.
+    """
+
+    def __init__(
+        self,
+        df: pd.DataFrame | ibis.Table,
+        *,
+        product_col: str | None = None,
+        group_col: str | list[str] | None = None,
+        within_group: bool = False,
+    ) -> None:
+        """Initializes the % of Stores calculation."""
+        self._df: pd.DataFrame | None = None
+        self.table: ibis.Table
+
+        if isinstance(df, pd.DataFrame):
+            df = ibis.memtable(df)
+        elif not isinstance(df, ibis.Table):
+            raise TypeError("df must be either a pandas DataFrame or an Ibis Table.")
+
+        store_id_col = get_option("column.store_id")
+        product_col = product_col if product_col is not None else get_option("column.product_id")
+
+        if isinstance(group_col, str):
+            group_col = [group_col]
+
+        required_cols = [store_id_col, product_col]
+        if group_col is not None:
+            if product_col in group_col:
+                msg = f"product_col '{product_col}' must not also appear in group_col"
+                raise ValueError(msg)
+            required_cols.extend(group_col)
+        validate_columns(df, required_cols)
+
+        group_cols = [product_col]
+        if group_col is not None:
+            group_cols.extend(group_col)
+
+        store_product = df.select([store_id_col, *group_cols]).distinct()
+
+        agg_stores_col = get_option("column.agg.store_id")
+        per_group = store_product.group_by(group_cols).aggregate(
+            **{agg_stores_col: _[store_id_col].count()},
+        )
+
+        if within_group and group_col is not None:
+            total_stores = store_product.group_by(group_col).aggregate(
+                **{_TEMP_TOTAL_STORES: _[store_id_col].nunique()},
+            )
+            per_group = per_group.inner_join(total_stores, group_col)
+            denominator = _[_TEMP_TOTAL_STORES]
+        else:
+            denominator = store_product[store_id_col].nunique()
+
+        pct_stores_col = ColumnHelper.join_options("column.agg.store_id", "column.suffix.percent")
+        self.table = per_group.mutate(
+            **{pct_stores_col: ratio_metric(_[agg_stores_col], denominator)},
+        )
+        if within_group and group_col is not None:
+            self.table = self.table.drop(_TEMP_TOTAL_STORES)
+
+    @property
+    def df(self) -> pd.DataFrame:
+        """Returns the materialized pandas DataFrame of % of Stores results.
+
+        Returns:
+            pd.DataFrame: DataFrame with % of stores values. Cached after first access.
+        """
+        if self._df is None:
+            self._df = self.table.execute()
+        return self._df
diff --git a/pyretailscience/options.py b/pyretailscience/options.py
@@ -131,17 +131,17 @@ def __init__(self) -> None:
             "column.unit_price": "The name of the column containing the unit price of the product.",
             "column.unit_spend": (
                 "The name of the column containing the total spend of the products in the transaction. "
-                "ie, unit_price * units",
+                "ie, unit_price * units"
             ),
             "column.unit_cost": (
                 "The name of the column containing the total cost of the products in the transaction. "
-                "ie, single unit cost * units",
+                "ie, single unit cost * units"
             ),
             "column.promo_unit_spend": (
                 "The name of the column containing the total spend on promotion of the products in the transaction. "
-                "ie, promotional unit price * units",
+                "ie, promotional unit price * units"
             ),
-            "column.promo_unit_quantity": ("The name of the column containing the number of units sold on promotion."),
+            "column.promo_unit_quantity": "The name of the column containing the number of units sold on promotion.",
             "column.store_id": "The name of the column containing store IDs of the transaction.",
             # Aggregation columns
             "column.agg.customer_id": "The name of the column containing the number of unique customers.",
@@ -769,6 +769,7 @@ def __init__(self) -> None:
         self.transaction_time = get_option("column.transaction_time")
         self.customer_id = get_option("column.customer_id")
         self.transaction_id = get_option("column.transaction_id")
+        self.product_id = get_option("column.product_id")
         self.store_id = get_option("column.store_id")
         self.unit_spend = get_option("column.unit_spend")
         self.unit_qty = get_option("column.unit_quantity")
diff --git a/pyretailscience/utils/date.py b/pyretailscience/utils/date.py
@@ -4,8 +4,6 @@
 from datetime import datetime, timezone
 
 import ibis
-import numpy as np
-import pandas as pd
 
 from pyretailscience.options import get_option
 
@@ -24,6 +22,26 @@ def _normalize_datetime(date_val: datetime | str) -> datetime:
     raise TypeError(error_msg)
 
 
+def _is_naive(d: datetime | str) -> bool:
+    """Check whether a datetime-like input is timezone-naive.
+
+    Args:
+        d (datetime | str): A datetime object or date string to check.
+
+    Returns:
+        bool: True if the input is a string or a naive datetime, False if tz-aware.
+
+    Raises:
+        TypeError: If d is not a str or datetime instance.
+    """
+    if isinstance(d, str):
+        return True
+    if isinstance(d, datetime):
+        return d.tzinfo is None
+    msg = f"Expected str or datetime, got {type(d)}"
+    raise TypeError(msg)
+
+
 def _validate_and_normalize_periods(
     period_ranges: Mapping[str, tuple[datetime | str, datetime | str]],
 ) -> dict[str, tuple[datetime, datetime]]:
@@ -149,11 +167,18 @@ def find_overlapping_periods(
         String inputs produce naive datetime outputs.
 
     Raises:
+        TypeError: If start_date and end_date have mismatched timezone awareness
+            (one naive or string and one timezone-aware, or vice versa).
         ValueError: If the start date is after the end date.
     """
     # Track whether outputs should be tz-naive to preserve backward compatibility.
     # String inputs and naive datetime inputs both produced naive outputs before.
-    input_is_naive = isinstance(start_date, str) or start_date.tzinfo is None
+    start_is_naive = _is_naive(start_date)
+    end_is_naive = _is_naive(end_date)
+
+    if start_is_naive != end_is_naive:
+        msg = "start_date and end_date must have matching timezone awareness. Got naive and aware (or vice versa)."
+        raise TypeError(msg)
 
     start_date = _normalize_datetime(start_date)
     end_date = _normalize_datetime(end_date)
@@ -166,25 +191,22 @@ def find_overlapping_periods(
     if start_year == end_year:
         return []
 
-    years = np.arange(start_year, end_year)
+    if start_is_naive:
+        output_tz = None
+        start_date = start_date.replace(tzinfo=None)
+    else:
+        output_tz = start_date.tzinfo
+
+    years = range(start_year, end_year)
 
     period_starts = [
-        start_date if year == start_year else datetime(year, start_month, start_day, tzinfo=timezone.utc)
-        for year in years
+        start_date if year == start_year else datetime(year, start_month, start_day, tzinfo=output_tz) for year in years
     ]
-    period_ends = [datetime(year + 1, end_month, end_day, tzinfo=timezone.utc) for year in years]
+    period_ends = [datetime(year + 1, end_month, end_day, tzinfo=output_tz) for year in years]
 
-    df = pd.DataFrame({"start": period_starts, "end": period_ends})
+    pairs = list(zip(period_starts, period_ends, strict=True))
 
     if return_str:
-        return [
-            (start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
-            for start, end in zip(df["start"], df["end"], strict=False)
-        ]
-
-    if input_is_naive:
-        return [
-            (start.replace(tzinfo=None), end.replace(tzinfo=None))
-            for start, end in zip(df["start"], df["end"], strict=False)
-        ]
-    return list(zip(df["start"], df["end"], strict=False))
+        return [(start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")) for start, end in pairs]
+
+    return pairs
diff --git a/tests/metrics/distribution/test_acv.py b/tests/metrics/distribution/test_acv.py
diff --git a/tests/metrics/distribution/test_pct_of_stores.py b/tests/metrics/distribution/test_pct_of_stores.py
diff --git a/tests/test_options.py b/tests/test_options.py
diff --git a/tests/utils/test_date.py b/tests/utils/test_date.py