Skip to content

Commit 0f00d57

Browse files
mvanwyk and claude committed
feat: add % of Stores (numeric distribution) metric
Add PctOfStores class that computes the percentage of stores selling each product. Includes ratio_metric utility in metrics/base.py for safe division with NaN on zero denominator. Also refactors ACV to use the updated conventions: group_by → group_col, keyword-only params, unconditional validate_columns, and input handling before parameter validation. Updates docs, CLAUDE.md, ColumnHelper, and consolidates duplicate tests across date and options modules. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent fba1d70 commit 0f00d57

12 files changed

Lines changed: 637 additions & 180 deletions

File tree

CLAUDE.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -98,6 +98,8 @@ fails at runtime, and only "green" when it passes at runtime.
9898
- Include boundary/edge case tests for threshold values, limits, and special cases
9999
- When testing against expected values (colors, formats, etc.), reference package constants rather than hardcoding
100100
values in tests
101+
- Use `ColumnHelper` for column names in test DataFrames (e.g., `cols.store_id`, `cols.customer_id`) instead of
102+
hardcoding string literals like `"store_id"`. This keeps tests decoupled from the current option defaults.
101103
- Use pytest fixtures for shared test data setup to improve readability and reduce duplication
102104

103105
### Anti-Patterns to Avoid

docs/api/metrics/distribution.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
11
# Distribution Metrics
22

33
::: pyretailscience.metrics.distribution.acv
4+
5+
::: pyretailscience.metrics.distribution.pct_of_stores

docs/metrics.md

Lines changed: 30 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,10 +29,39 @@ df = pd.DataFrame({
2929
"unit_spend": [400_000, 600_000, 300_000, 200_000, 500_000],
3030
})
3131

32-
acv = Acv(df, group_by="store_id")
32+
acv = Acv(df, group_col="store_id")
3333
print(acv.df)
3434
# store_id acv
3535
# 0 101 1.0
3636
# 1 102 0.5
3737
# 2 103 0.5
3838
```
39+
40+
### % of Stores (Numeric Distribution)
41+
42+
% of Stores measures the share of total stores in the dataset that sell a given product. Every store counts equally
43+
regardless of its sales volume. It answers the question: "What fraction of stores carry this product?"
44+
45+
$$
46+
\%\text{Stores} = \frac{\text{COUNT(DISTINCT stores selling product)}}{\text{COUNT(DISTINCT all stores)}} \times 100
47+
$$
48+
49+
Example:
50+
51+
```python
52+
import pandas as pd
53+
from pyretailscience.metrics.distribution.pct_of_stores import PctOfStores
54+
55+
df = pd.DataFrame({
56+
"store_id": [10, 20, 20, 30, 40],
57+
"product_id": [501, 501, 502, 502, 503],
58+
"unit_spend": [5.99, 3.49, 4.00, 6.00, 2.50],
59+
})
60+
61+
pct = PctOfStores(df)
62+
print(pct.df)
63+
# product_id stores stores_pct
64+
# 0 501 2 50.0
65+
# 1 502 2 50.0
66+
# 2 503 1 25.0
67+
```

pyretailscience/metrics/base.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
1+
"""Shared ibis expression helpers for metric calculations."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
if TYPE_CHECKING:
8+
import ibis.expr.types as ir
9+
10+
PERCENTAGE_SCALE = 100
11+
12+
13+
def ratio_metric(
14+
numerator: ir.NumericValue,
15+
denominator: ir.NumericValue,
16+
scale: float = PERCENTAGE_SCALE,
17+
) -> ir.FloatingValue:
18+
"""Computes a scaled ratio, returning NULL on zero denominator.
19+
20+
Args:
21+
numerator (ir.NumericValue): The numerator ibis expression.
22+
denominator (ir.NumericValue): The denominator ibis expression.
23+
scale (float, optional): Multiplicative scale factor. Defaults to 100 for percentages.
24+
25+
Returns:
26+
ir.FloatingValue: The scaled ratio expression. Returns NULL (NaN in pandas)
27+
when denominator is zero.
28+
"""
29+
return numerator / denominator.nullif(0) * scale

pyretailscience/metrics/distribution/acv.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ class Acv:
2525
2626
Args:
2727
df (pd.DataFrame | ibis.Table): Transaction data containing at least a unit_spend column.
28-
group_by (str | list[str] | None, optional): Optional column(s) to group the ACV calculation by
28+
group_col (str | list[str] | None, optional): Optional column(s) to group the ACV calculation by
2929
(e.g., store_id). Defaults to None for total ACV.
3030
acv_scale_factor (float, optional): Factor to scale the ACV result (default is 1,000,000 for $MM).
3131
@@ -37,33 +37,34 @@ class Acv:
3737
def __init__(
3838
self,
3939
df: pd.DataFrame | ibis.Table,
40-
group_by: str | list[str] | None = None,
40+
*,
41+
group_col: str | list[str] | None = None,
4142
acv_scale_factor: float = 1_000_000,
4243
) -> None:
4344
"""Initializes the ACV calculation."""
4445
self._df: pd.DataFrame | None = None
4546
self.table: ibis.Table
4647

47-
if acv_scale_factor <= 0:
48-
raise ValueError("acv_scale_factor must be positive.")
49-
5048
if isinstance(df, pd.DataFrame):
5149
df = ibis.memtable(df)
5250
elif not isinstance(df, ibis.Table):
5351
raise TypeError("df must be either a pandas DataFrame or an Ibis Table.")
5452

53+
if acv_scale_factor <= 0:
54+
raise ValueError("acv_scale_factor must be positive.")
55+
5556
unit_spend_col = get_option("column.unit_spend")
5657

57-
if isinstance(group_by, str):
58-
group_by = [group_by]
58+
if isinstance(group_col, str):
59+
group_col = [group_col]
5960

6061
required_cols = [unit_spend_col]
61-
if group_by is not None:
62-
required_cols.extend(group_by)
63-
validate_columns(df, required_cols)
64-
df = df.group_by(group_by)
65-
else:
66-
validate_columns(df, required_cols)
62+
if group_col is not None:
63+
required_cols.extend(group_col)
64+
validate_columns(df, required_cols)
65+
66+
if group_col is not None:
67+
df = df.group_by(group_col)
6768

6869
self.table = df.aggregate(acv=_[unit_spend_col].sum() / acv_scale_factor)
6970

pyretailscience/metrics/distribution/pct_of_stores.py

Lines changed: 114 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,114 @@
1+
"""% of Stores (Numeric Distribution) metric.
2+
3+
% of Stores measures the share of total stores in the dataset that sell a given product.
4+
Every store counts equally regardless of its sales volume.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import ibis
10+
import pandas as pd
11+
from ibis import _
12+
13+
from pyretailscience.metrics.base import ratio_metric
14+
from pyretailscience.options import ColumnHelper, get_option
15+
from pyretailscience.utils.validation import validate_columns
16+
17+
_TEMP_TOTAL_STORES = "__prs_temp_total_stores__"
18+
19+
20+
class PctOfStores:
21+
"""Calculates the percentage of stores selling each product.
22+
23+
This is the simplest, unweighted distribution metric (numeric distribution).
24+
It answers the question: "What fraction of stores carry this product?"
25+
26+
Results are accessible via the ``table`` attribute (ibis Table) or the ``df`` property
27+
(materialized pandas DataFrame).
28+
29+
Args:
30+
df (pd.DataFrame | ibis.Table): Transaction-level data containing at least
31+
store_id and product_id columns.
32+
product_col (str | None, optional): Column defining product granularity.
33+
Defaults to ``get_option("column.product_id")``.
34+
group_col (str | list[str] | None, optional): Additional grouping dimensions
35+
(e.g., ``"category_0_name"``). Defaults to None.
36+
within_group (bool, optional): Controls the denominator when ``group_col`` is specified.
37+
When ``False`` (default), the percentage is relative to all stores in the dataset.
38+
When ``True``, the percentage is relative to stores within each group independently.
39+
Has no effect when ``group_col`` is None. Defaults to False.
40+
41+
Raises:
42+
TypeError: If df is not a pandas DataFrame or an Ibis Table.
43+
ValueError: If required columns are missing from the data, or if product_col
44+
appears in group_col.
45+
"""
46+
47+
def __init__(
48+
self,
49+
df: pd.DataFrame | ibis.Table,
50+
*,
51+
product_col: str | None = None,
52+
group_col: str | list[str] | None = None,
53+
within_group: bool = False,
54+
) -> None:
55+
"""Initializes the % of Stores calculation."""
56+
self._df: pd.DataFrame | None = None
57+
self.table: ibis.Table
58+
59+
if isinstance(df, pd.DataFrame):
60+
df = ibis.memtable(df)
61+
elif not isinstance(df, ibis.Table):
62+
raise TypeError("df must be either a pandas DataFrame or an Ibis Table.")
63+
64+
store_id_col = get_option("column.store_id")
65+
product_col = product_col if product_col is not None else get_option("column.product_id")
66+
67+
if isinstance(group_col, str):
68+
group_col = [group_col]
69+
70+
required_cols = [store_id_col, product_col]
71+
if group_col is not None:
72+
if product_col in group_col:
73+
msg = f"product_col '{product_col}' must not also appear in group_col"
74+
raise ValueError(msg)
75+
required_cols.extend(group_col)
76+
validate_columns(df, required_cols)
77+
78+
group_cols = [product_col]
79+
if group_col is not None:
80+
group_cols.extend(group_col)
81+
82+
store_product = df.select([store_id_col, *group_cols]).distinct()
83+
84+
agg_stores_col = get_option("column.agg.store_id")
85+
per_group = store_product.group_by(group_cols).aggregate(
86+
**{agg_stores_col: _[store_id_col].count()},
87+
)
88+
89+
if within_group and group_col is not None:
90+
total_stores = store_product.group_by(group_col).aggregate(
91+
**{_TEMP_TOTAL_STORES: _[store_id_col].nunique()},
92+
)
93+
per_group = per_group.inner_join(total_stores, group_col)
94+
denominator = _[_TEMP_TOTAL_STORES]
95+
else:
96+
denominator = store_product[store_id_col].nunique()
97+
98+
pct_stores_col = ColumnHelper.join_options("column.agg.store_id", "column.suffix.percent")
99+
self.table = per_group.mutate(
100+
**{pct_stores_col: ratio_metric(_[agg_stores_col], denominator)},
101+
)
102+
if within_group and group_col is not None:
103+
self.table = self.table.drop(_TEMP_TOTAL_STORES)
104+
105+
@property
106+
def df(self) -> pd.DataFrame:
107+
"""Returns the materialized pandas DataFrame of % of Stores results.
108+
109+
Returns:
110+
pd.DataFrame: DataFrame with % of stores values. Cached after first access.
111+
"""
112+
if self._df is None:
113+
self._df = self.table.execute()
114+
return self._df

pyretailscience/options.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -131,17 +131,17 @@ def __init__(self) -> None:
131131
"column.unit_price": "The name of the column containing the unit price of the product.",
132132
"column.unit_spend": (
133133
"The name of the column containing the total spend of the products in the transaction. "
134-
"ie, unit_price * units",
134+
"ie, unit_price * units"
135135
),
136136
"column.unit_cost": (
137137
"The name of the column containing the total cost of the products in the transaction. "
138-
"ie, single unit cost * units",
138+
"ie, single unit cost * units"
139139
),
140140
"column.promo_unit_spend": (
141141
"The name of the column containing the total spend on promotion of the products in the transaction. "
142-
"ie, promotional unit price * units",
142+
"ie, promotional unit price * units"
143143
),
144-
"column.promo_unit_quantity": ("The name of the column containing the number of units sold on promotion."),
144+
"column.promo_unit_quantity": "The name of the column containing the number of units sold on promotion.",
145145
"column.store_id": "The name of the column containing store IDs of the transaction.",
146146
# Aggregation columns
147147
"column.agg.customer_id": "The name of the column containing the number of unique customers.",
@@ -769,6 +769,7 @@ def __init__(self) -> None:
769769
self.transaction_time = get_option("column.transaction_time")
770770
self.customer_id = get_option("column.customer_id")
771771
self.transaction_id = get_option("column.transaction_id")
772+
self.product_id = get_option("column.product_id")
772773
self.store_id = get_option("column.store_id")
773774
self.unit_spend = get_option("column.unit_spend")
774775
self.unit_qty = get_option("column.unit_quantity")

pyretailscience/utils/date.py

Lines changed: 41 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,6 @@
44
from datetime import datetime, timezone
55

66
import ibis
7-
import numpy as np
8-
import pandas as pd
97

108
from pyretailscience.options import get_option
119

@@ -24,6 +22,26 @@ def _normalize_datetime(date_val: datetime | str) -> datetime:
2422
raise TypeError(error_msg)
2523

2624

25+
def _is_naive(d: datetime | str) -> bool:
26+
"""Check whether a datetime-like input is timezone-naive.
27+
28+
Args:
29+
d (datetime | str): A datetime object or date string to check.
30+
31+
Returns:
32+
bool: True if the input is a string or a naive datetime, False if tz-aware.
33+
34+
Raises:
35+
TypeError: If d is not a str or datetime instance.
36+
"""
37+
if isinstance(d, str):
38+
return True
39+
if isinstance(d, datetime):
40+
return d.tzinfo is None
41+
msg = f"Expected str or datetime, got {type(d)}"
42+
raise TypeError(msg)
43+
44+
2745
def _validate_and_normalize_periods(
2846
period_ranges: Mapping[str, tuple[datetime | str, datetime | str]],
2947
) -> dict[str, tuple[datetime, datetime]]:
@@ -149,11 +167,18 @@ def find_overlapping_periods(
149167
String inputs produce naive datetime outputs.
150168
151169
Raises:
170+
TypeError: If start_date and end_date have mismatched timezone awareness
171+
(one naive or string and one timezone-aware, or vice versa).
152172
ValueError: If the start date is after the end date.
153173
"""
154174
# Track whether outputs should be tz-naive to preserve backward compatibility.
155175
# String inputs and naive datetime inputs both produced naive outputs before.
156-
input_is_naive = isinstance(start_date, str) or start_date.tzinfo is None
176+
start_is_naive = _is_naive(start_date)
177+
end_is_naive = _is_naive(end_date)
178+
179+
if start_is_naive != end_is_naive:
180+
msg = "start_date and end_date must have matching timezone awareness. Got naive and aware (or vice versa)."
181+
raise TypeError(msg)
157182

158183
start_date = _normalize_datetime(start_date)
159184
end_date = _normalize_datetime(end_date)
@@ -166,25 +191,22 @@ def find_overlapping_periods(
166191
if start_year == end_year:
167192
return []
168193

169-
years = np.arange(start_year, end_year)
194+
if start_is_naive:
195+
output_tz = None
196+
start_date = start_date.replace(tzinfo=None)
197+
else:
198+
output_tz = start_date.tzinfo
199+
200+
years = range(start_year, end_year)
170201

171202
period_starts = [
172-
start_date if year == start_year else datetime(year, start_month, start_day, tzinfo=timezone.utc)
173-
for year in years
203+
start_date if year == start_year else datetime(year, start_month, start_day, tzinfo=output_tz) for year in years
174204
]
175-
period_ends = [datetime(year + 1, end_month, end_day, tzinfo=timezone.utc) for year in years]
205+
period_ends = [datetime(year + 1, end_month, end_day, tzinfo=output_tz) for year in years]
176206

177-
df = pd.DataFrame({"start": period_starts, "end": period_ends})
207+
pairs = list(zip(period_starts, period_ends, strict=True))
178208

179209
if return_str:
180-
return [
181-
(start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
182-
for start, end in zip(df["start"], df["end"], strict=False)
183-
]
184-
185-
if input_is_naive:
186-
return [
187-
(start.replace(tzinfo=None), end.replace(tzinfo=None))
188-
for start, end in zip(df["start"], df["end"], strict=False)
189-
]
190-
return list(zip(df["start"], df["end"], strict=False))
210+
return [(start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")) for start, end in pairs]
211+
212+
return pairs

0 commit comments

Comments
 (0)