Skip to content

Commit 41a992c

Browse files
mvanwykclaude
andcommitted
fix: address PR review comments
- Add full Google-style docstring to _normalize_datetime (#10) - Adopt ColumnHelper in test_acv.py for consistency (#7) - Rename misleading test_within_group_with_user_column_named_total_stores to test_within_group_ignores_unrelated_extra_columns (#4) - Add group_col/within_group examples to docs/metrics.md (#9) - Fix PR description: product_col is str | None, not list (#2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 69f8a5d commit 41a992c

4 files changed

Lines changed: 63 additions & 28 deletions

File tree

docs/metrics.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,21 @@ print(pct.df)
6565
# 1 502 2 50.0
6666
# 2 503 1 25.0
6767
```
68+
69+
Use `group_col` to add extra grouping dimensions, and `within_group=True` to compute the
70+
percentage relative to stores within each group rather than all stores:
71+
72+
```python
73+
df = pd.DataFrame({
74+
"store_id": [10, 20, 30, 40, 10],
75+
"product_id": [501, 501, 502, 502, 502],
76+
"region": ["North", "North", "South", "South", "North"],
77+
"unit_spend": [5.99, 3.49, 4.00, 6.00, 2.50],
78+
})
79+
80+
# Percentage relative to all stores (default)
81+
pct = PctOfStores(df, group_col="region")
82+
83+
# Percentage relative to stores within each region
84+
pct_within = PctOfStores(df, group_col="region", within_group=True)
85+
```

pyretailscience/utils/date.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,21 @@
99

1010

1111
def _normalize_datetime(date_val: datetime | str) -> datetime:
12-
"""Convert string or datetime to timezone-aware datetime object."""
12+
"""Convert a string or datetime to a timezone-aware datetime object.
13+
14+
Strings are parsed as ``%Y-%m-%d`` and localized to UTC. Naive datetimes
15+
are made aware by attaching UTC. Already-aware datetimes are returned
16+
unchanged.
17+
18+
Args:
19+
date_val (datetime | str): A date string (``YYYY-MM-DD``) or datetime to normalize.
20+
21+
Returns:
22+
datetime: A timezone-aware datetime in UTC.
23+
24+
Raises:
25+
TypeError: If *date_val* is neither a ``str`` nor a ``datetime``.
26+
"""
1327
if isinstance(date_val, str):
1428
# Convert string to timezone-aware datetime
1529
return datetime.strptime(date_val, "%Y-%m-%d").replace(tzinfo=timezone.utc)

tests/metrics/distribution/test_acv.py

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
from pandas.testing import assert_frame_equal
88

99
from pyretailscience.metrics.distribution.acv import Acv
10+
from pyretailscience.options import ColumnHelper
11+
12+
cols = ColumnHelper()
1013

1114

1215
class TestAcv:
@@ -16,10 +19,10 @@ def test_acv_total_no_grouping(self):
1619
"""Test total ACV across all transactions without grouping."""
1720
df = pd.DataFrame(
1821
{
19-
"customer_id": [1, 2, 3, 1, 2],
20-
"store_id": [101, 101, 102, 102, 103],
21-
"product_id": [10, 20, 30, 40, 50],
22-
"unit_spend": [500_000.0, 750_000.0, 300_000.0, 600_000.0, 350_000.0],
22+
cols.customer_id: [1, 2, 3, 1, 2],
23+
cols.store_id: [101, 101, 102, 102, 103],
24+
cols.product_id: [10, 20, 30, 40, 50],
25+
cols.unit_spend: [500_000.0, 750_000.0, 300_000.0, 600_000.0, 350_000.0],
2326
}
2427
)
2528
result = Acv(df).df
@@ -31,15 +34,15 @@ def test_acv_grouped_by_store(self, input_type):
3134
"""Test ACV grouped by store returns correct per-store values for both input types."""
3235
pdf = pd.DataFrame(
3336
{
34-
"store_id": [101, 101, 102, 102, 103],
35-
"unit_spend": [400_000.0, 600_000.0, 300_000.0, 200_000.0, 500_000.0],
37+
cols.store_id: [101, 101, 102, 102, 103],
38+
cols.unit_spend: [400_000.0, 600_000.0, 300_000.0, 200_000.0, 500_000.0],
3639
}
3740
)
3841
df = ibis.memtable(pdf) if input_type == "ibis" else pdf
39-
result = Acv(df, group_col="store_id").df.sort_values("store_id").reset_index(drop=True)
42+
result = Acv(df, group_col=cols.store_id).df.sort_values(cols.store_id).reset_index(drop=True)
4043
expected = pd.DataFrame(
4144
{
42-
"store_id": [101, 102, 103],
45+
cols.store_id: [101, 102, 103],
4346
"acv": [1.0, 0.5, 0.5],
4447
}
4548
)
@@ -49,15 +52,15 @@ def test_acv_group_col_list(self):
4952
"""Test ACV grouped by multiple columns."""
5053
df = pd.DataFrame(
5154
{
52-
"store_id": [101, 101, 102],
55+
cols.store_id: [101, 101, 102],
5356
"region": ["North", "North", "South"],
54-
"unit_spend": [1_000_000.0, 500_000.0, 2_000_000.0],
57+
cols.unit_spend: [1_000_000.0, 500_000.0, 2_000_000.0],
5558
}
5659
)
57-
result = Acv(df, group_col=["store_id", "region"]).df.sort_values("store_id").reset_index(drop=True)
60+
result = Acv(df, group_col=[cols.store_id, "region"]).df.sort_values(cols.store_id).reset_index(drop=True)
5861
expected = pd.DataFrame(
5962
{
60-
"store_id": [101, 102],
63+
cols.store_id: [101, 102],
6164
"region": ["North", "South"],
6265
"acv": [1.5, 2.0],
6366
}
@@ -68,37 +71,37 @@ def test_acv_with_nan_values(self):
6871
"""Test that NaN values are excluded from the ACV sum."""
6972
df = pd.DataFrame(
7073
{
71-
"store_id": [101, 101, 102],
72-
"unit_spend": [1_000_000.0, np.nan, 500_000.0],
74+
cols.store_id: [101, 101, 102],
75+
cols.unit_spend: [1_000_000.0, np.nan, 500_000.0],
7376
}
7477
)
75-
result = Acv(df, group_col="store_id").df.sort_values("store_id").reset_index(drop=True)
78+
result = Acv(df, group_col=cols.store_id).df.sort_values(cols.store_id).reset_index(drop=True)
7679
expected = pd.DataFrame(
7780
{
78-
"store_id": [101, 102],
81+
cols.store_id: [101, 102],
7982
"acv": [1.0, 0.5],
8083
}
8184
)
8285
assert_frame_equal(result, expected)
8386

8487
def test_acv_missing_column_raises(self):
8588
"""Test that missing unit_spend column raises ValueError."""
86-
df = pd.DataFrame({"customer_id": [1, 2], "store_id": [101, 102]})
89+
df = pd.DataFrame({cols.customer_id: [1, 2], cols.store_id: [101, 102]})
8790
with pytest.raises(ValueError, match="missing"):
8891
Acv(df)
8992

90-
def test_acv_missing_group_col_column_raises(self):
93+
def test_acv_missing_group_col_raises(self):
9194
"""Test that missing group_col column raises ValueError."""
92-
df = pd.DataFrame({"unit_spend": [100.0, 200.0]})
95+
df = pd.DataFrame({cols.unit_spend: [100.0, 200.0]})
9396
with pytest.raises(ValueError, match="missing"):
94-
Acv(df, group_col="store_id")
97+
Acv(df, group_col=cols.store_id)
9598

9699
def test_acv_custom_scale_factor(self):
97100
"""Test ACV with a custom scale factor."""
98101
df = pd.DataFrame(
99102
{
100-
"store_id": [101, 102],
101-
"unit_spend": [5_000.0, 10_000.0],
103+
cols.store_id: [101, 102],
104+
cols.unit_spend: [5_000.0, 10_000.0],
102105
}
103106
)
104107
result = Acv(df, acv_scale_factor=1_000).df
@@ -108,11 +111,11 @@ def test_acv_custom_scale_factor(self):
108111
@pytest.mark.parametrize("scale_factor", [0, -1_000])
109112
def test_acv_non_positive_scale_factor_raises(self, scale_factor):
110113
"""Test that zero or negative acv_scale_factor raises ValueError."""
111-
df = pd.DataFrame({"unit_spend": [500_000.0, 1_000_000.0]})
114+
df = pd.DataFrame({cols.unit_spend: [500_000.0, 1_000_000.0]})
112115
with pytest.raises(ValueError, match="acv_scale_factor must be positive"):
113116
Acv(df, acv_scale_factor=scale_factor)
114117

115118
def test_acv_invalid_type_raises(self):
116119
"""Test that passing a non-DataFrame/Table raises TypeError."""
117120
with pytest.raises(TypeError, match="pandas DataFrame or an Ibis Table"):
118-
Acv({"unit_spend": [100.0]})
121+
Acv({cols.unit_spend: [100.0]})

tests/metrics/distribution/test_pct_of_stores.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -220,14 +220,14 @@ def test_within_group_ignored_without_group_col(self):
220220
result_without = PctOfStores(df, within_group=False).df.sort_values(cols.product_id).reset_index(drop=True)
221221
assert_frame_equal(result_with, result_without)
222222

223-
def test_within_group_with_user_column_named_total_stores(self):
224-
"""Test that a user column named _total_stores does not collide with internal temp column."""
223+
def test_within_group_ignores_unrelated_extra_columns(self):
224+
"""Test that extra columns in the input don't affect within_group computation."""
225225
df = pd.DataFrame(
226226
{
227227
cols.store_id: [10, 20, 30, 40, 10],
228228
cols.product_id: [501, 501, 502, 502, 502],
229229
"region": ["North", "North", "South", "South", "North"],
230-
"_total_stores": [100, 200, 300, 400, 100],
230+
"some_metric": [100, 200, 300, 400, 100],
231231
cols.unit_spend: [5.99, 3.49, 4.00, 6.00, 2.50],
232232
}
233233
)

0 commit comments

Comments
 (0)