fix: address PR review comments

mvanwyk · claude · mvanwyk · commit 41a992cee268 · 2026-04-05T15:33:33.000+02:00
- Add full Google-style docstring to _normalize_datetime (#10)
- Adopt ColumnHelper in test_acv.py for consistency (#7)
- Rename misleading test_within_group_with_user_column_named_total_stores to
  test_within_group_ignores_unrelated_extra_columns (#4)
- Add group_col/within_group examples to docs/metrics.md (#9)
- Fix PR description: product_col is str | None, not list (#2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/docs/metrics.md b/docs/metrics.md
@@ -65,3 +65,21 @@ print(pct.df)
 # 1         502       2        50.0
 # 2         503       1        25.0
 ```
+
+Use `group_col` to add extra grouping dimensions, and `within_group=True` to compute the
+percentage relative to stores within each group rather than all stores:
+
+```python
+df = pd.DataFrame({
+    "store_id": [10, 20, 30, 40, 10],
+    "product_id": [501, 501, 502, 502, 502],
+    "region": ["North", "North", "South", "South", "North"],
+    "unit_spend": [5.99, 3.49, 4.00, 6.00, 2.50],
+})
+
+# Percentage relative to all stores (default)
+pct = PctOfStores(df, group_col="region")
+
+# Percentage relative to stores within each region
+pct_within = PctOfStores(df, group_col="region", within_group=True)
+```
diff --git a/pyretailscience/utils/date.py b/pyretailscience/utils/date.py
@@ -9,7 +9,21 @@
 
 
 def _normalize_datetime(date_val: datetime | str) -> datetime:
-    """Convert string or datetime to timezone-aware datetime object."""
+    """Convert a string or datetime to a timezone-aware datetime object.
+
+    Strings are parsed as ``%Y-%m-%d`` and localized to UTC.  Naive datetimes
+    are made aware by attaching UTC.  Already-aware datetimes are returned
+    unchanged.
+
+    Args:
+        date_val (datetime | str): A date string (``YYYY-MM-DD``) or datetime to normalize.
+
+    Returns:
+        datetime: A timezone-aware datetime (UTC unless the input was already aware).
+
+    Raises:
+        TypeError: If *date_val* is neither a ``str`` nor a ``datetime``.
+    """
     if isinstance(date_val, str):
         # Convert string to timezone-aware datetime
         return datetime.strptime(date_val, "%Y-%m-%d").replace(tzinfo=timezone.utc)
diff --git a/tests/metrics/distribution/test_acv.py b/tests/metrics/distribution/test_acv.py
@@ -7,6 +7,9 @@
 from pandas.testing import assert_frame_equal
 
 from pyretailscience.metrics.distribution.acv import Acv
+from pyretailscience.options import ColumnHelper
+
+cols = ColumnHelper()
 
 
 class TestAcv:
@@ -16,10 +19,10 @@ def test_acv_total_no_grouping(self):
         """Test total ACV across all transactions without grouping."""
         df = pd.DataFrame(
             {
-                "customer_id": [1, 2, 3, 1, 2],
-                "store_id": [101, 101, 102, 102, 103],
-                "product_id": [10, 20, 30, 40, 50],
-                "unit_spend": [500_000.0, 750_000.0, 300_000.0, 600_000.0, 350_000.0],
+                cols.customer_id: [1, 2, 3, 1, 2],
+                cols.store_id: [101, 101, 102, 102, 103],
+                cols.product_id: [10, 20, 30, 40, 50],
+                cols.unit_spend: [500_000.0, 750_000.0, 300_000.0, 600_000.0, 350_000.0],
             }
         )
         result = Acv(df).df
@@ -31,15 +34,15 @@ def test_acv_grouped_by_store(self, input_type):
         """Test ACV grouped by store returns correct per-store values for both input types."""
         pdf = pd.DataFrame(
             {
-                "store_id": [101, 101, 102, 102, 103],
-                "unit_spend": [400_000.0, 600_000.0, 300_000.0, 200_000.0, 500_000.0],
+                cols.store_id: [101, 101, 102, 102, 103],
+                cols.unit_spend: [400_000.0, 600_000.0, 300_000.0, 200_000.0, 500_000.0],
             }
         )
         df = ibis.memtable(pdf) if input_type == "ibis" else pdf
-        result = Acv(df, group_col="store_id").df.sort_values("store_id").reset_index(drop=True)
+        result = Acv(df, group_col=cols.store_id).df.sort_values(cols.store_id).reset_index(drop=True)
         expected = pd.DataFrame(
             {
-                "store_id": [101, 102, 103],
+                cols.store_id: [101, 102, 103],
                 "acv": [1.0, 0.5, 0.5],
             }
         )
@@ -49,15 +52,15 @@ def test_acv_group_col_list(self):
         """Test ACV grouped by multiple columns."""
         df = pd.DataFrame(
             {
-                "store_id": [101, 101, 102],
+                cols.store_id: [101, 101, 102],
                 "region": ["North", "North", "South"],
-                "unit_spend": [1_000_000.0, 500_000.0, 2_000_000.0],
+                cols.unit_spend: [1_000_000.0, 500_000.0, 2_000_000.0],
             }
         )
-        result = Acv(df, group_col=["store_id", "region"]).df.sort_values("store_id").reset_index(drop=True)
+        result = Acv(df, group_col=[cols.store_id, "region"]).df.sort_values(cols.store_id).reset_index(drop=True)
         expected = pd.DataFrame(
             {
-                "store_id": [101, 102],
+                cols.store_id: [101, 102],
                 "region": ["North", "South"],
                 "acv": [1.5, 2.0],
             }
@@ -68,37 +71,37 @@ def test_acv_with_nan_values(self):
         """Test that NaN values are excluded from the ACV sum."""
         df = pd.DataFrame(
             {
-                "store_id": [101, 101, 102],
-                "unit_spend": [1_000_000.0, np.nan, 500_000.0],
+                cols.store_id: [101, 101, 102],
+                cols.unit_spend: [1_000_000.0, np.nan, 500_000.0],
             }
         )
-        result = Acv(df, group_col="store_id").df.sort_values("store_id").reset_index(drop=True)
+        result = Acv(df, group_col=cols.store_id).df.sort_values(cols.store_id).reset_index(drop=True)
         expected = pd.DataFrame(
             {
-                "store_id": [101, 102],
+                cols.store_id: [101, 102],
                 "acv": [1.0, 0.5],
             }
         )
         assert_frame_equal(result, expected)
 
     def test_acv_missing_column_raises(self):
         """Test that missing unit_spend column raises ValueError."""
-        df = pd.DataFrame({"customer_id": [1, 2], "store_id": [101, 102]})
+        df = pd.DataFrame({cols.customer_id: [1, 2], cols.store_id: [101, 102]})
         with pytest.raises(ValueError, match="missing"):
             Acv(df)
 
-    def test_acv_missing_group_col_column_raises(self):
+    def test_acv_missing_group_col_raises(self):
         """Test that missing group_col column raises ValueError."""
-        df = pd.DataFrame({"unit_spend": [100.0, 200.0]})
+        df = pd.DataFrame({cols.unit_spend: [100.0, 200.0]})
         with pytest.raises(ValueError, match="missing"):
-            Acv(df, group_col="store_id")
+            Acv(df, group_col=cols.store_id)
 
     def test_acv_custom_scale_factor(self):
         """Test ACV with a custom scale factor."""
         df = pd.DataFrame(
             {
-                "store_id": [101, 102],
-                "unit_spend": [5_000.0, 10_000.0],
+                cols.store_id: [101, 102],
+                cols.unit_spend: [5_000.0, 10_000.0],
             }
         )
         result = Acv(df, acv_scale_factor=1_000).df
@@ -108,11 +111,11 @@ def test_acv_custom_scale_factor(self):
     @pytest.mark.parametrize("scale_factor", [0, -1_000])
     def test_acv_non_positive_scale_factor_raises(self, scale_factor):
         """Test that zero or negative acv_scale_factor raises ValueError."""
-        df = pd.DataFrame({"unit_spend": [500_000.0, 1_000_000.0]})
+        df = pd.DataFrame({cols.unit_spend: [500_000.0, 1_000_000.0]})
         with pytest.raises(ValueError, match="acv_scale_factor must be positive"):
             Acv(df, acv_scale_factor=scale_factor)
 
     def test_acv_invalid_type_raises(self):
         """Test that passing a non-DataFrame/Table raises TypeError."""
         with pytest.raises(TypeError, match="pandas DataFrame or an Ibis Table"):
-            Acv({"unit_spend": [100.0]})
+            Acv({cols.unit_spend: [100.0]})
diff --git a/tests/metrics/distribution/test_pct_of_stores.py b/tests/metrics/distribution/test_pct_of_stores.py
@@ -220,14 +220,14 @@ def test_within_group_ignored_without_group_col(self):
         result_without = PctOfStores(df, within_group=False).df.sort_values(cols.product_id).reset_index(drop=True)
         assert_frame_equal(result_with, result_without)
 
-    def test_within_group_with_user_column_named_total_stores(self):
-        """Test that a user column named _total_stores does not collide with internal temp column."""
+    def test_within_group_ignores_unrelated_extra_columns(self):
+        """Test that extra columns in the input don't affect within_group computation."""
         df = pd.DataFrame(
             {
                 cols.store_id: [10, 20, 30, 40, 10],
                 cols.product_id: [501, 501, 502, 502, 502],
                 "region": ["North", "North", "South", "South", "North"],
-                "_total_stores": [100, 200, 300, 400, 100],
+                "some_metric": [100, 200, 300, 400, 100],
                 cols.unit_spend: [5.99, 3.49, 4.00, 6.00, 2.50],
             }
         )

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,9 @@`
`7`	`7`	`from pandas.testing import assert_frame_equal`
`8`	`8`
`9`	`9`	`from pyretailscience.metrics.distribution.acv import Acv`
	`10`	`+from pyretailscience.options import ColumnHelper`
	`11`	`+`
	`12`	`+cols = ColumnHelper()`
`10`	`13`
`11`	`14`
`12`	`15`	`class TestAcv:`
`@@ -16,10 +19,10 @@ def test_acv_total_no_grouping(self):`
`16`	`19`	`"""Test total ACV across all transactions without grouping."""`
`17`	`20`	`df = pd.DataFrame(`
`18`	`21`	`{`
`19`		`- "customer_id": [1, 2, 3, 1, 2],`
`20`		`- "store_id": [101, 101, 102, 102, 103],`
`21`		`- "product_id": [10, 20, 30, 40, 50],`
`22`		`- "unit_spend": [500_000.0, 750_000.0, 300_000.0, 600_000.0, 350_000.0],`
	`22`	`+ cols.customer_id: [1, 2, 3, 1, 2],`
	`23`	`+ cols.store_id: [101, 101, 102, 102, 103],`
	`24`	`+ cols.product_id: [10, 20, 30, 40, 50],`
	`25`	`+ cols.unit_spend: [500_000.0, 750_000.0, 300_000.0, 600_000.0, 350_000.0],`
`23`	`26`	`}`
`24`	`27`	`)`
`25`	`28`	`result = Acv(df).df`
`@@ -31,15 +34,15 @@ def test_acv_grouped_by_store(self, input_type):`
`31`	`34`	`"""Test ACV grouped by store returns correct per-store values for both input types."""`
`32`	`35`	`pdf = pd.DataFrame(`
`33`	`36`	`{`
`34`		`- "store_id": [101, 101, 102, 102, 103],`
`35`		`- "unit_spend": [400_000.0, 600_000.0, 300_000.0, 200_000.0, 500_000.0],`
	`37`	`+ cols.store_id: [101, 101, 102, 102, 103],`
	`38`	`+ cols.unit_spend: [400_000.0, 600_000.0, 300_000.0, 200_000.0, 500_000.0],`
`36`	`39`	`}`
`37`	`40`	`)`
`38`	`41`	`df = ibis.memtable(pdf) if input_type == "ibis" else pdf`
`39`		`- result = Acv(df, group_col="store_id").df.sort_values("store_id").reset_index(drop=True)`
	`42`	`+ result = Acv(df, group_col=cols.store_id).df.sort_values(cols.store_id).reset_index(drop=True)`
`40`	`43`	`expected = pd.DataFrame(`
`41`	`44`	`{`
`42`		`- "store_id": [101, 102, 103],`
	`45`	`+ cols.store_id: [101, 102, 103],`
`43`	`46`	`"acv": [1.0, 0.5, 0.5],`
`44`	`47`	`}`
`45`	`48`	`)`
`@@ -49,15 +52,15 @@ def test_acv_group_col_list(self):`
`49`	`52`	`"""Test ACV grouped by multiple columns."""`
`50`	`53`	`df = pd.DataFrame(`
`51`	`54`	`{`
`52`		`- "store_id": [101, 101, 102],`
	`55`	`+ cols.store_id: [101, 101, 102],`
`53`	`56`	`"region": ["North", "North", "South"],`
`54`		`- "unit_spend": [1_000_000.0, 500_000.0, 2_000_000.0],`
	`57`	`+ cols.unit_spend: [1_000_000.0, 500_000.0, 2_000_000.0],`
`55`	`58`	`}`
`56`	`59`	`)`
`57`		`- result = Acv(df, group_col=["store_id", "region"]).df.sort_values("store_id").reset_index(drop=True)`
	`60`	`+ result = Acv(df, group_col=[cols.store_id, "region"]).df.sort_values(cols.store_id).reset_index(drop=True)`
`58`	`61`	`expected = pd.DataFrame(`
`59`	`62`	`{`
`60`		`- "store_id": [101, 102],`
	`63`	`+ cols.store_id: [101, 102],`
`61`	`64`	`"region": ["North", "South"],`
`62`	`65`	`"acv": [1.5, 2.0],`
`63`	`66`	`}`
`@@ -68,37 +71,37 @@ def test_acv_with_nan_values(self):`
`68`	`71`	`"""Test that NaN values are excluded from the ACV sum."""`
`69`	`72`	`df = pd.DataFrame(`
`70`	`73`	`{`
`71`		`- "store_id": [101, 101, 102],`
`72`		`- "unit_spend": [1_000_000.0, np.nan, 500_000.0],`
	`74`	`+ cols.store_id: [101, 101, 102],`
	`75`	`+ cols.unit_spend: [1_000_000.0, np.nan, 500_000.0],`
`73`	`76`	`}`
`74`	`77`	`)`
`75`		`- result = Acv(df, group_col="store_id").df.sort_values("store_id").reset_index(drop=True)`
	`78`	`+ result = Acv(df, group_col=cols.store_id).df.sort_values(cols.store_id).reset_index(drop=True)`
`76`	`79`	`expected = pd.DataFrame(`
`77`	`80`	`{`
`78`		`- "store_id": [101, 102],`
	`81`	`+ cols.store_id: [101, 102],`
`79`	`82`	`"acv": [1.0, 0.5],`
`80`	`83`	`}`
`81`	`84`	`)`
`82`	`85`	`assert_frame_equal(result, expected)`
`83`	`86`
`84`	`87`	`def test_acv_missing_column_raises(self):`
`85`	`88`	`"""Test that missing unit_spend column raises ValueError."""`
`86`		`- df = pd.DataFrame({"customer_id": [1, 2], "store_id": [101, 102]})`
	`89`	`+ df = pd.DataFrame({cols.customer_id: [1, 2], cols.store_id: [101, 102]})`
`87`	`90`	`with pytest.raises(ValueError, match="missing"):`
`88`	`91`	`Acv(df)`
`89`	`92`
`90`		`- def test_acv_missing_group_col_column_raises(self):`
	`93`	`+ def test_acv_missing_group_col_raises(self):`
`91`	`94`	`"""Test that missing group_col column raises ValueError."""`
`92`		`- df = pd.DataFrame({"unit_spend": [100.0, 200.0]})`
	`95`	`+ df = pd.DataFrame({cols.unit_spend: [100.0, 200.0]})`
`93`	`96`	`with pytest.raises(ValueError, match="missing"):`
`94`		`- Acv(df, group_col="store_id")`
	`97`	`+ Acv(df, group_col=cols.store_id)`
`95`	`98`
`96`	`99`	`def test_acv_custom_scale_factor(self):`
`97`	`100`	`"""Test ACV with a custom scale factor."""`
`98`	`101`	`df = pd.DataFrame(`
`99`	`102`	`{`
`100`		`- "store_id": [101, 102],`
`101`		`- "unit_spend": [5_000.0, 10_000.0],`
	`103`	`+ cols.store_id: [101, 102],`
	`104`	`+ cols.unit_spend: [5_000.0, 10_000.0],`
`102`	`105`	`}`
`103`	`106`	`)`
`104`	`107`	`result = Acv(df, acv_scale_factor=1_000).df`
`@@ -108,11 +111,11 @@ def test_acv_custom_scale_factor(self):`
`108`	`111`	`@pytest.mark.parametrize("scale_factor", [0, -1_000])`
`109`	`112`	`def test_acv_non_positive_scale_factor_raises(self, scale_factor):`
`110`	`113`	`"""Test that zero or negative acv_scale_factor raises ValueError."""`
`111`		`- df = pd.DataFrame({"unit_spend": [500_000.0, 1_000_000.0]})`
	`114`	`+ df = pd.DataFrame({cols.unit_spend: [500_000.0, 1_000_000.0]})`
`112`	`115`	`with pytest.raises(ValueError, match="acv_scale_factor must be positive"):`
`113`	`116`	`Acv(df, acv_scale_factor=scale_factor)`
`114`	`117`
`115`	`118`	`def test_acv_invalid_type_raises(self):`
`116`	`119`	`"""Test that passing a non-DataFrame/Table raises TypeError."""`
`117`	`120`	`with pytest.raises(TypeError, match="pandas DataFrame or an Ibis Table"):`
`118`		`- Acv({"unit_spend": [100.0]})`
	`121`	`+ Acv({cols.unit_spend: [100.0]})`
Original file line number	Diff line number	Diff line change
`@@ -220,14 +220,14 @@ def test_within_group_ignored_without_group_col(self):`
`220`	`220`	`result_without = PctOfStores(df, within_group=False).df.sort_values(cols.product_id).reset_index(drop=True)`
`221`	`221`	`assert_frame_equal(result_with, result_without)`
`222`	`222`
`223`		`- def test_within_group_with_user_column_named_total_stores(self):`
`224`		`- """Test that a user column named _total_stores does not collide with internal temp column."""`
	`223`	`+ def test_within_group_ignores_unrelated_extra_columns(self):`
	`224`	`+ """Test that extra columns in the input don't affect within_group computation."""`
`225`	`225`	`df = pd.DataFrame(`
`226`	`226`	`{`
`227`	`227`	`cols.store_id: [10, 20, 30, 40, 10],`
`228`	`228`	`cols.product_id: [501, 501, 502, 502, 502],`
`229`	`229`	`"region": ["North", "North", "South", "South", "North"],`
`230`		`- "_total_stores": [100, 200, 300, 400, 100],`
	`230`	`+ "some_metric": [100, 200, 300, 400, 100],`
`231`	`231`	`cols.unit_spend: [5.99, 3.49, 4.00, 6.00, 2.50],`
`232`	`232`	`}`
`233`	`233`	`)`