|
| 1 | +"""% of Stores (Numeric Distribution) metric. |
| 2 | +
|
| 3 | +% of Stores measures the share of total stores in the dataset that sell a given product. |
| 4 | +Every store counts equally regardless of its sales volume. |
| 5 | +""" |
| 6 | + |
| 7 | +from __future__ import annotations |
| 8 | + |
| 9 | +import ibis |
| 10 | +import pandas as pd |
| 11 | +from ibis import _ |
| 12 | + |
| 13 | +from pyretailscience.metrics.base import ratio_metric |
| 14 | +from pyretailscience.options import ColumnHelper, get_option |
| 15 | +from pyretailscience.utils.validation import validate_columns |
| 16 | + |
| 17 | +_TEMP_TOTAL_STORES = "__prs_temp_total_stores__" |
| 18 | + |
| 19 | + |
| 20 | +class PctOfStores: |
| 21 | + """Calculates the percentage of stores selling each product. |
| 22 | +
|
| 23 | + This is the simplest, unweighted distribution metric (numeric distribution). |
| 24 | + It answers the question: "What fraction of stores carry this product?" |
| 25 | +
|
| 26 | + Results are accessible via the ``table`` attribute (ibis Table) or the ``df`` property |
| 27 | + (materialized pandas DataFrame). |
| 28 | +
|
| 29 | + Args: |
| 30 | + df (pd.DataFrame | ibis.Table): Transaction-level data containing at least |
| 31 | + store_id and product_id columns. |
| 32 | + product_col (str | None, optional): Column defining product granularity. |
| 33 | + Defaults to ``get_option("column.product_id")``. |
| 34 | + group_col (str | list[str] | None, optional): Additional grouping dimensions |
| 35 | + (e.g., ``"category_0_name"``). Defaults to None. |
| 36 | + within_group (bool, optional): Controls the denominator when ``group_col`` is specified. |
| 37 | + When ``False`` (default), the percentage is relative to all stores in the dataset. |
| 38 | + When ``True``, the percentage is relative to stores within each group independently. |
| 39 | + Has no effect when ``group_col`` is None. Defaults to False. |
| 40 | +
|
| 41 | + Raises: |
| 42 | + TypeError: If df is not a pandas DataFrame or an Ibis Table. |
| 43 | + ValueError: If required columns are missing from the data, or if product_col |
| 44 | + appears in group_col. |
| 45 | + """ |
| 46 | + |
| 47 | + def __init__( |
| 48 | + self, |
| 49 | + df: pd.DataFrame | ibis.Table, |
| 50 | + *, |
| 51 | + product_col: str | None = None, |
| 52 | + group_col: str | list[str] | None = None, |
| 53 | + within_group: bool = False, |
| 54 | + ) -> None: |
| 55 | + """Initializes the % of Stores calculation.""" |
| 56 | + self._df: pd.DataFrame | None = None |
| 57 | + self.table: ibis.Table |
| 58 | + |
| 59 | + if isinstance(df, pd.DataFrame): |
| 60 | + df = ibis.memtable(df) |
| 61 | + elif not isinstance(df, ibis.Table): |
| 62 | + raise TypeError("df must be either a pandas DataFrame or an Ibis Table.") |
| 63 | + |
| 64 | + store_id_col = get_option("column.store_id") |
| 65 | + product_col = product_col if product_col is not None else get_option("column.product_id") |
| 66 | + |
| 67 | + if isinstance(group_col, str): |
| 68 | + group_col = [group_col] |
| 69 | + |
| 70 | + required_cols = [store_id_col, product_col] |
| 71 | + if group_col is not None: |
| 72 | + if product_col in group_col: |
| 73 | + msg = f"product_col '{product_col}' must not also appear in group_col" |
| 74 | + raise ValueError(msg) |
| 75 | + required_cols.extend(group_col) |
| 76 | + validate_columns(df, required_cols) |
| 77 | + |
| 78 | + group_cols = [product_col] |
| 79 | + if group_col is not None: |
| 80 | + group_cols.extend(group_col) |
| 81 | + |
| 82 | + store_product = df.select([store_id_col, *group_cols]).distinct() |
| 83 | + |
| 84 | + agg_stores_col = get_option("column.agg.store_id") |
| 85 | + per_group = store_product.group_by(group_cols).aggregate( |
| 86 | + **{agg_stores_col: _[store_id_col].count()}, |
| 87 | + ) |
| 88 | + |
| 89 | + if within_group and group_col is not None: |
| 90 | + total_stores = store_product.group_by(group_col).aggregate( |
| 91 | + **{_TEMP_TOTAL_STORES: _[store_id_col].nunique()}, |
| 92 | + ) |
| 93 | + per_group = per_group.inner_join(total_stores, group_col) |
| 94 | + denominator = _[_TEMP_TOTAL_STORES] |
| 95 | + else: |
| 96 | + denominator = store_product[store_id_col].nunique() |
| 97 | + |
| 98 | + pct_stores_col = ColumnHelper.join_options("column.agg.store_id", "column.suffix.percent") |
| 99 | + self.table = per_group.mutate( |
| 100 | + **{pct_stores_col: ratio_metric(_[agg_stores_col], denominator)}, |
| 101 | + ) |
| 102 | + if within_group and group_col is not None: |
| 103 | + self.table = self.table.drop(_TEMP_TOTAL_STORES) |
| 104 | + |
| 105 | + @property |
| 106 | + def df(self) -> pd.DataFrame: |
| 107 | + """Returns the materialized pandas DataFrame of % of Stores results. |
| 108 | +
|
| 109 | + Returns: |
| 110 | + pd.DataFrame: DataFrame with % of stores values. Cached after first access. |
| 111 | + """ |
| 112 | + if self._df is None: |
| 113 | + self._df = self.table.execute() |
| 114 | + return self._df |
0 commit comments