feat: convert seg stats to use Ibis

mvanwyk · mvanwyk · commit de3a12bba8b0 · 2025-02-08T11:27:26.000+01:00
diff --git a/docs/examples/segmentation.ipynb b/docs/examples/segmentation.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -68,6 +68,7 @@ ignore = [
     "TRY003", # Disable until we start creating proper exception classes
     "PT011",  # Disable until we start creating proper exception classes
     "PTH123", # Not using open() to open files
+    "SLF001", # Ibis relies heavily on the deferred `ibis._[column]` syntax, which triggers this rule
 ]
 select = [
     "A",    # Builtins
diff --git a/pyretailscience/options.py b/pyretailscience/options.py
@@ -237,7 +237,7 @@ def load_from_toml(cls, file_path: str) -> "Options":
 
         for section, options in toml_data.items():
             for option_name, option_value in Options.flatten_options(section, options).items():
-                if option_name in options_instance._options:  # noqa: SLF001
+                if option_name in options_instance._options:
                     options_instance.set_option(option_name, option_value)
                 else:
                     msg = f"Unknown option in TOML file: {option_name}"
@@ -392,6 +392,7 @@ def __init__(self) -> None:
         self.agg_customer_id_diff = self.join_options("column.agg.customer_id", "column.suffix.difference")
         self.agg_customer_id_pct_diff = self.join_options("column.agg.customer_id", "column.suffix.percent_difference")
         self.agg_customer_id_contrib = self.join_options("column.agg.customer_id", "column.suffix.contribution")
+        self.customers_pct = self.join_options("column.agg.customer_id", "column.suffix.percent")
         # Transactions
         self.transaction_id = get_option("column.transaction_id")
         self.agg_transaction_id = get_option("column.agg.transaction_id")
diff --git a/pyretailscience/segmentation.py b/pyretailscience/segmentation.py
@@ -2,10 +2,8 @@
 
 from typing import Literal
 
-import duckdb
 import ibis
 import pandas as pd
-from duckdb import DuckDBPyRelation
 from matplotlib.axes import Axes, SubplotBase
 
 import pyretailscience.style.graph_utils as gu
@@ -155,7 +153,7 @@ class HMLSegmentation(ThresholdSegmentation):
 
     def __init__(
         self,
-        df: pd.DataFrame,
+        df: pd.DataFrame | ibis.Table,
         value_col: str | None = None,
         agg_func: str = "sum",
         zero_value_customers: Literal["separate_segment", "exclude", "include_with_light"] = "separate_segment",
@@ -189,24 +187,27 @@ def __init__(
 class SegTransactionStats:
     """Calculates transaction statistics by segment."""
 
-    def __init__(self, data: pd.DataFrame | DuckDBPyRelation, segment_col: str = "segment_name") -> None:
+    _df: pd.DataFrame | None = None
+
+    def __init__(self, data: pd.DataFrame | ibis.Table, segment_col: str = "segment_name") -> None:
         """Calculates transaction statistics by segment.
 
         Args:
-            data (pd.DataFrame | DuckDBPyRelation): The transaction data. The dataframe must contain the columns
+            data (pd.DataFrame | ibis.Table): The transaction data. The dataframe must contain the columns
                 customer_id, unit_spend and transaction_id. If the dataframe contains the column unit_quantity, then
                 the columns unit_spend and unit_quantity are used to calculate the price_per_unit and
                 units_per_transaction.
             segment_col (str, optional): The column to use for the segmentation. Defaults to "segment_name".
         """
+        cols = ColumnHelper()
         required_cols = [
-            get_option("column.customer_id"),
-            get_option("column.unit_spend"),
-            get_option("column.transaction_id"),
+            cols.customer_id,
+            cols.unit_spend,
+            cols.transaction_id,
             segment_col,
         ]
-        if get_option("column.unit_quantity") in data.columns:
-            required_cols.append(get_option("column.unit_quantity"))
+        if cols.unit_qty in data.columns:
+            required_cols.append(cols.unit_qty)
 
         missing_cols = set(required_cols) - set(data.columns)
         if len(missing_cols) > 0:
@@ -215,66 +216,103 @@ def __init__(self, data: pd.DataFrame | DuckDBPyRelation, segment_col: str = "se
 
         self.segment_col = segment_col
 
-        self.df = self._calc_seg_stats(data, segment_col)
+        self.table = self._calc_seg_stats(data, segment_col)
 
     @staticmethod
-    def _calc_seg_stats(data: pd.DataFrame | DuckDBPyRelation, segment_col: str) -> pd.DataFrame:
+    def _get_col_order(include_quantity: bool) -> list[str]:
+        """Returns the default column order.
+
+        Columns should be supplied in the same order regardless of the function being called.
+
+        Args:
+            include_quantity (bool): Whether to include the columns related to quantity.
+
+        Returns:
+            list[str]: The default column order.
+        """
+        cols = ColumnHelper()
+        col_order = [
+            cols.agg_unit_spend,
+            cols.agg_transaction_id,
+            cols.agg_customer_id,
+            cols.calc_spend_per_cust,
+            cols.calc_spend_per_trans,
+            cols.calc_trans_per_cust,
+            cols.customers_pct,
+        ]
+        if include_quantity:
+            col_order.insert(3, "units")
+            col_order.insert(7, cols.calc_units_per_trans)
+            col_order.insert(7, cols.calc_price_per_unit)
+
+        return col_order
+
+    @staticmethod
+    def _calc_seg_stats(data: pd.DataFrame | ibis.Table, segment_col: str) -> ibis.Table:
         """Calculates the transaction statistics by segment.
 
         Args:
-            data (DuckDBPyRelation): The transaction data.
+            data (pd.DataFrame | ibis.Table): The transaction data.
             segment_col (str): The column to use for the segmentation.
 
         Returns:
             ibis.Table: The transaction statistics by segment.
 
         """
         if isinstance(data, pd.DataFrame):
-            data = duckdb.from_df(data)
-        elif not isinstance(data, DuckDBPyRelation):
-            raise TypeError("data must be either a pandas DataFrame or a DuckDBPyRelation")
-
-        base_aggs = [
-            f"SUM({get_option('column.unit_spend')}) as {get_option('column.agg.unit_spend')},",
-            f"COUNT(DISTINCT {get_option('column.transaction_id')}) as {get_option('column.agg.transaction_id')},",
-            f"COUNT(DISTINCT {get_option('column.customer_id')}) as {get_option('column.agg.customer_id')},",
-        ]
+            data = ibis.memtable(data)
 
-        total_customers = data.aggregate("COUNT(DISTINCT customer_id)").fetchone()[0]
-        return_cols = [
-            "*,",
-            f"{get_option('column.agg.unit_spend')} / {get_option('column.agg.customer_id')} ",
-            f"as {get_option('column.calc.spend_per_customer')},",
-            f"{get_option('column.agg.unit_spend')} / {get_option('column.agg.transaction_id')} ",
-            f"as {get_option('column.calc.spend_per_transaction')},",
-            f"{get_option('column.agg.transaction_id')} / {get_option('column.agg.customer_id')} ",
-            f"as {get_option('column.calc.transactions_per_customer')},",
-            f"{get_option('column.agg.customer_id')} / {total_customers}",
-            f"as customers_{get_option('column.suffix.percent')},",
-        ]
+        elif not isinstance(data, ibis.Table):
+            raise TypeError("data must be either a pandas DataFrame or a ibis Table")
 
-        if get_option("column.unit_quantity") in data.columns:
-            base_aggs.append(
-                f"SUM({get_option('column.unit_quantity')})::bigint as {get_option('column.agg.unit_quantity')},",
-            )
-            return_cols.extend(
-                [
-                    f"({get_option('column.agg.unit_spend')} / {get_option('column.agg.unit_quantity')}) ",
-                    f"as {get_option('column.calc.price_per_unit')},",
-                    f"({get_option('column.agg.unit_quantity')} / {get_option('column.agg.transaction_id')}) ",
-                    f"as {get_option('column.calc.units_per_transaction')},",
-                ],
-            )
+        cols = ColumnHelper()
 
-        segment_stats = data.aggregate(f"{segment_col} as segment_name," + "".join(base_aggs))
-        total_stats = data.aggregate("'Total' as segment_name," + "".join(base_aggs))
-        final_stats_df = segment_stats.union(total_stats).select("".join(return_cols)).df()
-        final_stats_df = final_stats_df.set_index("segment_name").sort_index()
+        # Base aggregations for segments
+        aggs = {
+            cols.agg_unit_spend: data[cols.unit_spend].sum(),
+            cols.agg_transaction_id: data[cols.transaction_id].nunique(),
+            cols.agg_customer_id: data[cols.customer_id].nunique(),
+        }
+        if cols.unit_qty in data.columns:
+            aggs[cols.agg_unit_qty] = data[cols.unit_qty].sum()
+
+        # Calculate metrics for segments and total
+        segment_metrics = data.group_by(segment_col).aggregate(**aggs)
+        total_metrics = data.aggregate(**aggs).mutate(**{segment_col: ibis.literal("Total")})
+
+        total_customers = data[cols.customer_id].nunique()
+
+        # Add the ratio metrics; total_customers is used directly as the divisor, so no join is performed
+        final_metrics = ibis.union(segment_metrics, total_metrics).mutate(
+            **{
+                cols.calc_spend_per_cust: ibis._[cols.agg_unit_spend] / ibis._[cols.agg_customer_id],
+                cols.calc_spend_per_trans: ibis._[cols.agg_unit_spend] / ibis._[cols.agg_transaction_id],
+                cols.calc_trans_per_cust: ibis._[cols.agg_transaction_id] / ibis._[cols.agg_customer_id],
+                cols.customers_pct: ibis._[cols.agg_customer_id] / total_customers,
+            },
+        )
 
-        # Make sure Total is the last row
-        desired_index_sort = final_stats_df.index.drop("Total").tolist() + ["Total"]  # noqa: RUF005
+        if cols.unit_qty in data.columns:
+            final_metrics = final_metrics.mutate(
+                **{
+                    cols.calc_price_per_unit: ibis._[cols.agg_unit_spend] / ibis._[cols.agg_unit_qty],
+                    cols.calc_units_per_trans: ibis._[cols.agg_unit_qty] / ibis._[cols.agg_transaction_id],
+                },
+            )
+
+        return final_metrics
 
-        return final_stats_df.reindex(desired_index_sort)
+    @property
+    def df(self) -> pd.DataFrame:
+        """Returns the dataframe with the transaction statistics by segment."""
+        if self._df is None:
+            cols = ColumnHelper()
+            col_order = [
+                self.segment_col,
+                *SegTransactionStats._get_col_order(include_quantity=cols.agg_unit_qty in self.table.columns),
+            ]
+            self._df = self.table.execute()[col_order]
+        return self._df
 
     def plot(
         self,
@@ -325,9 +363,9 @@ def plot(
         if orientation == "horizontal":
             kind = "barh"
 
-        val_s = self.df[value_col]
+        val_s = self.df.set_index(self.segment_col)[value_col]
         if hide_total:
-            val_s = val_s[val_s.index != "total"]
+            val_s = val_s[val_s.index != "Total"]
 
         if sort_order is not None:
             ascending = sort_order == "ascending"
diff --git a/tests/test_segmentation.py b/tests/test_segmentation.py
@@ -18,7 +18,7 @@ def base_df(self):
         return pd.DataFrame(
             {
                 cols.customer_id: [1, 2, 3, 4, 5],
-                cols.unit_spend: [100, 200, 150, 300, 250],
+                cols.unit_spend: [100.0, 200.0, 150.0, 300.0, 250.0],
                 cols.transaction_id: [101, 102, 103, 104, 105],
                 "segment_name": ["A", "B", "A", "B", "A"],
                 cols.unit_qty: [10, 20, 15, 30, 25],
@@ -37,21 +37,22 @@ def test_correctly_calculates_revenue_transactions_customers_per_segment(self, b
                 cols.calc_spend_per_cust: [166.666667, 250.0, 200.0],
                 cols.calc_spend_per_trans: [166.666667, 250.0, 200.0],
                 cols.calc_trans_per_cust: [1.0, 1.0, 1.0],
-                f"customers_{get_option('column.suffix.percent')}": [0.6, 0.4, 1.0],
                 cols.calc_price_per_unit: [10.0, 10.0, 10.0],
                 cols.calc_units_per_trans: [16.666667, 25.0, 20.0],
+                f"customers_{get_option('column.suffix.percent')}": [0.6, 0.4, 1.0],
             },
-        ).set_index("segment_name")
-
-        segment_stats = SegTransactionStats._calc_seg_stats(base_df, "segment_name")
+        )
+        segment_stats = (
+            SegTransactionStats(base_df, "segment_name").df.sort_values("segment_name").reset_index(drop=True)
+        )
         pd.testing.assert_frame_equal(segment_stats, expected_output)
 
     def test_correctly_calculates_revenue_transactions_customers(self):
         """Test that the method correctly calculates at the transaction level."""
         df = pd.DataFrame(
             {
                 get_option("column.customer_id"): [1, 2, 3, 4, 5],
-                cols.unit_spend: [100, 200, 150, 300, 250],
+                cols.unit_spend: [100.0, 200.0, 150.0, 300.0, 250.0],
                 cols.transaction_id: [101, 102, 103, 104, 105],
                 "segment_name": ["A", "B", "A", "B", "A"],
             },
@@ -68,18 +69,11 @@ def test_correctly_calculates_revenue_transactions_customers(self):
                 cols.calc_trans_per_cust: [1.0, 1.0, 1.0],
                 f"customers_{get_option('column.suffix.percent')}": [0.6, 0.4, 1.0],
             },
-        ).set_index("segment_name")
+        )
 
-        segment_stats = SegTransactionStats._calc_seg_stats(df, "segment_name")
+        segment_stats = SegTransactionStats(df, "segment_name").df.sort_values("segment_name").reset_index(drop=True)
         pd.testing.assert_frame_equal(segment_stats, expected_output)
 
-    def test_does_not_alter_original_dataframe(self, base_df):
-        """Test that the method does not alter the original DataFrame."""
-        original_df = base_df.copy()
-        _ = SegTransactionStats._calc_seg_stats(base_df, "segment_name")
-
-        pd.testing.assert_frame_equal(base_df, original_df)
-
     def test_handles_dataframe_with_one_segment(self, base_df):
         """Test that the method correctly handles a DataFrame with only one segment."""
         df = base_df.copy()
@@ -95,13 +89,13 @@ def test_handles_dataframe_with_one_segment(self, base_df):
                 cols.calc_spend_per_cust: [200.0, 200.0],
                 cols.calc_spend_per_trans: [200.0, 200.0],
                 cols.calc_trans_per_cust: [1.0, 1.0],
-                f"customers_{get_option('column.suffix.percent')}": [1.0, 1.0],
                 cols.calc_price_per_unit: [10.0, 10.0],
                 cols.calc_units_per_trans: [20.0, 20.0],
+                f"customers_{get_option('column.suffix.percent')}": [1.0, 1.0],
             },
-        ).set_index("segment_name")
+        )
 
-        segment_stats = SegTransactionStats._calc_seg_stats(df, "segment_name")
+        segment_stats = SegTransactionStats(df, "segment_name").df
         pd.testing.assert_frame_equal(segment_stats, expected_output)
 
 

Original file line number	Diff line number	Diff line change
`@@ -68,6 +68,7 @@ ignore = [`
`68`	`68`	`"TRY003", # Disable until we start creating proper exception classes`
`69`	`69`	`"PT011", # Disable until we start creating proper exception classes`
`70`	`70`	`"PTH123", # Not using open() to open files`
	`71`	`+ "SLF001", # Ibis makes a lot of use of the ibis._[column] which triggers this`
`71`	`72`	`]`
`72`	`73`	`select = [`
`73`	`74`	`"A", # Builtins`