Data-Simply · mvanwyk · Mar 18, 2025 · Mar 17, 2025 · Mar 18, 2025 · Mar 18, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,12 +1,12 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.2.2"
+    rev: "v0.11.0"
     hooks:
       - id: ruff
         args: ["--fix"]
       - id: ruff-format
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer

diff --git a/docs/examples/cross_shop.ipynb b/docs/examples/cross_shop.ipynb
@@ -238,7 +238,9 @@
    "source": [
     "shoes_idx = df[\"category_1_name\"] == \"Shoes\"\n",
     "df.loc[shoes_idx, \"category_1_name\"] = np.random.RandomState(42).choice(\n",
-    "    [\"Shoes\", \"Jeans\"], size=shoes_idx.sum(), p=[0.5, 0.5],\n",
+    "    [\"Shoes\", \"Jeans\"],\n",
+    "    size=shoes_idx.sum(),\n",
+    "    p=[0.5, 0.5],\n",
     ")"
    ]
   },

diff --git a/docs/examples/gain_loss.ipynb b/docs/examples/gain_loss.ipynb
@@ -254,7 +254,9 @@
     "# Reassign half the rows to Calvin Klein and leave the other half as Diesel\n",
     "p2_diesel_idx = time_period_2 & (df[\"brand_name\"] == \"Diesel\")\n",
     "df.loc[p2_diesel_idx, \"brand_name\"] = np.random.RandomState(42).choice(\n",
-    "    [\"Calvin Klein\", \"Diesel\"], size=p2_diesel_idx.sum(), p=[0.75, 0.25],\n",
+    "    [\"Calvin Klein\", \"Diesel\"],\n",
+    "    size=p2_diesel_idx.sum(),\n",
+    "    p=[0.75, 0.25],\n",
     ")\n",
     "\n",
     "# Apply a 20% discount to Calvin Klein products and increase the quantity by 50%\n",

diff --git a/docs/examples/segmentation.ipynb b/docs/examples/segmentation.ipynb
@@ -701,10 +701,10 @@
     "    },\n",
     "    color=\"black\",\n",
     "    bbox={\n",
-    "        \"facecolor\":\"white\",\n",
-    "        \"edgecolor\":\"white\",\n",
-    "        \"boxstyle\":\"round,rounding_size=0.75\",\n",
-    "        \"pad\":0.75,\n",
+    "        \"facecolor\": \"white\",\n",
+    "        \"edgecolor\": \"white\",\n",
+    "        \"boxstyle\": \"round,rounding_size=0.75\",\n",
+    "        \"pad\": 0.75,\n",
     "    },\n",
     "    linespacing=1.5,\n",
     ")\n",

diff --git a/pyretailscience/analysis/cross_shop.py b/pyretailscience/analysis/cross_shop.py
@@ -1,6 +1,5 @@
 """This module contains the CrossShop class that is used to create a cross-shop diagram."""
 
-
 import ibis
 import matplotlib.pyplot as plt
 import pandas as pd

diff --git a/pyretailscience/analysis/haversine.py b/pyretailscience/analysis/haversine.py
@@ -21,6 +21,7 @@
 - **Requires Ibis-Compatible Backend**: Ensure your Ibis backend supports trigonometric functions.
 - **Assumes Spherical Earth**: Uses the Haversine formula, which introduces slight inaccuracies due to Earth's oblate shape.
 """
+
 import ibis
 
 

diff --git a/pyretailscience/analysis/segmentation.py b/pyretailscience/analysis/segmentation.py
@@ -192,7 +192,7 @@
     def __init__(
         self,
         data: pd.DataFrame | ibis.Table,
-        segment_col: str = "segment_name",
+        segment_col: str | list[str] = "segment_name",
         extra_aggs: dict[str, tuple[str, str]] | None = None,
     ) -> None:
         """Calculates transaction statistics by segment.
@@ -202,7 +202,8 @@
                 customer_id, unit_spend and transaction_id. If the dataframe contains the column unit_quantity, then
                 the columns unit_spend and unit_quantity are used to calculate the price_per_unit and
                 units_per_transaction.
-            segment_col (str, optional): The column to use for the segmentation. Defaults to "segment_name".
+            segment_col (str | list[str], optional): The column or list of columns to use for the segmentation.
+                Defaults to "segment_name".
             extra_aggs (dict[str, tuple[str, str]], optional): Additional aggregations to perform.
                 The keys in the dictionary will be the column names for the aggregation results.
                 The values are tuples with (column_name, aggregation_function), where:
@@ -211,11 +212,14 @@
                 Example: {"stores": ("store_id", "nunique")} would count unique store_ids.
         """
         cols = ColumnHelper()
+
+        if isinstance(segment_col, str):
+            segment_col = [segment_col]
         required_cols = [
             cols.customer_id,
             cols.unit_spend,
             cols.transaction_id,
-            segment_col,
+            *segment_col,
         ]
         if cols.unit_qty in data.columns:
             required_cols.append(cols.unit_qty)
@@ -273,14 +277,14 @@
     @staticmethod
     def _calc_seg_stats(
         data: pd.DataFrame | ibis.Table,
-        segment_col: str,
+        segment_col: list[str],
         extra_aggs: dict[str, tuple[str, str]] | None = None,
     ) -> ibis.Table:
         """Calculates the transaction statistics by segment.
 
         Args:
             data (pd.DataFrame | ibis.Table): The transaction data.
-            segment_col (str): The column to use for the segmentation.
+            segment_col (list[str]): The columns to use for the segmentation.
             extra_aggs (dict[str, tuple[str, str]], optional): Additional aggregations to perform.
                 The keys in the dictionary will be the column names for the aggregation results.
                 The values are tuples with (column_name, aggregation_function).
@@ -314,7 +318,7 @@
 
         # Calculate metrics for segments and total
         segment_metrics = data.group_by(segment_col).aggregate(**aggs)
-        total_metrics = data.aggregate(**aggs).mutate(segment_name=ibis.literal("Total"))
+        total_metrics = data.aggregate(**aggs).mutate({col: ibis.literal("Total") for col in segment_col})
         total_customers = data[cols.customer_id].nunique()
 
         # Cross join with total_customers to make it available for percentage calculation
@@ -343,7 +347,7 @@
         if self._df is None:
             cols = ColumnHelper()
             col_order = [
-                self.segment_col,
+                *self.segment_col,
                 *SegTransactionStats._get_col_order(include_quantity=cols.agg_unit_qty in self.table.columns),
             ]
 
@@ -392,18 +396,23 @@
         Raises:
             ValueError: If the sort_order is not "ascending", "descending" or None.
             ValueError: If the orientation is not "vertical" or "horizontal".
+            ValueError: If multiple segment columns are used, as plotting is only supported for a single segment column.
         """
         if sort_order not in ["ascending", "descending", None]:
             raise ValueError("sort_order must be either 'ascending' or 'descending' or None")
         if orientation not in ["vertical", "horizontal"]:
             raise ValueError("orientation must be either 'vertical' or 'horizontal'")
+        if len(self.segment_col) > 1:
+            raise ValueError("Plotting is only supported for a single segment column")
 
         default_title = f"{value_col.title()} by Segment"
         kind = "bar"
         if orientation == "horizontal":
             kind = "barh"
 
-        val_s = self.df.set_index(self.segment_col)[value_col]
+        # The guard above rejects multiple segment columns, so index the plot by the only one
+        plot_segment_col = self.segment_col[0]
+        val_s = self.df.set_index(plot_segment_col)[value_col]
         if hide_total:
             val_s = val_s[val_s.index != "Total"]
 

diff --git a/pyretailscience/plots/time.py b/pyretailscience/plots/time.py
@@ -33,7 +33,6 @@
 - **Helper functions**: Utilizes utility functions from the `pyretailscience` package to handle styling, formatting, and other plot adjustments.
 """
 
-
 import numpy as np
 import pandas as pd
 from matplotlib.axes import Axes, SubplotBase

diff --git a/pyretailscience/plots/venn.py b/pyretailscience/plots/venn.py
@@ -22,6 +22,7 @@
 - **Pre-Aggregated Data Required**: The module does not perform data aggregation; input data should already be structured correctly.
 
 """
+
 from collections.abc import Callable
 
 import pandas as pd

diff --git a/tests/analysis/test_cross_shop.py b/tests/analysis/test_cross_shop.py
@@ -9,7 +9,7 @@
 cols = ColumnHelper()
 
 
-@pytest.fixture()
+@pytest.fixture
 def sample_data():
     """Sample data for testing."""
     return pd.DataFrame(

diff --git a/tests/analysis/test_haversine.py b/tests/analysis/test_haversine.py
@@ -1,12 +1,13 @@
 """Tests for the haversine distance module."""
+
 import ibis
 import pandas as pd
 import pytest
 
 from pyretailscience.analysis.haversine import haversine_distance
 
 
-@pytest.fixture()
+@pytest.fixture
 def sample_ibis_table():
     """Fixture to provide a sample Ibis table for testing."""
     data = {

diff --git a/tests/analysis/test_product_association.py b/tests/analysis/test_product_association.py
@@ -12,7 +12,7 @@
 class TestProductAssociations:
     """Tests for the ProductAssociations class."""
 
-    @pytest.fixture()
+    @pytest.fixture
     def transactions_df(self) -> pd.DataFrame:
         """Return a sample DataFrame for testing."""
         # fmt: off
@@ -23,7 +23,7 @@ def transactions_df(self) -> pd.DataFrame:
         })
         # fmt: on
 
-    @pytest.fixture()
+    @pytest.fixture
     def expected_results_single_items_df(self) -> pd.DataFrame:
         """Return the expected results for the single items association analysis."""
         # fmt: off
@@ -58,7 +58,7 @@ def expected_results_single_items_df(self) -> pd.DataFrame:
         )
         # fmt: on
 
-    @pytest.fixture()
+    @pytest.fixture
     def expected_results_pair_items_df(self) -> pd.DataFrame:
         """Return the expected results for the pair items association analysis."""
         # fmt: off

diff --git a/tests/analysis/test_revenue_tree.py b/tests/analysis/test_revenue_tree.py
@@ -12,7 +12,7 @@
 class TestRevenueTree:
     """Test the RevenueTree class."""
 
-    @pytest.fixture()
+    @pytest.fixture
     def cols(self):
         """Return a ColumnHelper instance."""
         return ColumnHelper()

diff --git a/tests/analysis/test_segmentation.py b/tests/analysis/test_segmentation.py
@@ -13,7 +13,7 @@
 class TestCalcSegStats:
     """Tests for the _calc_seg_stats method."""
 
-    @pytest.fixture()
+    @pytest.fixture
     def base_df(self):
         """Return a base DataFrame for testing."""
         return pd.DataFrame(
@@ -314,6 +314,70 @@ def test_handles_empty_dataframe_with_errors(self):
         with pytest.raises(ValueError):
             SegTransactionStats(df, "segment_name")
 
+    def test_multiple_segment_columns(self):
+        """Test that the class correctly handles multiple segment columns."""
+        df = pd.DataFrame(
+            {
+                cols.customer_id: [1, 1, 2, 2, 3, 3],
+                cols.unit_spend: [100.0, 150.0, 200.0, 250.0, 300.0, 350.0],
+                cols.transaction_id: [101, 102, 103, 104, 105, 106],
+                "segment_name": ["A", "A", "B", "B", "A", "A"],
+                "region": ["North", "North", "South", "South", "East", "East"],
+            },
+        )
+
+        # Test with a list of segment columns
+        seg_stats = SegTransactionStats(df, ["segment_name", "region"])
+
+        # Create expected DataFrame with the combinations actually produced
+        expected_output = pd.DataFrame(
+            {
+                "segment_name": ["A", "A", "B", "Total"],
+                "region": ["East", "North", "South", "Total"],
+                cols.agg_unit_spend: [650.0, 250.0, 450.0, 1350.0],
+                cols.agg_transaction_id: [2, 2, 2, 6],
+                cols.agg_customer_id: [1, 1, 1, 3],
+                cols.calc_spend_per_cust: [650.0, 250.0, 450.0, 450.0],
+                cols.calc_spend_per_trans: [325.0, 125.0, 225.0, 225.0],
+                cols.calc_trans_per_cust: [2.0, 2.0, 2.0, 2.0],
+                cols.customers_pct: [1 / 3, 1 / 3, 1 / 3, 1.0],
+            },
+        )
+
+        # Sort both dataframes by the segment columns for consistent comparison
+        result_df = seg_stats.df.sort_values(["segment_name", "region"]).reset_index(drop=True)
+        expected_output = expected_output.sort_values(["segment_name", "region"]).reset_index(drop=True)
+
+        # Check that both segment columns are in the result
+        assert "segment_name" in result_df.columns
+        assert "region" in result_df.columns
+
+        # Check number of rows - the implementation only returns actual combinations that exist in data
+        # plus the Total row, not all possible combinations
+        assert len(result_df) == len(expected_output)
+
+        # Use pandas testing to compare the dataframes
+        pd.testing.assert_frame_equal(result_df[expected_output.columns], expected_output)
+
+    def test_plot_with_multiple_segment_columns(self):
+        """Test that plotting with multiple segment columns raises a ValueError."""
+        df = pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3],
+                cols.unit_spend: [100.0, 200.0, 300.0],
+                cols.transaction_id: [101, 102, 103],
+                "segment_name": ["A", "B", "A"],
+                "region": ["North", "South", "East"],
+            },
+        )
+
+        seg_stats = SegTransactionStats(df, ["segment_name", "region"])
+
+        with pytest.raises(ValueError) as excinfo:
+            seg_stats.plot("spend")
+
+        assert "Plotting is only supported for a single segment column" in str(excinfo.value)
+
     def test_extra_aggs_functionality(self):
         """Test that the extra_aggs parameter works correctly."""
         # Constants for expected values
@@ -365,9 +429,11 @@ def test_extra_aggs_functionality(self):
         # Sort by segment_name to ensure consistent order
         result_df_multi = seg_stats_multi.df.sort_values("segment_name").reset_index(drop=True)
 
-        assert result_df_multi.loc[0, "distinct_products"] == segment_a_product_count  # Segment A
-        assert result_df_multi.loc[1, "distinct_products"] == segment_b_product_count  # Segment B
-        assert result_df_multi.loc[2, "distinct_products"] == total_product_count  # Total
+        assert result_df_multi["distinct_products"].to_list() == [
+            segment_a_product_count,
+            segment_b_product_count,
+            total_product_count,
+        ]
 
     def test_extra_aggs_with_invalid_column(self):
         """Test that an error is raised when an invalid column is specified in extra_aggs."""
@@ -405,7 +471,7 @@ def test_extra_aggs_with_invalid_function(self):
 class TestHMLSegmentation:
     """Tests for the HMLSegmentation class."""
 
-    @pytest.fixture()
+    @pytest.fixture
     def base_df(self):
         """Return a base DataFrame for testing."""
         return pd.DataFrame(

diff --git a/tests/plots/test_area.py b/tests/plots/test_area.py
@@ -1,4 +1,5 @@
 """Tests for the plots.area module."""
+
 from itertools import cycle
 
 import numpy as np
@@ -13,7 +14,7 @@
 RNG = np.random.default_rng(42)
 
 
-@pytest.fixture()
+@pytest.fixture
 def sample_dataframe():
     """A sample dataframe for Jeans sales data."""
     data = {
@@ -24,7 +25,7 @@ def sample_dataframe():
     return pd.DataFrame(data)
 
 
-@pytest.fixture()
+@pytest.fixture
 def _mock_color_generators(mocker):
     """Mock the color generators for single and multi color maps."""
     single_color_gen = cycle(["#FF0000"])  # Mocked single-color generator (e.g., red)
@@ -34,7 +35,7 @@ def _mock_color_generators(mocker):
     mocker.patch("pyretailscience.style.tailwind.get_multi_color_cmap", return_value=multi_color_gen)
 
 
-@pytest.fixture()
+@pytest.fixture
 def _mock_gu_functions(mocker):
     mocker.patch("pyretailscience.style.graph_utils.standard_graph_styles", side_effect=lambda ax, **kwargs: ax)
     mocker.patch("pyretailscience.style.graph_utils.standard_tick_styles", side_effect=lambda ax: ax)
-Original file line number
+Diff line change
@@ Expand Up / @@ -21,6 +21,7 @@ @@
     - **Requires Ibis-Compatible Backend**: Ensure your Ibis backend supports trigonometric functions.
     - **Assumes Spherical Earth**: Uses the Haversine formula, which introduces slight inaccuracies due to Earth's oblate shape.
     """
     import ibis
@@ Expand Down @@