feat: added treshold segmentation creation

mvanwyk · mvanwyk · commit fa6d7d7d0845 · 2024-07-10T20:15:31.000+02:00
diff --git a/pyretailscience/segmentation.py b/pyretailscience/segmentation.py
@@ -67,20 +67,30 @@ def __init__(self, df: pd.DataFrame) -> None:
         self.df = df[["customer_id", "segment_name", "segment_id"]].set_index("customer_id")
 
 
-class HMLSegmentation(BaseSegmentation):
-    """Segments customers into Heavy, Medium, Light and Zero spenders based on the total spend."""
+class ThresholdSegmentation(BaseSegmentation):
+    """Segments customers based on user-defined thresholds and segments."""
 
     def __init__(
         self,
         df: pd.DataFrame,
+        thresholds: list[float],
+        segments: dict[any, str],
         value_col: str = "total_price",
+        agg_func: str = "sum",
+        zero_segment_name: str = "Zero",
+        zero_segment_id: str = "Z",
         zero_value_customers: Literal["separate_segment", "exclude", "include_with_light"] = "separate_segment",
     ) -> None:
-        """Segments customers into Heavy, Medium, Light and Zero spenders based on the total spend.
+        """Segments customers based on user-defined thresholds and segments.
 
         Args:
             df (pd.DataFrame): A dataframe with the transaction data. The dataframe must contain a customer_id column.
-            value_col (str, optional): The column to use for the segmentation. Defaults to "total_price".
+            thresholds (List[float]): The percentile thresholds for segmentation.
+            segments (Dict[str, str]): A dictionary where keys are segment IDs and values are segment names.
+            value_col (str): The column to use for the segmentation.
+            agg_func (str, optional): The aggregation function to use when grouping by customer_id. Defaults to "sum".
+            zero_segment_name (str, optional): The name of the segment for customers with zero spend. Defaults to "Zero".
+            zero_segment_id (str, optional): The ID of the segment for customers with zero spend. Defaults to "Z".
             zero_value_customers (Literal["separate_segment", "exclude", "include_with_light"], optional): How to handle
                 customers with zero spend. Defaults to "separate_segment".
 
@@ -102,38 +112,79 @@ def __init__(
             msg = f"The dataframe requires the columns {required_cols} and they must be non-null"
             raise ValueError(msg)
 
-        hml_cuts = [0.500, 0.800, 1]
-        if len(df) < len(hml_cuts):
-            msg = f"There are {len(df)} customers, which is less than is less than the number of segment thresholds."
+        if len(df) < len(thresholds):
+            msg = f"There are {len(df)} customers, which is less than the number of segment thresholds."
             raise ValueError(msg)
 
+        if set(thresholds) != set(thresholds):
+            raise ValueError("The thresholds must be unique.")
+
+        thresholds = sorted(thresholds)
+        if thresholds[0] != 0:
+            thresholds = [0, *thresholds]
+        if thresholds[-1] != 1:
+            thresholds.append(1)
+
+        if len(thresholds) - 1 != len(segments):
+            raise ValueError("The number of thresholds must match the number of segments.")
+
         # Group by customer_id and calculate total_spend
-        grouped_df = df.groupby("customer_id")[value_col].sum().to_frame(value_col)
+        grouped_df = df.groupby("customer_id")[value_col].agg(agg_func).to_frame(value_col)
 
         # Separate customers with zero spend
-        hml_df = grouped_df
+        self.df = grouped_df
         if zero_value_customers in ["separate_segment", "exclude"]:
             zero_idx = grouped_df[value_col] == 0
-            zero_cust_df = grouped_df[zero_idx]
-            zero_cust_df["segment_name"] = "Zero"
+            zero_cust_df = grouped_df[zero_idx].copy()
+            zero_cust_df["segment_name"] = zero_segment_name
+            zero_cust_df["segment_id"] = zero_segment_id
 
-            hml_df = grouped_df[~zero_idx]
+            self.df = grouped_df[~zero_idx]
 
         # Create a new column 'segment' based on the total_spend
-        hml_df["segment_name"] = pd.qcut(
-            hml_df[value_col],
-            q=[0, *hml_cuts],
-            labels=["Light", "Medium", "Heavy"],
+        labels = list(segments.values())
+
+        self.df["segment_name"] = pd.qcut(
+            self.df[value_col],
+            q=thresholds,
+            labels=labels,
         )
 
+        self.df["segment_id"] = self.df["segment_name"].map({v: k for k, v in segments.items()})
+
         if zero_value_customers == "separate_segment":
-            hml_df = pd.concat([hml_df, zero_cust_df])
+            self.df = pd.concat([self.df, zero_cust_df])
+
 
-        segment_code_map = {"Light": "L", "Medium": "M", "Heavy": "H", "Zero": "Z"}
+class HMLSegmentation(ThresholdSegmentation):
+    """Segments customers into Heavy, Medium, Light and Zero spenders based on the total spend."""
 
-        hml_df["segment_id"] = hml_df["segment_name"].map(segment_code_map)
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        value_col: str = "total_price",
+        agg_func: str = "sum",
+        zero_value_customers: Literal["separate_segment", "exclude", "include_with_light"] = "separate_segment",
+    ) -> None:
+        """Segments customers into Heavy, Medium, Light and Zero spenders based on the total spend.
 
-        self.df = hml_df
+        Args:
+            df (pd.DataFrame): A dataframe with the transaction data. The dataframe must contain a customer_id column.
+            value_col (str, optional): The column to use for the segmentation. Defaults to "total_price".
+            agg_func (str, optional): The aggregation function to use when grouping by customer_id. Defaults to "sum".
+            zero_value_customers (Literal["separate_segment", "exclude", "include_with_light"], optional): How to handle
+                customers with zero spend. Defaults to "separate_segment".
+        """
+        thresholds = [0.500, 0.800, 1]
+        segments = {"L": "Light", "M": "Medium", "H": "Heavy"}
+        super().__init__(
+            df=df,
+            value_col=value_col,
+            agg_func=agg_func,
+            thresholds=thresholds,
+            segments=segments,
+            zero_value_customers=zero_value_customers,
+        )
 
 
 class SegTransactionStats:
diff --git a/tests/test_segmentation.py b/tests/test_segmentation.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import pytest
 
-from pyretailscience.segmentation import HMLSegmentation, SegTransactionStats
+from pyretailscience.segmentation import HMLSegmentation, SegTransactionStats, ThresholdSegmentation
 
 
 class TestCalcSegStats:
@@ -90,6 +90,193 @@ def test_handles_dataframe_with_one_segment(self, base_df):
         pd.testing.assert_frame_equal(segment_stats, expected_output)
 
 
+class TestThresholdSegmentation:
+    """Tests for the ThresholdSegmentation class."""
+
+    def test_correct_segmentation(self):
+        """Test that the method correctly segments customers based on given thresholds and segments."""
+        df = pd.DataFrame({"customer_id": [1, 2, 3, 4], "total_price": [100, 200, 300, 400]})
+        thresholds = [0.5, 1]
+        segments = {0: "Low", 1: "High"}
+        seg = ThresholdSegmentation(
+            df=df,
+            thresholds=thresholds,
+            segments=segments,
+            value_col="total_price",
+            zero_value_customers="exclude",
+        )
+        result_df = seg.df
+        assert result_df.loc[1, "segment_name"] == "Low"
+        assert result_df.loc[2, "segment_name"] == "Low"
+        assert result_df.loc[3, "segment_name"] == "High"
+        assert result_df.loc[4, "segment_name"] == "High"
+
+    def test_single_customer(self):
+        """Test that the method correctly segments a DataFrame with only one customer."""
+        df = pd.DataFrame({"customer_id": [1], "total_price": [100]})
+        thresholds = [0.5, 1]
+        segments = {0: "Low"}
+        with pytest.raises(ValueError):
+            ThresholdSegmentation(
+                df=df,
+                thresholds=thresholds,
+                segments=segments,
+            )
+
+    def test_correct_aggregation_function(self):
+        """Test that the correct aggregation function is applied for product_id custom segmentation."""
+        df = pd.DataFrame(
+            {
+                "customer_id": [1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5],
+                "product_id": [3, 4, 4, 6, 1, 5, 7, 2, 2, 3, 2, 3, 4, 1],
+            },
+        )
+        value_col = "product_id"
+        agg_func = "nunique"
+
+        my_seg = ThresholdSegmentation(
+            df=df,
+            value_col=value_col,
+            agg_func=agg_func,
+            thresholds=[0.2, 0.8, 1],
+            segments={"A": "Low", "B": "Medium", "C": "High"},
+            zero_value_customers="separate_segment",
+        )
+
+        expected_result = pd.DataFrame(
+            {
+                "customer_id": [1, 2, 3, 4, 5],
+                "product_id": [1, 4, 2, 2, 3],
+                "segment_name": ["Low", "High", "Medium", "Medium", "Medium"],
+                "segment_id": ["A", "C", "B", "B", "B"],
+            },
+        )
+        expected_result["segment_id"] = pd.Categorical(
+            expected_result["segment_id"],
+            categories=["A", "B", "C"],
+            ordered=True,
+        )
+        expected_result["segment_name"] = pd.Categorical(
+            expected_result["segment_name"],
+            categories=["Low", "Medium", "High"],
+            ordered=True,
+        )
+        pd.testing.assert_frame_equal(my_seg.df.reset_index(), expected_result)
+
+    def test_correctly_checks_segment_data(self):
+        """Test that the method correctly merges segment data back into the original DataFrame."""
+        df = pd.DataFrame(
+            {
+                "customer_id": [1, 2, 3, 4, 5],
+                "total_price": [100, 200, 0, 150, 0],
+            },
+        )
+        value_col = "total_price"
+        agg_func = "sum"
+        thresholds = [0.33, 0.66, 1]
+        segments = {"A": "Low", "B": "Medium", "C": "High"}
+        zero_value_customers = "separate_segment"
+
+        # Create ThresholdSegmentation instance
+        threshold_seg = ThresholdSegmentation(
+            df=df,
+            value_col=value_col,
+            agg_func=agg_func,
+            thresholds=thresholds,
+            segments=segments,
+            zero_value_customers=zero_value_customers,
+        )
+
+        # Call add_segment method
+        segmented_df = threshold_seg.add_segment(df)
+
+        # Assert the correct segment_name and segment_id
+        expected_df = pd.DataFrame(
+            {
+                "customer_id": [1, 2, 3, 4, 5],
+                "total_price": [100, 200, 0, 150, 0],
+                "segment_name": ["Low", "High", "Zero", "Medium", "Zero"],
+                "segment_id": ["A", "C", "Z", "B", "Z"],
+            },
+        )
+        pd.testing.assert_frame_equal(segmented_df, expected_df)
+
+    def test_handles_dataframe_with_duplicate_customer_id_entries(self):
+        """Test that the method correctly handles a DataFrame with duplicate customer_id entries."""
+        df = pd.DataFrame({"customer_id": [1, 2, 3, 1, 2, 3], "total_price": [100, 200, 300, 150, 250, 350]})
+
+        my_seg = ThresholdSegmentation(
+            df=df,
+            value_col="total_price",
+            agg_func="sum",
+            thresholds=[0.5, 0.8, 1],
+            segments={"L": "Light", "M": "Medium", "H": "Heavy"},
+            zero_value_customers="include_with_light",
+        )
+
+        result_df = my_seg.add_segment(df)
+        assert len(result_df) == len(df)
+
+    def test_correctly_maps_segment_names_to_segment_ids_with_fixed_thresholds(self):
+        """Test that the method correctly maps segment names to segment IDs with fixed thresholds."""
+        # Setup
+        df = pd.DataFrame({"customer_id": [1, 2, 3, 4, 5], "total_price": [100, 200, 300, 400, 500]})
+        value_col = "total_price"
+        agg_func = "sum"
+        thresholds = [0.33, 0.66, 1]
+        segments = {1: "Low", 2: "Medium", 3: "High"}
+        zero_value_customers = "separate_segment"
+
+        my_seg = ThresholdSegmentation(
+            df=df,
+            value_col=value_col,
+            agg_func=agg_func,
+            thresholds=thresholds,
+            segments=segments,
+            zero_value_customers=zero_value_customers,
+        )
+
+        assert len(my_seg.df[["segment_id", "segment_name"]].drop_duplicates()) == len(segments)
+        assert my_seg.df.set_index("segment_id")["segment_name"].to_dict() == segments
+
+    def test_thresholds_not_unique(self):
+        """Test that the method raises an error when the thresholds are not unique."""
+        df = pd.DataFrame({"customer_id": [1, 2, 3, 4, 5], "total_price": [100, 200, 300, 400, 500]})
+        thresholds = [0.5, 0.5, 0.8, 1]
+        segments = {1: "Low", 2: "Medium", 3: "High"}
+
+        with pytest.raises(ValueError):
+            ThresholdSegmentation(df, thresholds, segments)
+
+    def test_thresholds_too_few_segments(self):
+        """Test that the method raises an error when there are too few/many segments for the number of thresholds."""
+        df = pd.DataFrame({"customer_id": [1, 2, 3, 4, 5], "total_price": [100, 200, 300, 400, 500]})
+        thresholds = [0.4, 0.6, 0.8, 1]
+        segments = {1: "Low", 3: "High"}
+
+        with pytest.raises(ValueError):
+            ThresholdSegmentation(df, thresholds, segments)
+
+        segments = {1: "Low", 2: "Medium", 3: "High"}
+
+        with pytest.raises(ValueError):
+            ThresholdSegmentation(df, thresholds, segments)
+
+    def test_thresholds_too_too_few_thresholds(self):
+        """Test that the method raises an error when there are too few/many thresholds for the number of segments."""
+        df = pd.DataFrame({"customer_id": [1, 2, 3, 4, 5], "total_price": [100, 200, 300, 400, 500]})
+        thresholds = [0.4, 1]
+        segments = {1: "Low", 2: "Medium", 3: "High"}
+
+        with pytest.raises(ValueError):
+            ThresholdSegmentation(df, thresholds, segments)
+
+        thresholds = [0.2, 0.5, 0.6, 0.8, 1]
+
+        with pytest.raises(ValueError):
+            ThresholdSegmentation(df, thresholds, segments)
+
+
 class TestSegTransactionStats:
     """Tests for the SegTransactionStats class."""