feat: add input validation and tests in HMLSegmentation

mvanwyk · mvanwyk · commit 79a08293e27f · 2024-07-05T19:33:43.000+02:00
diff --git a/pyretailscience/segmentation.py b/pyretailscience/segmentation.py
@@ -88,6 +88,9 @@ def __init__(
             ValueError: If the dataframe is missing the columns "customer_id" or `value_col`, or these columns contain
                 null values.
         """
+        if df.empty:
+            raise ValueError("Input DataFrame is empty")
+
         required_cols = ["customer_id", value_col]
         contract = CustomContract(
             df,
@@ -99,6 +102,11 @@ def __init__(
             msg = f"The dataframe requires the columns {required_cols} and they must be non-null"
             raise ValueError(msg)
 
+        hml_cuts = [0.500, 0.800, 1]
+        if len(df) < len(hml_cuts):
+            msg = f"There are {len(df)} customers, which is less than is less than the number of segment thresholds."
+            raise ValueError(msg)
+
         # Group by customer_id and calculate total_spend
         grouped_df = df.groupby("customer_id")[value_col].sum().to_frame(value_col)
 
@@ -114,7 +122,7 @@ def __init__(
         # Create a new column 'segment' based on the total_spend
         hml_df["segment_name"] = pd.qcut(
             hml_df[value_col],
-            q=[0, 0.500, 0.800, 1],
+            q=[0, *hml_cuts],
             labels=["Light", "Medium", "Heavy"],
         )
 
diff --git a/tests/test_segmentation.py b/tests/test_segmentation.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import pytest
 
-from pyretailscience.segmentation import SegTransactionStats
+from pyretailscience.segmentation import HMLSegmentation, SegTransactionStats
 
 
 class TestCalcSegStats:
@@ -99,3 +99,110 @@ def test_handles_empty_dataframe_with_errors(self):
 
         with pytest.raises(ValueError):
             SegTransactionStats(df, "segment_id")
+
+
+class TestHMLSegmentation:
+    """Tests for HMLSegmentation: input validation, zero-spend handling, and segment assignment."""
+
+    @pytest.fixture()
+    def base_df(self):
+        """Return a five-customer DataFrame with one row per customer, including one zero-spend customer (id 3)."""
+        return pd.DataFrame({"customer_id": [1, 2, 3, 4, 5], "total_price": [1000, 200, 0, 500, 300]})
+
+    def test_no_transactions(self):
+        """Test that an empty DataFrame (no transactions) raises a ValueError."""
+        data = {"customer_id": [], "total_price": []}
+        df = pd.DataFrame(data)
+        with pytest.raises(ValueError):
+            HMLSegmentation(df)
+
+    # Zero-spend customers are dropped from the result when zero_value_customers is "exclude"
+    def test_handles_zero_spend_customers_are_excluded_in_result(self, base_df):
+        """Test that zero-spend customers are absent from the result when zero_value_customers is "exclude"."""
+        hml_segmentation = HMLSegmentation(base_df, zero_value_customers="exclude")
+        result_df = hml_segmentation.df
+
+        zero_spend_customer_id = 3
+
+        assert result_df.loc[1, "segment_name"] == "Heavy"
+        assert result_df.loc[1, "segment_id"] == "H"
+        assert result_df.loc[2, "segment_name"] == "Light"
+        assert result_df.loc[2, "segment_id"] == "L"
+        assert zero_spend_customer_id not in result_df.index
+        assert result_df.loc[4, "segment_name"] == "Medium"
+        assert result_df.loc[4, "segment_id"] == "M"
+        assert result_df.loc[5, "segment_name"] == "Light"
+        assert result_df.loc[5, "segment_id"] == "L"
+
+    # Zero-spend customers are labelled "Light" when zero_value_customers is "include_with_light"
+    def test_handles_zero_spend_customers_include_with_light(self, base_df):
+        """Test that zero-spend customers are segmented as Light when zero_value_customers is "include_with_light"."""
+        hml_segmentation = HMLSegmentation(base_df, zero_value_customers="include_with_light")
+        result_df = hml_segmentation.df
+
+        assert result_df.loc[1, "segment_name"] == "Heavy"
+        assert result_df.loc[1, "segment_id"] == "H"
+        assert result_df.loc[2, "segment_name"] == "Light"
+        assert result_df.loc[2, "segment_id"] == "L"
+        assert result_df.loc[3, "segment_name"] == "Light"
+        assert result_df.loc[3, "segment_id"] == "L"
+        assert result_df.loc[4, "segment_name"] == "Medium"
+        assert result_df.loc[4, "segment_id"] == "M"
+        assert result_df.loc[5, "segment_name"] == "Light"
+        assert result_df.loc[5, "segment_id"] == "L"
+
+    # Zero-spend customers get their own "Zero" segment when zero_value_customers is "separate_segment"
+    def test_handles_zero_spend_customers_separate_segment(self, base_df):
+        """Test that zero-spend customers get the Zero segment when zero_value_customers is "separate_segment"."""
+        hml_segmentation = HMLSegmentation(base_df, zero_value_customers="separate_segment")
+        result_df = hml_segmentation.df
+
+        assert result_df.loc[1, "segment_name"] == "Heavy"
+        assert result_df.loc[1, "segment_id"] == "H"
+        assert result_df.loc[2, "segment_name"] == "Light"
+        assert result_df.loc[2, "segment_id"] == "L"
+        assert result_df.loc[3, "segment_name"] == "Zero"
+        assert result_df.loc[3, "segment_id"] == "Z"
+        assert result_df.loc[4, "segment_name"] == "Medium"
+        assert result_df.loc[4, "segment_id"] == "M"
+        assert result_df.loc[5, "segment_name"] == "Light"
+        assert result_df.loc[5, "segment_id"] == "L"
+
+    # A DataFrame without the "customer_id" column must be rejected
+    def test_raises_value_error_if_required_columns_missing(self, base_df):
+        """Test that a ValueError is raised when the "customer_id" column is missing."""
+        with pytest.raises(ValueError):
+            HMLSegmentation(base_df.drop(columns=["customer_id"]))
+
+    # A DataFrame with only one customer cannot be segmented
+    def test_segments_customer_single(self):
+        """Test that a DataFrame with only one customer raises a ValueError instead of being segmented."""
+        data = {"customer_id": [1], "total_price": [0]}
+        df = pd.DataFrame(data)
+        with pytest.raises(ValueError):
+            HMLSegmentation(df)
+
+    # Validate that the input dataframe is not changed
+    def test_input_dataframe_not_changed(self, base_df):
+        """Test that constructing HMLSegmentation and reading its df leaves the input DataFrame unchanged."""
+        original_df = base_df.copy()
+
+        hml_segmentation = HMLSegmentation(base_df)
+        _ = hml_segmentation.df
+
+        assert original_df.equals(base_df)  # base_df must still equal the copy taken before construction
+
+    def test_alternate_value_col(self, base_df):
+        """Test that segments are assigned from the column named by value_col (here "quantity")."""
+        base_df = base_df.rename(columns={"total_price": "quantity"})
+        hml_segmentation = HMLSegmentation(base_df, value_col="quantity")
+        result_df = hml_segmentation.df
+
+        assert result_df.loc[1, "segment_name"] == "Heavy"
+        assert result_df.loc[1, "segment_id"] == "H"
+        assert result_df.loc[2, "segment_name"] == "Light"
+        assert result_df.loc[2, "segment_id"] == "L"
+        assert result_df.loc[4, "segment_name"] == "Medium"
+        assert result_df.loc[4, "segment_id"] == "M"
+        assert result_df.loc[5, "segment_name"] == "Light"
+        assert result_df.loc[5, "segment_id"] == "L"