From 314ea0a413d6fad538d67e5b41117feaa845917d Mon Sep 17 00:00:00 2001 From: MayurK Date: Mon, 17 Mar 2025 18:49:36 +0530 Subject: [PATCH] feat: Created RFM segmentation --- docs/analysis_modules.md | 49 +++++++++ pyretailscience/analysis/segmentation.py | 104 +++++++++++++++++++ tests/analysis/test_segmentation.py | 124 ++++++++++++++++++++++- 3 files changed, 276 insertions(+), 1 deletion(-) diff --git a/docs/analysis_modules.md b/docs/analysis_modules.md index 7153ced..5963616 100644 --- a/docs/analysis_modules.md +++ b/docs/analysis_modules.md @@ -791,6 +791,55 @@ segment_stats.df | Total | 4604.28 | 150 | 50 | 92.0856 | 30.6952 | 3 | 1 | + +### RFM Segmentation + +
+ +![RFM Segmentation Distribution](assets/images/analysis_modules/rfm_segmentation.svg){ align=right loading=lazy width="50%"} + +**Recency, Frequency, Monetary (RFM) segmentation** categorizes customers based on their purchasing behavior: + +- **Recency (R)**: How recently a customer made a purchase +- **Frequency (F)**: How often a customer makes purchases +- **Monetary (M)**: How much a customer spends + +Each metric is typically scored on a scale, and the combined RFM score helps businesses identify **loyal customers, at-risk customers, and high-value buyers**. + +RFM segmentation helps answer questions such as: + +- Who are your most valuable customers? +- Which customers are at risk of churn? +- Which customers should be targeted for re-engagement? + +
+ +Example: + +```python +import pandas as pd +from pyretailscience.analysis.segmentation import RFMSegmentation + +data = pd.DataFrame({ + "customer_id": [1, 1, 2, 2, 3, 3, 3], + "transaction_id": [101, 102, 201, 202, 301, 302, 303], + "transaction_date": ["2024-03-01", "2024-03-10", "2024-02-20", "2024-02-25", "2024-01-15", "2024-01-20", "2024-02-05"], + "unit_spend": [50, 75, 100, 150, 200, 250, 300] +}) + +data["transaction_date"] = pd.to_datetime(data["transaction_date"]) +current_date = "2024-07-01" + +rfm_segmenter = RFMSegmentation(df=data, current_date=current_date) +rfm_results = rfm_segmenter.df +``` + +| customer_id | recency_days | frequency | monetary | r_score | f_score | m_score | rfm_segment | +|-------------|--------------|-----------|----------|---------|---------|---------|-------------| +| 3 | 147 | 3 | 750 | 2 | 0 | 0 | 200 | +| 2 | 127 | 2 | 250 | 1 | 2 | 1 | 121 | +| 1 | 113 | 2 | 125 | 0 | 1 | 2 | 12 | + ### Purchases Per Customer
diff --git a/pyretailscience/analysis/segmentation.py b/pyretailscience/analysis/segmentation.py
index 62db474..1aa226c 100644
--- a/pyretailscience/analysis/segmentation.py
+++ b/pyretailscience/analysis/segmentation.py
@@ -1,5 +1,6 @@
 """This module contains classes for segmenting customers based on their spend and transaction statistics by segment."""
 
+import datetime
 from typing import Literal
 
 import ibis
@@ -402,3 +403,106 @@ def plot(
         gu.standard_tick_styles(ax)
 
         return ax
+
+
+class RFMSegmentation:
+    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.
+
+    Customers are scored on three dimensions:
+    - Recency (R): Days since the last transaction (lower is better).
+    - Frequency (F): Number of unique transactions (higher is better).
+    - Monetary (M): Total amount spent (higher is better).
+
+    Each metric is ranked into at most 10 bins (0-9) using NTILE(10) where,
+        - 0 is given to the best-ranked customers (most recent, most frequent, highest spend).
+        - 9 is given to the worst-ranked customers (only reached with 10 or more customers).
+    The RFM segment is the number R*100 + F*10 + M, so it has at most three digits (e.g. R=0, F=3, M=0 gives 30).
+    """
+
+    _df: pd.DataFrame | None = None
+
+    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | None = None) -> None:
+        """Initializes the RFM segmentation process.
+
+        Args:
+            df (pd.DataFrame | ibis.Table): A DataFrame or Ibis table containing transaction data.
+                Must include the following columns:
+                - customer_id
+                - transaction_date
+                - unit_spend
+                - transaction_id
+            current_date (Optional[str]): The reference date for calculating recency (format: "YYYY-MM-DD").
+                If not provided, the current UTC date will be used.
+
+        Raises:
+            ValueError: If the dataframe is missing required columns.
+            TypeError: If the input data is not a pandas DataFrame or an Ibis Table (checked before the columns).
+        """
+        if isinstance(df, pd.DataFrame):
+            df = ibis.memtable(df)
+        elif not isinstance(df, ibis.Table):
+            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")
+
+        cols = ColumnHelper()
+        required_cols = [
+            cols.customer_id,
+            cols.transaction_date,
+            cols.unit_spend,
+            cols.transaction_id,
+        ]
+
+        missing_cols = set(required_cols) - set(df.columns)
+        if missing_cols:
+            error_message = f"Missing required columns: {missing_cols}"
+            raise ValueError(error_message)
+        current_date = (
+            datetime.date.fromisoformat(current_date) if current_date else datetime.datetime.now(datetime.UTC).date()
+        )
+
+        self.table = self._compute_rfm(df, current_date)
+
+    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
+        """Computes the RFM metrics and segments customers accordingly.
+
+        Args:
+            df (ibis.Table): The transaction data table (already validated and converted by ``__init__``).
+            current_date (datetime.date): The reference date for calculating recency.
+
+        Returns:
+            ibis.Table: One row per customer with the RFM metrics, the three scores and ``rfm_segment``.
+        """
+        cols = ColumnHelper()
+        current_date_expr = ibis.literal(current_date)
+
+        customer_metrics = df.group_by(cols.customer_id).aggregate(
+            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
+            frequency=df[cols.transaction_id].nunique(),
+            monetary=df[cols.unit_spend].sum(),
+        )
+
+        window_recency = ibis.window(
+            order_by=[ibis.asc(customer_metrics.recency_days), ibis.asc(customer_metrics.customer_id)],
+        )
+        window_frequency = ibis.window(
+            order_by=[ibis.desc(customer_metrics.frequency), ibis.asc(customer_metrics.customer_id)],
+        )
+        window_monetary = ibis.window(
+            order_by=[ibis.desc(customer_metrics.monetary), ibis.asc(customer_metrics.customer_id)],
+        )
+
+        rfm_scores = customer_metrics.mutate(
+            r_score=(ibis.ntile(10).over(window_recency)),
+            f_score=(ibis.ntile(10).over(window_frequency)),
+            m_score=(ibis.ntile(10).over(window_monetary)),
+        )
+
+        rfm_segment = (rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score).name("rfm_segment")
+
+        return rfm_scores.mutate(rfm_segment=rfm_segment)
+
+    @property
+    def df(self) -> pd.DataFrame:
+        """Returns the executed RFM table as a dataframe indexed by customer id (cached after the first access)."""
+        if self._df is None:
+            self._df = self.table.execute().set_index(get_option("column.customer_id"))
+        return self._df
diff --git a/tests/analysis/test_segmentation.py b/tests/analysis/test_segmentation.py
index 9ee82a2..fb49845 100644
--- a/tests/analysis/test_segmentation.py
+++ b/tests/analysis/test_segmentation.py
@@ -4,7 +4,12 @@
 import pandas as pd
 import pytest
 
-from pyretailscience.analysis.segmentation import HMLSegmentation, SegTransactionStats, ThresholdSegmentation
+from pyretailscience.analysis.segmentation import (
+    HMLSegmentation,
+    RFMSegmentation,
+    SegTransactionStats,
+    ThresholdSegmentation,
+)
 from pyretailscience.options import ColumnHelper, get_option
 
 cols = ColumnHelper()
@@ -392,3 +397,120 @@ def test_alternate_value_col(self, base_df):
         assert result_df.loc[2, "segment_name"] == "Light"
         assert result_df.loc[4, "segment_name"] == "Medium"
         assert result_df.loc[5, "segment_name"] == "Light"
+
+
+class TestRFMSegmentation:
+    """Tests for the RFMSegmentation class."""
+
+    @pytest.fixture()
+    def base_df(self):
+        """Return a base DataFrame for testing."""
+        return pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3, 4, 5],
+                cols.transaction_id: [101, 102, 103, 104, 105],
+                cols.unit_spend: [100.0, 200.0, 150.0, 300.0, 250.0],
+                cols.transaction_date: [
+                    "2025-03-01",
+                    "2025-02-15",
+                    "2025-01-30",
+                    "2025-03-10",
+                    "2025-02-20",
+                ],
+            },
+        )
+
+    def test_correct_rfm_segmentation(self, base_df):
+        """Test that the RFM segmentation correctly calculates the RFM scores and segments."""
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
+        result_df = rfm_segmentation.df
+        expected_df = pd.DataFrame(
+            {
+                "customer_id": [1, 2, 3, 4, 5],
+                "rfm_segment": [104, 312, 423, 30, 241],
+            },
+        ).set_index("customer_id")
+
+        pd.testing.assert_frame_equal(
+            result_df[["rfm_segment"]].sort_index(),
+            expected_df[["rfm_segment"]].sort_index(),
+            check_like=True,
+        )
+
+    def test_handles_dataframe_with_missing_columns(self):
+        """Test that the method raises an error when required columns are missing."""
+        base_df = pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3],
+                cols.unit_spend: [100.0, 200.0, 150.0],
+                cols.transaction_id: [101, 102, 103],
+            },
+        )
+
+        with pytest.raises(ValueError):
+            RFMSegmentation(df=base_df, current_date="2025-03-17")
+
+    def test_single_customer(self):
+        """Test that the method correctly calculates RFM segmentation for a single customer."""
+        df_single_customer = pd.DataFrame(
+            {
+                cols.customer_id: [1],
+                cols.transaction_id: [101],
+                cols.unit_spend: [200.0],
+                cols.transaction_date: ["2025-03-01"],
+            },
+        )
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=df_single_customer, current_date=current_date)
+        result_df = rfm_segmentation.df
+        assert result_df.loc[1, "rfm_segment"] == 0
+
+    def test_multiple_transactions_per_customer(self):
+        """Test that the method correctly handles multiple transactions for the same customer."""
+        df_multiple_transactions = pd.DataFrame(
+            {
+                cols.customer_id: [1, 1, 1, 1, 1],
+                cols.transaction_id: [101, 102, 103, 104, 105],
+                cols.unit_spend: [120.0, 250.0, 180.0, 300.0, 220.0],
+                cols.transaction_date: [
+                    "2025-03-01",
+                    "2025-02-15",
+                    "2025-01-10",
+                    "2025-03-10",
+                    "2025-02-25",
+                ],
+            },
+        )
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=df_multiple_transactions, current_date=current_date)
+        result_df = rfm_segmentation.df
+
+        assert result_df.loc[1, "rfm_segment"] == 0
+
+    def test_calculates_rfm_correctly_for_all_customers(self, base_df):
+        """Test that RFM scores are calculated correctly for all customers."""
+        current_date = "2025-03-17"
+        expected_customer_count = 5
+        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
+        result_df = rfm_segmentation.df
+
+        assert len(result_df) == expected_customer_count
+        assert "rfm_segment" in result_df.columns
+
+    def test_rfm_segmentation_with_no_date(self, base_df):
+        """Test that the RFM segmentation correctly calculates the RFM scores and segments."""
+        rfm_segmentation = RFMSegmentation(df=base_df)
+        result_df = rfm_segmentation.df
+        expected_df = pd.DataFrame(
+            {
+                "customer_id": [1, 2, 3, 4, 5],
+                "rfm_segment": [104, 312, 423, 30, 241],
+            },
+        ).set_index("customer_id")
+
+        pd.testing.assert_frame_equal(
+            result_df[["rfm_segment"]].sort_index(),
+            expected_df[["rfm_segment"]].sort_index(),
+            check_like=True,
+        )