From 314ea0a413d6fad538d67e5b41117feaa845917d Mon Sep 17 00:00:00 2001
From: MayurK <mayurk.mmt@gmail.com>
Date: Mon, 17 Mar 2025 18:49:36 +0530
Subject: [PATCH] feat: Created RFM segmentation

---
 docs/analysis_modules.md                 |  49 +++++++++
 pyretailscience/analysis/segmentation.py | 104 +++++++++++++++++++
 tests/analysis/test_segmentation.py      | 124 ++++++++++++++++++++++-
 3 files changed, 276 insertions(+), 1 deletion(-)
diff --git a/docs/analysis_modules.md b/docs/analysis_modules.md
index 7153ced..5963616 100644
--- a/docs/analysis_modules.md
+++ b/docs/analysis_modules.md
@@ -791,6 +791,55 @@ segment_stats.df
 | Total          | 4604.28  |            150 |          50 |              92.0856 |                30.6952  |                           3 |             1   |
 <!-- markdownlint-enable MD013 -->
 
+
+### RFM Segmentation
+
+<div class="clear" markdown>
+
+![RFM Segmentation Distribution](assets/images/analysis_modules/rfm_segmentation.svg){ align=right loading=lazy width="50%"}
+
+**Recency, Frequency, Monetary (RFM) segmentation** categorizes customers based on their purchasing behavior:
+
+- **Recency (R)**: How recently a customer made a purchase
+- **Frequency (F)**: How often a customer makes purchases
+- **Monetary (M)**: How much a customer spends
+
+Each metric is scored into bins from 0 to 9 (0 being the best), and the three scores are combined into a single RFM segment number (R*100 + F*10 + M) that helps businesses identify **loyal customers, at-risk customers, and high-value buyers**.
+
+RFM segmentation helps answer questions such as:
+
+- Who are your most valuable customers?
+- Which customers are at risk of churn?
+- Which customers should be targeted for re-engagement?
+
+</div>
+
+Example:
+
+```python
+import pandas as pd
+from pyretailscience.analysis.segmentation import RFMSegmentation
+
+data = pd.DataFrame({
+    "customer_id": [1, 1, 2, 2, 3, 3, 3],
+    "transaction_id": [101, 102, 201, 202, 301, 302, 303],
+    "transaction_date": ["2024-03-01", "2024-03-10", "2024-02-20", "2024-02-25", "2024-01-15", "2024-01-20", "2024-02-05"],
+    "unit_spend": [50, 75, 100, 150, 200, 250, 300]
+})
+
+data["transaction_date"] = pd.to_datetime(data["transaction_date"])
+current_date = "2024-07-01"
+
+rfm_segmenter = RFMSegmentation(df=data, current_date=current_date)
+rfm_results = rfm_segmenter.df
+```
+
+| customer_id | recency_days | frequency | monetary | r_score | f_score | m_score | rfm_segment |
+|-------------|--------------|-----------|----------|---------|---------|---------|-------------|
+| 3           | 147          | 3         | 750      | 2       | 0       | 0       | 200         |
+| 2           | 127          | 2         | 250      | 1       | 2       | 1       | 121         |
+| 1           | 113          | 2         | 125      | 0       | 1       | 2       | 12          |
+
 ### Purchases Per Customer
 
 <div class="clear" markdown>
diff --git a/pyretailscience/analysis/segmentation.py b/pyretailscience/analysis/segmentation.py
index 62db474..1aa226c 100644
--- a/pyretailscience/analysis/segmentation.py
+++ b/pyretailscience/analysis/segmentation.py
@@ -1,5 +1,6 @@
 """This module contains classes for segmenting customers based on their spend and transaction statistics by segment."""
 
+import datetime
 from typing import Literal
 
 import ibis
@@ -402,3 +403,106 @@ def plot(
         gu.standard_tick_styles(ax)
 
         return ax
+
+
+class RFMSegmentation:
+    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.
+
+    Customers are scored on three dimensions:
+    - Recency (R): Days since the last transaction (lower is better).
+    - Frequency (F): Number of unique transactions (higher is better).
+    - Monetary (M): Total amount spent (higher is better).
+
+    Each metric is ranked into 10 bins (0-9) using NTILE(10) where,
+    - 9 represents the best score (top 10% of customers).
+    - 0 represents the lowest score (bottom 10% of customers).
+    The RFM segment is a 3-digit number (R*100 + F*10 + M), representing customer value.
+    """
+
+    _df: pd.DataFrame | None = None
+
+    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | None = None) -> None:
+        """Initializes the RFM segmentation process.
+
+        Args:
+            df (pd.DataFrame | ibis.Table): A DataFrame or Ibis table containing transaction data.
+                Must include the following columns:
+                - customer_id
+                - transaction_date
+                - unit_spend
+                - transaction_id
+            current_date (Optional[str]): The reference date for calculating recency (format: "YYYY-MM-DD").
+                If not provided, the current system date will be used.
+
+        Raises:
+            ValueError: If the dataframe is missing required columns.
+            TypeError: If the input data is not a pandas DataFrame or an Ibis Table.
+        """
+        cols = ColumnHelper()
+        required_cols = [
+            cols.customer_id,
+            cols.transaction_date,
+            cols.unit_spend,
+            cols.transaction_id,
+        ]
+
+        missing_cols = set(required_cols) - set(df.columns)
+        if missing_cols:
+            error_message = f"Missing required columns: {missing_cols}"
+            raise ValueError(error_message)
+        current_date = (
+            datetime.date.fromisoformat(current_date) if current_date else datetime.datetime.now(datetime.UTC).date()
+        )
+
+        self.table = self._compute_rfm(df, current_date)
+
+    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
+        """Computes the RFM metrics and segments customers accordingly.
+
+        Args:
+            df (ibis.Table): The transaction data table.
+            current_date (datetime.date): The reference date for calculating recency.
+
+        Returns:
+            ibis.Table: A table with RFM scores and segment values.
+        """
+        if isinstance(df, pd.DataFrame):
+            df = ibis.memtable(df)
+        elif not isinstance(df, ibis.Table):
+            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")
+
+        cols = ColumnHelper()
+        current_date_expr = ibis.literal(current_date)
+
+        customer_metrics = df.group_by(cols.customer_id).aggregate(
+            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
+            frequency=df[cols.transaction_id].nunique(),
+            monetary=df[cols.unit_spend].sum(),
+        )
+
+        window_recency = ibis.window(
+            order_by=[ibis.asc(customer_metrics.recency_days), ibis.asc(customer_metrics.customer_id)],
+        )
+        window_frequency = ibis.window(
+            order_by=[ibis.desc(customer_metrics.frequency), ibis.asc(customer_metrics.customer_id)],
+        )
+        window_monetary = ibis.window(
+            order_by=[ibis.desc(customer_metrics.monetary), ibis.asc(customer_metrics.customer_id)],
+        )
+
+        rfm_scores = customer_metrics.mutate(
+            r_score=(ibis.ntile(10).over(window_recency)),
+            f_score=(ibis.ntile(10).over(window_frequency)),
+            m_score=(ibis.ntile(10).over(window_monetary)),
+        )
+
+        rfm_segment = (rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score).name("rfm_segment")
+
+        return rfm_scores.mutate(rfm_segment=rfm_segment)
+
+    @property
+    def df(self) -> pd.DataFrame:
+        """Returns the dataframe with the segment names."""
+        if self._df is None:
+            self._df = self.table.execute().set_index(get_option("column.customer_id"))
+        return self._df
diff --git a/tests/analysis/test_segmentation.py b/tests/analysis/test_segmentation.py
index 9ee82a2..fb49845 100644
--- a/tests/analysis/test_segmentation.py
+++ b/tests/analysis/test_segmentation.py
@@ -4,7 +4,12 @@
 import pandas as pd
 import pytest
 
-from pyretailscience.analysis.segmentation import HMLSegmentation, SegTransactionStats, ThresholdSegmentation
+from pyretailscience.analysis.segmentation import (
+    HMLSegmentation,
+    RFMSegmentation,
+    SegTransactionStats,
+    ThresholdSegmentation,
+)
 from pyretailscience.options import ColumnHelper, get_option
 
 cols = ColumnHelper()
@@ -392,3 +397,120 @@ def test_alternate_value_col(self, base_df):
         assert result_df.loc[2, "segment_name"] == "Light"
         assert result_df.loc[4, "segment_name"] == "Medium"
         assert result_df.loc[5, "segment_name"] == "Light"
+
+
+class TestRFMSegmentation:
+    """Tests for the RFMSegmentation class."""
+
+    @pytest.fixture()
+    def base_df(self):
+        """Return a base DataFrame for testing."""
+        return pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3, 4, 5],
+                cols.transaction_id: [101, 102, 103, 104, 105],
+                cols.unit_spend: [100.0, 200.0, 150.0, 300.0, 250.0],
+                cols.transaction_date: [
+                    "2025-03-01",
+                    "2025-02-15",
+                    "2025-01-30",
+                    "2025-03-10",
+                    "2025-02-20",
+                ],
+            },
+        )
+
+    def test_correct_rfm_segmentation(self, base_df):
+        """Test that the RFM segmentation correctly calculates the RFM scores and segments."""
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
+        result_df = rfm_segmentation.df
+        expected_df = pd.DataFrame(
+            {
+                "customer_id": [1, 2, 3, 4, 5],
+                "rfm_segment": [104, 312, 423, 30, 241],
+            },
+        ).set_index("customer_id")
+
+        pd.testing.assert_frame_equal(
+            result_df[["rfm_segment"]].sort_index(),
+            expected_df[["rfm_segment"]].sort_index(),
+            check_like=True,
+        )
+
+    def test_handles_dataframe_with_missing_columns(self):
+        """Test that the method raises an error when required columns are missing."""
+        base_df = pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3],
+                cols.unit_spend: [100.0, 200.0, 150.0],
+                cols.transaction_id: [101, 102, 103],
+            },
+        )
+
+        with pytest.raises(ValueError):
+            RFMSegmentation(df=base_df, current_date="2025-03-17")
+
+    def test_single_customer(self):
+        """Test that the method correctly calculates RFM segmentation for a single customer."""
+        df_single_customer = pd.DataFrame(
+            {
+                cols.customer_id: [1],
+                cols.transaction_id: [101],
+                cols.unit_spend: [200.0],
+                cols.transaction_date: ["2025-03-01"],
+            },
+        )
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=df_single_customer, current_date=current_date)
+        result_df = rfm_segmentation.df
+        assert result_df.loc[1, "rfm_segment"] == 0
+
+    def test_multiple_transactions_per_customer(self):
+        """Test that the method correctly handles multiple transactions for the same customer."""
+        df_multiple_transactions = pd.DataFrame(
+            {
+                cols.customer_id: [1, 1, 1, 1, 1],
+                cols.transaction_id: [101, 102, 103, 104, 105],
+                cols.unit_spend: [120.0, 250.0, 180.0, 300.0, 220.0],
+                cols.transaction_date: [
+                    "2025-03-01",
+                    "2025-02-15",
+                    "2025-01-10",
+                    "2025-03-10",
+                    "2025-02-25",
+                ],
+            },
+        )
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=df_multiple_transactions, current_date=current_date)
+        result_df = rfm_segmentation.df
+
+        assert result_df.loc[1, "rfm_segment"] == 0
+
+    def test_calculates_rfm_correctly_for_all_customers(self, base_df):
+        """Test that RFM scores are calculated correctly for all customers."""
+        current_date = "2025-03-17"
+        expected_customer_count = 5
+        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
+        result_df = rfm_segmentation.df
+
+        assert len(result_df) == expected_customer_count
+        assert "rfm_segment" in result_df.columns
+
+    def test_rfm_segmentation_with_no_date(self, base_df):
+        """Test that the RFM segmentation correctly calculates the RFM scores and segments."""
+        rfm_segmentation = RFMSegmentation(df=base_df)
+        result_df = rfm_segmentation.df
+        expected_df = pd.DataFrame(
+            {
+                "customer_id": [1, 2, 3, 4, 5],
+                "rfm_segment": [104, 312, 423, 30, 241],
+            },
+        ).set_index("customer_id")
+
+        pd.testing.assert_frame_equal(
+            result_df[["rfm_segment"]].sort_index(),
+            expected_df[["rfm_segment"]].sort_index(),
+            check_like=True,
+        )