Skip to content

RFM Segmentation #140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions docs/analysis_modules.md
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,55 @@ segment_stats.df
| Total | 4604.28 | 150 | 50 | 92.0856 | 30.6952 | 3 | 1 |
<!-- markdownlint-enable MD013 -->


### RFM Segmentation

<div class="clear" markdown>

![RFM Segmentation Distribution](assets/images/analysis_modules/rfm_segmentation.svg){ align=right loading=lazy width="50%"}

**Recency, Frequency, Monetary (RFM) segmentation** categorizes customers based on their purchasing behavior:

- **Recency (R)**: How recently a customer made a purchase
- **Frequency (F)**: How often a customer makes purchases
- **Monetary (M)**: How much a customer spends

Each metric is typically scored on a scale, and the combined RFM score helps businesses identify **loyal customers, at-risk customers, and high-value buyers**.

RFM segmentation helps answer questions such as:

- Who are your most valuable customers?
- Which customers are at risk of churn?
- Which customers should be targeted for re-engagement?

</div>

Example:

```python
import pandas as pd
from pyretailscience.analysis.segmentation import RFMSegmentation

# Toy transaction log: one row per transaction for three customers.
# The four columns below are the ones RFMSegmentation validates as required
# (assuming the default column-name options are in effect).
data = pd.DataFrame({
"customer_id": [1, 1, 2, 2, 3, 3, 3],
"transaction_id": [101, 102, 201, 202, 301, 302, 303],
"transaction_date": ["2024-03-01", "2024-03-10", "2024-02-20", "2024-02-25", "2024-01-15", "2024-01-20", "2024-02-05"],
"unit_spend": [50, 75, 100, 150, 200, 250, 300]
})

# Parse the ISO date strings into pandas datetimes before segmenting.
data["transaction_date"] = pd.to_datetime(data["transaction_date"])
# Reference date that recency (days since last purchase) is measured against.
current_date = "2024-07-01"

rfm_segmenter = RFMSegmentation(df=data, current_date=current_date)
# `.df` executes the computation and returns one row per customer.
rfm_results = rfm_segmenter.df
```

| customer_id | recency_days | frequency | monetary | r_score | f_score | m_score | rfm_segment | fm_segment |
|-------------|--------------|-----------|----------|---------|---------|---------|-------------|------------|
| 1 | 113 | 2 | 125 | 0 | 0 | 0 | 0 | 0 |
| 2 | 127 | 2 | 250 | 1 | 1 | 1 | 111 | 11 |
| 3 | 147 | 3 | 750 | 2 | 2 | 2 | 222 | 22 |

### Purchases Per Customer

<div class="clear" markdown>
Expand Down
113 changes: 113 additions & 0 deletions pyretailscience/analysis/segmentation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""This module contains classes for segmenting customers based on their spend and transaction statistics by segment."""

import datetime
from typing import Literal

import ibis
Expand Down Expand Up @@ -452,3 +453,115 @@ def plot(
gu.standard_tick_styles(ax)

return ax


class RFMSegmentation:
    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.

    Three metrics are computed per customer:
    - Recency (R): Days between the reference date and the customer's latest transaction.
    - Frequency (F): Number of unique transactions.
    - Monetary (M): Total amount spent.

    Each metric is bucketed independently with NTILE(10) over all customers, ordered ascending by the
    metric value with the customer id as a tie-breaker. Scores are zero-based (0-9), so:
    - f_score / m_score: 9 is the highest frequency / spend bucket, 0 the lowest.
    - r_score: 9 is the bucket with the MOST days since the last purchase, 0 the most recent.

    NOTE(review): because recency is ranked ascending like the other two metrics, the most recently
    active customers receive the lowest r_score. Confirm this direction is intended before relying on
    "higher is better" for the combined segment.

    The RFM segment is a 3-digit number (R*100 + F*10 + M); the FM segment is F*10 + M.
    """

    # Cache for the executed result; populated on first access of ``df``.
    _df: pd.DataFrame | None = None

    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | datetime.date | None = None) -> None:
        """Initializes the RFM segmentation process.

        Args:
            df (pd.DataFrame | ibis.Table): A DataFrame or Ibis table containing transaction data.
                Must include the following columns:
                - customer_id
                - transaction_date
                - unit_spend
                - transaction_id
            current_date (Optional[Union[str, datetime.date]]): The reference date for calculating recency.
                Can be a string (format: "YYYY-MM-DD"), a date object, or None (defaults to the current
                UTC date). A ``datetime.datetime`` is accepted and truncated to its date part.

        Raises:
            ValueError: If the dataframe is missing required columns, or if a string ``current_date`` is
                not a valid ISO date.
            TypeError: If the input data is not a pandas DataFrame or an Ibis Table, or if ``current_date``
                is of an unsupported type.
        """
        cols = ColumnHelper()
        required_cols = [
            cols.customer_id,
            cols.transaction_date,
            cols.unit_spend,
            cols.transaction_id,
        ]
        if isinstance(df, pd.DataFrame):
            df = ibis.memtable(df)
        elif not isinstance(df, ibis.Table):
            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")

        missing_cols = set(required_cols) - set(df.columns)
        if missing_cols:
            error_message = f"Missing required columns: {missing_cols}"
            raise ValueError(error_message)

        if isinstance(current_date, str):
            current_date = datetime.date.fromisoformat(current_date)
        elif current_date is None:
            # datetime.timezone.utc is the same object as datetime.UTC but also exists before Python 3.11.
            current_date = datetime.datetime.now(datetime.timezone.utc).date()
        elif isinstance(current_date, datetime.datetime):
            # datetime is a subclass of date, so it would pass the check below and reach ibis as a
            # timestamp literal; recency is defined in whole days, so keep only the date part.
            current_date = current_date.date()
        elif not isinstance(current_date, datetime.date):
            raise TypeError("current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None")

        self.table = self._compute_rfm(df, current_date)

    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
        """Computes the RFM metrics and segments customers accordingly.

        Args:
            df (ibis.Table): The transaction data table.
            current_date (datetime.date): The reference date for calculating recency.

        Returns:
            ibis.Table: One row per customer with recency_days, frequency, monetary, the three
                NTILE(10) scores (r_score, f_score, m_score) and the combined rfm_segment / fm_segment.
        """
        cols = ColumnHelper()
        current_date_expr = ibis.literal(current_date)

        customer_metrics = df.group_by(cols.customer_id).aggregate(
            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
            frequency=df[cols.transaction_id].nunique(),
            monetary=df[cols.unit_spend].sum(),
        )

        # Look the id column up by its configured name rather than a hard-coded attribute, so a
        # customised "column.customer_id" option keeps working. It is used purely as a tie-breaker
        # to make bucket assignment deterministic when metric values are equal.
        customer_key = customer_metrics[cols.customer_id]

        window_recency = ibis.window(
            order_by=[ibis.asc(customer_metrics.recency_days), ibis.asc(customer_key)],
        )
        window_frequency = ibis.window(
            order_by=[ibis.asc(customer_metrics.frequency), ibis.asc(customer_key)],
        )
        window_monetary = ibis.window(
            order_by=[ibis.asc(customer_metrics.monetary), ibis.asc(customer_key)],
        )

        rfm_scores = customer_metrics.mutate(
            r_score=(ibis.ntile(10).over(window_recency)),
            f_score=(ibis.ntile(10).over(window_frequency)),
            m_score=(ibis.ntile(10).over(window_monetary)),
        )

        return rfm_scores.mutate(
            rfm_segment=(rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score),
            fm_segment=(rfm_scores.f_score * 10 + rfm_scores.m_score),
        )

    @property
    def df(self) -> pd.DataFrame:
        """Returns the RFM metrics, scores and segments as a DataFrame indexed by customer id.

        The Ibis expression is executed on first access and the result is cached on the instance.
        """
        if self._df is None:
            self._df = self.table.execute().set_index(get_option("column.customer_id"))
        return self._df

    @property
    def ibis_table(self) -> ibis.Table:
        """Returns the computed (unexecuted) Ibis table with RFM segmentation."""
        return self.table
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Add unit tests for the ibis_table property.

Line 568 returns the ibis_table but the patch coverage warning suggests no direct test invokes or verifies this property. Adding a test ensures full coverage and confidence in its functionality.

🧰 Tools
🪛 GitHub Check: codecov/patch

[warning] 568-568: pyretailscience/analysis/segmentation.py#L568
Added line #L568 was not covered by tests

156 changes: 155 additions & 1 deletion tests/analysis/test_segmentation.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
"""Tests for the SegTransactionStats class."""

import ibis
import numpy as np
import pandas as pd
import pytest

from pyretailscience.analysis.segmentation import HMLSegmentation, SegTransactionStats, ThresholdSegmentation
from pyretailscience.analysis.segmentation import (
HMLSegmentation,
RFMSegmentation,
SegTransactionStats,
ThresholdSegmentation,
)
from pyretailscience.options import ColumnHelper, get_option

cols = ColumnHelper()
Expand Down Expand Up @@ -545,3 +551,151 @@ def test_alternate_value_col(self, base_df):
assert result_df.loc[2, "segment_name"] == "Light"
assert result_df.loc[4, "segment_name"] == "Medium"
assert result_df.loc[5, "segment_name"] == "Light"


class TestRFMSegmentation:
    """Tests for the RFMSegmentation class."""

    @pytest.fixture
    def base_df(self):
        """Return a base DataFrame with one transaction for each of five customers."""
        return pd.DataFrame(
            {
                cols.customer_id: [1, 2, 3, 4, 5],
                cols.transaction_id: [101, 102, 103, 104, 105],
                cols.unit_spend: [100.0, 200.0, 150.0, 300.0, 250.0],
                cols.transaction_date: [
                    "2025-03-01",
                    "2025-02-15",
                    "2025-01-30",
                    "2025-03-10",
                    "2025-02-20",
                ],
            },
        )

    @pytest.fixture
    def expected_df(self):
        """Returns the expected DataFrame for base_df, excluding the date-dependent recency_days column."""
        return pd.DataFrame(
            {
                "customer_id": [1, 2, 3, 4, 5],
                "frequency": [1, 1, 1, 1, 1],
                "monetary": [100.0, 200.0, 150.0, 300.0, 250.0],
                "r_score": [1, 3, 4, 0, 2],
                "f_score": [0, 1, 2, 3, 4],
                "m_score": [0, 2, 1, 4, 3],
                "rfm_segment": [100, 312, 421, 34, 243],
                "fm_segment": [0, 12, 21, 34, 43],
            },
        ).set_index("customer_id")

    def test_correct_rfm_segmentation(self, base_df, expected_df):
        """Test that the RFM segmentation correctly calculates the RFM scores and segments."""
        current_date = "2025-03-17"
        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
        result_df = rfm_segmentation.df
        expected_df["recency_days"] = [16, 30, 46, 7, 25]
        expected_df["recency_days"] = expected_df["recency_days"].astype(result_df["recency_days"].dtype)

        pd.testing.assert_frame_equal(
            result_df.sort_index(),
            expected_df.sort_index(),
            check_like=True,
        )

    def test_handles_dataframe_with_missing_columns(self):
        """Test that the method raises an error when required columns are missing."""
        base_df = pd.DataFrame(
            {
                cols.customer_id: [1, 2, 3],
                cols.unit_spend: [100.0, 200.0, 150.0],
                cols.transaction_id: [101, 102, 103],
            },
        )

        # Match on the message so an unrelated ValueError cannot satisfy the test.
        with pytest.raises(ValueError, match="Missing required columns"):
            RFMSegmentation(df=base_df, current_date="2025-03-17")

    def test_single_customer(self):
        """Test that the method correctly calculates RFM segmentation for a single customer."""
        df_single_customer = pd.DataFrame(
            {
                cols.customer_id: [1],
                cols.transaction_id: [101],
                cols.unit_spend: [200.0],
                cols.transaction_date: ["2025-03-01"],
            },
        )
        current_date = "2025-03-17"
        rfm_segmentation = RFMSegmentation(df=df_single_customer, current_date=current_date)
        result_df = rfm_segmentation.df
        assert result_df.loc[1, "rfm_segment"] == 0

    def test_multiple_transactions_per_customer(self):
        """Test that the method correctly handles multiple transactions for the same customer."""
        df_multiple_transactions = pd.DataFrame(
            {
                cols.customer_id: [1, 1, 1, 1, 1],
                cols.transaction_id: [101, 102, 103, 104, 105],
                cols.unit_spend: [120.0, 250.0, 180.0, 300.0, 220.0],
                cols.transaction_date: [
                    "2025-03-01",
                    "2025-02-15",
                    "2025-01-10",
                    "2025-03-10",
                    "2025-02-25",
                ],
            },
        )
        current_date = "2025-03-17"
        rfm_segmentation = RFMSegmentation(df=df_multiple_transactions, current_date=current_date)
        result_df = rfm_segmentation.df

        assert result_df.loc[1, "rfm_segment"] == 0

    def test_calculates_rfm_correctly_for_all_customers(self, base_df):
        """Test that one row with an rfm_segment is produced for every customer."""
        current_date = "2025-03-17"
        expected_customer_count = 5
        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
        result_df = rfm_segmentation.df

        assert len(result_df) == expected_customer_count
        assert "rfm_segment" in result_df.columns

    def test_rfm_segmentation_with_no_date(self, base_df, expected_df):
        """Test that omitting current_date measures recency against today's (UTC) date."""
        rfm_segmentation = RFMSegmentation(df=base_df)
        result_df = rfm_segmentation.df

        # Derive the expected recency from the date the test actually runs on; hard-coded day counts
        # would only be correct on a single calendar day. Scores are rank-based, so they do not move.
        today = pd.Timestamp.now(tz="UTC").date()
        expected_df["recency_days"] = [
            (today - pd.Timestamp(transaction_date).date()).days
            for transaction_date in base_df[cols.transaction_date]
        ]
        expected_df["recency_days"] = expected_df["recency_days"].astype(result_df["recency_days"].dtype)

        pd.testing.assert_frame_equal(
            result_df.sort_index(),
            expected_df.sort_index(),
            check_like=True,
        )

    def test_invalid_current_date_type(self, base_df):
        """Test that RFMSegmentation raises a TypeError when an invalid current_date is provided."""
        with pytest.raises(
            TypeError,
            match="current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None",
        ):
            RFMSegmentation(base_df, current_date=12345)

    def test_invalid_df_type(self):
        """Test that RFMSegmentation raises a TypeError when df is neither a DataFrame nor an Ibis Table."""
        invalid_df = "this is not a dataframe"

        with pytest.raises(TypeError, match="df must be either a pandas DataFrame or an Ibis Table"):
            RFMSegmentation(df=invalid_df, current_date="2025-03-17")

    def test_ibis_table_property(self, base_df):
        """Test that ibis_table property returns an Ibis Table."""
        segmentation = RFMSegmentation(df=base_df, current_date="2025-03-17")

        result = segmentation.ibis_table

        assert isinstance(result, ibis.Table), "Expected ibis.Table but got a different type"
1 change: 1 addition & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.