-
Notifications
You must be signed in to change notification settings - Fork 1
Split Segmentation #154
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Split Segmentation #154
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Base Segmentation | ||
|
||
::: pyretailscience.segmentation.base |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# HML Segmentation | ||
|
||
::: pyretailscience.segmentation.hml |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# RFM Segmentation | ||
|
||
::: pyretailscience.segmentation.rfm |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# SegTransactionStats Segmentation | ||
|
||
::: pyretailscience.segmentation.segstats |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Threshold Segmentation | ||
|
||
::: pyretailscience.segmentation.threshold |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,7 +26,12 @@ nav: | |
- Haversine Distance: api/analysis/haversine.md | ||
- Product Association: api/analysis/product_association.md | ||
- Revenue Tree: api/analysis/revenue_tree.md | ||
- Segmentation: api/analysis/segmentation.md | ||
- Segmentation: | ||
- Base Segmentation: api/segmentation/base.md | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does this file exist? I don't see it in the PR. |
||
- HML Segmentation: api/segmentation/hml.md | ||
- RFM Segmentation: api/segmentation/rfm.md | ||
- SegTransactionStats Segmentation: api/segmentation/segstats.md | ||
- Threshold Segmentation: api/segmentation/threshold.md | ||
- Plots: | ||
- Area Plot: api/plots/area.md | ||
- Bar Plot: api/plots/bar.md | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
"""This module provides a base class for segmenting customers based on their spend and transaction statistics.""" | ||
|
||
import pandas as pd | ||
|
||
from pyretailscience.options import get_option | ||
|
||
|
||
class BaseSegmentation:
    """Common behaviour shared by the customer segmentation classes.

    The class itself does not define `self.df`; `add_segment` expects the concrete segmentation to expose a
    `df` attribute that is indexed by customer id and contains a `segment_name` column.
    """

    def add_segment(self, df: pd.DataFrame) -> pd.DataFrame:
        """Attach each customer's segment name to the rows of `df`.

        Args:
            df (pd.DataFrame): The dataframe to annotate. It must contain the column configured under the
                `column.customer_id` option.

        Returns:
            pd.DataFrame: A new dataframe equal to `df` plus a `segment_name` column. Because the join is a left
                join, customers without a segment keep their rows and receive a missing value.

        Raises:
            ValueError: If the join changes the number of rows.
        """
        expected_rows = len(df)
        segment_names = self.df["segment_name"]
        customer_col = get_option("column.customer_id")

        # Left join against the index of the segment lookup so every input row is kept.
        merged = df.merge(segment_names, how="left", left_on=customer_col, right_index=True)

        # A differing row count means the lookup matched some rows more than once.
        if len(merged) != expected_rows:
            raise ValueError("The number of rows before and after the merge do not match. This should not happen.")

        return merged
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
"""This module provides the `HMLSegmentation` class for categorizing customers into spend-based segments. | ||
|
||
HMLSegmentation extends `ThresholdSegmentation` and classifies customers into Heavy, Medium, Light, | ||
and optionally Zero spenders based on the Pareto principle (80/20 rule). It is commonly used in retail | ||
to analyze customer spending behavior and optimize marketing strategies. | ||
""" | ||
|
||
from typing import Literal | ||
|
||
import ibis | ||
import pandas as pd | ||
|
||
from pyretailscience.segmentation.threshold import ThresholdSegmentation | ||
|
||
|
||
class HMLSegmentation(ThresholdSegmentation):
    """Heavy / Medium / Light (and optionally Zero) spend segmentation built on `ThresholdSegmentation`."""

    def __init__(
        self,
        df: pd.DataFrame | ibis.Table,
        value_col: str | None = None,
        agg_func: str = "sum",
        zero_value_customers: Literal["separate_segment", "exclude", "include_with_light"] = "separate_segment",
    ) -> None:
        """Configure a `ThresholdSegmentation` with the standard Heavy/Medium/Light cut points.

        The cut points follow the industry-standard definition derived from the Pareto distribution, commonly
        known as the 80/20 rule: Light is the bottom 50% of customers, Medium the next 30% and Heavy the top 20%.
        The segmentation is typically applied to spend, transaction volume or quantities purchased.

        Args:
            df (pd.DataFrame | ibis.Table): The transaction data. It must contain a customer_id column.
            value_col (str, optional): The column to segment on. Defaults to get_option("column.unit_spend").
            agg_func (str, optional): The aggregation applied per customer_id. Defaults to "sum".
            zero_value_customers (Literal["separate_segment", "exclude", "include_with_light"], optional): How
                customers with zero spend are treated. Defaults to "separate_segment".
        """
        # Thresholds and segment names are paired by position and handed unchanged to the parent class.
        super().__init__(
            df=df,
            value_col=value_col,
            agg_func=agg_func,
            thresholds=[0.500, 0.800, 1],
            segments=["Light", "Medium", "Heavy"],
            zero_value_customers=zero_value_customers,
        )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
"""Customer Segmentation Using RFM Analysis. | ||
|
||
This module implements RFM (Recency, Frequency, Monetary) segmentation, a widely used technique in customer analytics | ||
to categorize customers based on their purchasing behavior. | ||
|
||
RFM segmentation assigns scores to customers based on: | ||
1. Recency (R): How recently a customer made a purchase. | ||
2. Frequency (F): How often a customer makes purchases. | ||
3. Monetary (M): The total amount spent by a customer. | ||
|
||
### Benefits of RFM Segmentation: | ||
- **Customer Value Analysis**: Identifies high-value customers who contribute the most revenue. | ||
- **Personalized Marketing**: Enables targeted campaigns based on customer purchasing behavior. | ||
- **Customer Retention Strategies**: Helps recognize at-risk customers and develop engagement strategies. | ||
- **Sales Forecasting**: Provides insights into future revenue trends based on past spending behavior. | ||
|
||
### Scoring Methodology: | ||
- Each metric (R, F, M) is divided into 10 bins (0-9) using the NTILE(10) function. | ||
- A higher score indicates a better customer (e.g., lower recency, higher frequency, and monetary value). | ||
- The final RFM segment is computed as `R*100 + F*10 + M`, providing a unique customer classification. | ||
|
||
This module leverages `pandas` and `ibis` for efficient data processing and integrates with retail analytics workflows | ||
to enhance customer insights and business decision-making. | ||
""" | ||
|
||
import datetime | ||
|
||
import ibis | ||
import pandas as pd | ||
|
||
from pyretailscience.options import ColumnHelper, get_option | ||
|
||
|
||
class RFMSegmentation:
    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.

    Customers are scored on three dimensions:
    - Recency (R): Days since the last transaction (lower is better).
    - Frequency (F): Number of unique transactions (higher is better).
    - Monetary (M): Total amount spent (higher is better).

    Each metric is ranked into 10 bins (0-9) using NTILE(10) where,
    - 9 represents the best score (top 10% of customers).
    - 0 represents the lowest score (bottom 10% of customers).
    The RFM segment is a 3-digit number (R*100 + F*10 + M), representing customer value.
    """

    # Cache for the executed result; populated on first access of the `df` property.
    _df: pd.DataFrame | None = None

    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | datetime.date | None = None) -> None:
        """Initializes the RFM segmentation process.

        Args:
            df (pd.DataFrame | ibis.Table): A DataFrame or Ibis table containing transaction data.
                Must include the following columns:
                - customer_id
                - transaction_date
                - unit_spend
                - transaction_id
            current_date (str | datetime.date | None): The reference date for calculating recency.
                Can be a string (format: "YYYY-MM-DD"), a date object, or None (defaults to the current UTC date).
                A `datetime.datetime` is accepted and reduced to its date part.

        Raises:
            ValueError: If the dataframe is missing required columns, or `current_date` is a string that is not a
                valid ISO date.
            TypeError: If the input data is not a pandas DataFrame or an Ibis Table, or `current_date` has an
                unsupported type.
        """
        cols = ColumnHelper()
        required_cols = [
            cols.customer_id,
            cols.transaction_date,
            cols.unit_spend,
            cols.transaction_id,
        ]
        if isinstance(df, pd.DataFrame):
            df = ibis.memtable(df)
        elif not isinstance(df, ibis.Table):
            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")

        missing_cols = set(required_cols) - set(df.columns)
        if missing_cols:
            error_message = f"Missing required columns: {missing_cols}"
            raise ValueError(error_message)

        if isinstance(current_date, str):
            current_date = datetime.date.fromisoformat(current_date)
        elif current_date is None:
            # `datetime.timezone.utc` instead of `datetime.UTC`: the alias only exists on Python 3.11+.
            current_date = datetime.datetime.now(datetime.timezone.utc).date()
        elif isinstance(current_date, datetime.datetime):
            # datetime is a subclass of date; keep only the date part so recency is a whole number of days.
            current_date = current_date.date()
        elif not isinstance(current_date, datetime.date):
            raise TypeError("current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None")

        self.table = self._compute_rfm(df, current_date)

    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
        """Computes the RFM metrics and segments customers accordingly.

        Args:
            df (ibis.Table): The transaction data table.
            current_date (datetime.date): The reference date for calculating recency.

        Returns:
            ibis.Table: One row per customer with `recency_days`, `frequency`, `monetary`, the three decile
                scores (`r_score`, `f_score`, `m_score`) and the combined `rfm_segment` and `fm_segment` values.
        """
        cols = ColumnHelper()
        current_date_expr = ibis.literal(current_date)

        customer_metrics = df.group_by(cols.customer_id).aggregate(
            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
            frequency=df[cols.transaction_id].nunique(),
            monetary=df[cols.unit_spend].sum(),
        )

        # Look the id column up by its configured name; a hard-coded `.customer_id` attribute breaks as soon as
        # the `column.customer_id` option is changed. The id is only a deterministic tie-breaker.
        customer_id = customer_metrics[cols.customer_id]

        # Recency is ordered descending so that the most recent customers (fewest days) land in the highest
        # bucket, matching frequency and monetary where a higher score means a better customer.
        window_recency = ibis.window(
            order_by=[ibis.desc(customer_metrics.recency_days), ibis.asc(customer_id)],
        )
        window_frequency = ibis.window(
            order_by=[ibis.asc(customer_metrics.frequency), ibis.asc(customer_id)],
        )
        window_monetary = ibis.window(
            order_by=[ibis.asc(customer_metrics.monetary), ibis.asc(customer_id)],
        )

        rfm_scores = customer_metrics.mutate(
            r_score=(ibis.ntile(10).over(window_recency)),
            f_score=(ibis.ntile(10).over(window_frequency)),
            m_score=(ibis.ntile(10).over(window_monetary)),
        )

        return rfm_scores.mutate(
            rfm_segment=(rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score),
            fm_segment=(rfm_scores.f_score * 10 + rfm_scores.m_score),
        )

    @property
    def df(self) -> pd.DataFrame:
        """Returns the RFM metrics and scores as a pandas DataFrame indexed by customer id.

        The Ibis expression is executed on first access and the result is cached on the instance.
        """
        if self._df is None:
            self._df = self.table.execute().set_index(get_option("column.customer_id"))
        return self._df
Uh oh!
There was an error while loading. Please reload this page.