Skip to content

Commit 6a4058b

Browse files
committed
feat: split segmentations.py file
1 parent 91edbf4 commit 6a4058b

File tree

18 files changed

+1109
-1026
lines changed

18 files changed

+1109
-1026
lines changed

docs/analysis_modules.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -681,7 +681,7 @@ Example:
681681

682682
```python
683683
from pyretailscience.plots import bar
684-
from pyretailscience.analysis.segmentation import HMLSegmentation
684+
from pyretailscience.segmentation.hml import HMLSegmentation
685685

686686
seg = HMLSegmentation(df, zero_value_customers="include_with_light")
687687

@@ -724,7 +724,7 @@ Example:
724724

725725
```python
726726
from pyretailscience.plots import bar
727-
from pyretailscience.analysis.segmentation import ThresholdSegmentation
727+
from pyretailscience.segmentation.threshold import ThresholdSegmentation
728728

729729
# Create custom segmentation with quartiles
730730
# Define thresholds at 25%, 50%, 75%, and 100% (quartiles)
@@ -766,7 +766,8 @@ segmentation.
766766
Example:
767767

768768
```python
769-
from pyretailscience.analysis.segmentation import HMLSegmentation, SegTransactionStats
769+
from pyretailscience.segmentation.segstats import SegTransactionStats
770+
from pyretailscience.segmentation.hml import HMLSegmentation
770771

771772
seg = HMLSegmentation(df, zero_value_customers="include_with_light")
772773

@@ -818,7 +819,7 @@ Example:
818819

819820
```python
820821
import pandas as pd
821-
from pyretailscience.analysis.segmentation import RFMSegmentation
822+
from pyretailscience.segmentation.rfm import RFMSegmentation
822823

823824
data = pd.DataFrame({
824825
"customer_id": [1, 1, 2, 2, 3, 3, 3],

docs/api/segmentation/base.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Base Segmentation
2+
3+
::: pyretailscience.segmentation.base

docs/api/segmentation/hml.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# HML Segmentation
2+
3+
::: pyretailscience.segmentation.hml

docs/api/segmentation/rfm.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# RFM Segmentation
2+
3+
::: pyretailscience.segmentation.rfm

docs/api/segmentation/segstats.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# SegTransactionStats Segmentation
2+
3+
::: pyretailscience.segmentation.segstats

docs/api/segmentation/threshold.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Threshold Segmentation
2+
3+
::: pyretailscience.segmentation.threshold

mkdocs.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,12 @@ nav:
2626
- Haversine Distance: api/analysis/haversine.md
2727
- Product Association: api/analysis/product_association.md
2828
- Revenue Tree: api/analysis/revenue_tree.md
29-
- Segmentation: api/analysis/segmentation.md
29+
- Segmentation:
30+
- Base Segmentation: api/segmentation/base.md
31+
- HML Segmentation: api/segmentation/hml.md
32+
- RFM Segmentation: api/segmentation/rfm.md
33+
- SegTransactionStats Segmentation: api/segmentation/segstats.md
34+
- Threshold Segmentation: api/segmentation/threshold.md
3035
- Plots:
3136
- Area Plot: api/plots/area.md
3237
- Bar Plot: api/plots/bar.md

pyretailscience/segmentation/__init__.py

Whitespace-only changes.

pyretailscience/segmentation/base.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""This module provides a base class for segmenting customers based on their spend and transaction statistics."""
2+
3+
import pandas as pd
4+
5+
from pyretailscience.options import get_option
6+
7+
8+
class BaseSegmentation:
    """Shared behaviour for customer segmentation classes.

    The only thing this base class reads is ``self.df``: it selects the
    ``segment_name`` column from it and joins on its index, so subclasses are
    expected to expose a ``df`` attribute with that column, indexed by the
    customer id.
    """

    def add_segment(self, df: pd.DataFrame) -> pd.DataFrame:
        """Attach each customer's segment name to ``df``.

        A left join is performed between the customer id column of ``df``
        (its name is looked up via ``get_option("column.customer_id")``) and the
        index of ``self.df``; only the ``segment_name`` column is brought across.

        Args:
            df (pd.DataFrame): The dataframe to label. It must contain the configured customer id column.

        Returns:
            pd.DataFrame: A new dataframe equal to ``df`` plus a ``segment_name`` column.

        Raises:
            ValueError: If the join changed the number of rows.
        """
        expected_rows = len(df)
        labelled = df.merge(
            self.df["segment_name"],
            how="left",
            left_on=get_option("column.customer_id"),
            right_index=True,
        )

        # A left join can only change the row count when the right-hand index holds
        # duplicate keys, so a mismatch points at a malformed segmentation table.
        if len(labelled) != expected_rows:
            raise ValueError("The number of rows before and after the merge do not match. This should not happen.")

        return labelled

pyretailscience/segmentation/hml.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""This module provides the `HMLSegmentation` class for categorizing customers into spend-based segments.
2+
3+
HMLSegmentation extends `ThresholdSegmentation` and classifies customers into Heavy, Medium, Light,
4+
and optionally Zero spenders based on the Pareto principle (80/20 rule). It is commonly used in retail
5+
to analyze customer spending behavior and optimize marketing strategies.
6+
"""
7+
8+
from typing import Literal
9+
10+
import ibis
11+
import pandas as pd
12+
13+
from pyretailscience.segmentation.threshold import ThresholdSegmentation
14+
15+
16+
class HMLSegmentation(ThresholdSegmentation):
    """Heavy / Medium / Light (and optionally Zero) spend segmentation of customers."""

    def __init__(
        self,
        df: pd.DataFrame | ibis.Table,
        value_col: str | None = None,
        agg_func: str = "sum",
        zero_value_customers: Literal["separate_segment", "exclude", "include_with_light"] = "separate_segment",
    ) -> None:
        """Build the segmentation by delegating to ``ThresholdSegmentation`` with fixed cut points.

        The thresholds 0.5, 0.8 and 1 are paired with the labels Light, Medium and Heavy, i.e. the bottom 50% of
        customers are Light, the next 30% Medium and the top 20% Heavy. This split follows the Pareto distribution,
        commonly known as the 80/20 rule, and is typically applied to customer spend, transaction volume or
        quantities purchased.

        Args:
            df (pd.DataFrame | ibis.Table): The transaction data. It must contain a customer_id column.
            value_col (str, optional): The column to segment on. ``None`` is passed through to the parent class,
                which is documented as defaulting to get_option("column.unit_spend").
            agg_func (str, optional): The aggregation applied per customer. Defaults to "sum".
            zero_value_customers (Literal["separate_segment", "exclude", "include_with_light"], optional): How
                customers with zero spend are treated. Defaults to "separate_segment".
        """
        super().__init__(
            df=df,
            value_col=value_col,
            agg_func=agg_func,
            # Upper quantile bound of each segment, listed in the same order as the labels below.
            thresholds=[0.500, 0.800, 1],
            segments=["Light", "Medium", "Heavy"],
            zero_value_customers=zero_value_customers,
        )

pyretailscience/segmentation/rfm.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
"""Customer Segmentation Using RFM Analysis.
2+
3+
This module implements RFM (Recency, Frequency, Monetary) segmentation, a widely used technique in customer analytics
4+
to categorize customers based on their purchasing behavior.
5+
6+
RFM segmentation assigns scores to customers based on:
7+
1. Recency (R): How recently a customer made a purchase.
8+
2. Frequency (F): How often a customer makes purchases.
9+
3. Monetary (M): The total amount spent by a customer.
10+
11+
### Benefits of RFM Segmentation:
12+
- **Customer Value Analysis**: Identifies high-value customers who contribute the most revenue.
13+
- **Personalized Marketing**: Enables targeted campaigns based on customer purchasing behavior.
14+
- **Customer Retention Strategies**: Helps recognize at-risk customers and develop engagement strategies.
15+
- **Sales Forecasting**: Provides insights into future revenue trends based on past spending behavior.
16+
17+
### Scoring Methodology:
18+
- Each metric (R, F, M) is divided into 10 bins (0-9) using the NTILE(10) function.
19+
- A higher score indicates a better customer (e.g., lower recency, higher frequency, and monetary value).
20+
- The final RFM segment is computed as `R*100 + F*10 + M`, providing a unique customer classification.
21+
22+
This module leverages `pandas` and `ibis` for efficient data processing and integrates with retail analytics workflows
23+
to enhance customer insights and business decision-making.
24+
"""
25+
26+
import datetime
27+
28+
import ibis
29+
import pandas as pd
30+
31+
from pyretailscience.options import ColumnHelper, get_option
32+
33+
34+
class RFMSegmentation:
    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.

    Customers are scored on three dimensions:
    - Recency (R): Days between the reference date and the customer's last transaction.
    - Frequency (F): Number of unique transactions.
    - Monetary (M): Total amount spent.

    Each metric is ranked into 10 bins (0-9) using NTILE(10) over the customers ordered ascending by that metric
    (ties broken by customer id). The RFM segment is the number R*100 + F*10 + M and the FM segment is F*10 + M.

    NOTE(review): because recency_days is also ordered ascending, the most recent customers receive the *lowest*
    r_score, whereas for frequency and monetary a higher score means a higher value. Confirm this is intended.
    """

    # Cache for the executed result; populated on first access of ``df``.
    _df: pd.DataFrame | None = None

    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | datetime.date | None = None) -> None:
        """Initializes the RFM segmentation process.

        Args:
            df (pd.DataFrame | ibis.Table): A DataFrame or Ibis table containing transaction data.
                Must include the following columns:
                - customer_id
                - transaction_date
                - unit_spend
                - transaction_id
            current_date (str | datetime.date | None): The reference date for calculating recency.
                Can be a string (format: "YYYY-MM-DD"), a date object, or None (defaults to the current UTC date).
                A datetime.datetime is accepted and reduced to its date part.

        Raises:
            ValueError: If the dataframe is missing required columns.
            TypeError: If the input data is not a pandas DataFrame or an Ibis Table, or if current_date is of an
                unsupported type.
        """
        cols = ColumnHelper()
        required_cols = [
            cols.customer_id,
            cols.transaction_date,
            cols.unit_spend,
            cols.transaction_id,
        ]
        if isinstance(df, pd.DataFrame):
            df = ibis.memtable(df)
        elif not isinstance(df, ibis.Table):
            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")

        missing_cols = set(required_cols) - set(df.columns)
        if missing_cols:
            error_message = f"Missing required columns: {missing_cols}"
            raise ValueError(error_message)

        if isinstance(current_date, str):
            current_date = datetime.date.fromisoformat(current_date)
        elif current_date is None:
            current_date = datetime.datetime.now(datetime.UTC).date()
        elif isinstance(current_date, datetime.datetime):
            # datetime.datetime subclasses datetime.date, so it would pass the check below and become a
            # timestamp literal; reduce it to a plain date so the recency subtraction is date minus date.
            current_date = current_date.date()
        elif not isinstance(current_date, datetime.date):
            raise TypeError("current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None")

        self.table = self._compute_rfm(df, current_date)

    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
        """Computes the RFM metrics and segments customers accordingly.

        Args:
            df (ibis.Table): The transaction data table.
            current_date (datetime.date): The reference date for calculating recency.

        Returns:
            ibis.Table: One row per customer with recency_days, frequency, monetary, the r/f/m scores and the
                rfm_segment and fm_segment values.
        """
        cols = ColumnHelper()
        current_date_expr = ibis.literal(current_date)

        customer_metrics = df.group_by(cols.customer_id).aggregate(
            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
            frequency=df[cols.transaction_id].nunique(),
            monetary=df[cols.unit_spend].sum(),
        )

        # Look the id column up by its configured name: the aggregate is keyed by cols.customer_id, which is
        # not necessarily the literal "customer_id". It is used as a tie-breaker so equal metric values are
        # always binned in the same order.
        customer_id_order = ibis.asc(customer_metrics[cols.customer_id])

        window_recency = ibis.window(
            order_by=[ibis.asc(customer_metrics.recency_days), customer_id_order],
        )
        window_frequency = ibis.window(
            order_by=[ibis.asc(customer_metrics.frequency), customer_id_order],
        )
        window_monetary = ibis.window(
            order_by=[ibis.asc(customer_metrics.monetary), customer_id_order],
        )

        rfm_scores = customer_metrics.mutate(
            r_score=(ibis.ntile(10).over(window_recency)),
            f_score=(ibis.ntile(10).over(window_frequency)),
            m_score=(ibis.ntile(10).over(window_monetary)),
        )

        return rfm_scores.mutate(
            rfm_segment=(rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score),
            fm_segment=(rfm_scores.f_score * 10 + rfm_scores.m_score),
        )

    @property
    def df(self) -> pd.DataFrame:
        """Returns the executed RFM table as a pandas DataFrame indexed by the customer id column.

        The table is executed on first access and the result is cached on the instance.
        """
        if self._df is None:
            self._df = self.table.execute().set_index(get_option("column.customer_id"))
        return self._df

0 commit comments

Comments
 (0)