Data-Simply · mayurkmmt · Mar 26, 2025 · Mar 25, 2025 · murray-ds · Mar 26, 2025
diff --git a/docs/analysis_modules.md b/docs/analysis_modules.md
@@ -681,7 +681,7 @@ Example:
 
 ```python
 from pyretailscience.plots import bar
-from pyretailscience.analysis.segmentation import HMLSegmentation
+from pyretailscience.segmentation.hml import HMLSegmentation
 
 seg = HMLSegmentation(df, zero_value_customers="include_with_light")
 
@@ -724,7 +724,7 @@ Example:
 
 ```python
 from pyretailscience.plots import bar
-from pyretailscience.analysis.segmentation import ThresholdSegmentation
+from pyretailscience.segmentation.threshold import ThresholdSegmentation
 
 # Create custom segmentation with quartiles
 # Define thresholds at 25%, 50%, 75%, and 100% (quartiles)
@@ -766,7 +766,8 @@ segmentation.
 Example:
 
 ```python
-from pyretailscience.analysis.segmentation import HMLSegmentation, SegTransactionStats
+from pyretailscience.segmentation.segstats import SegTransactionStats
+from pyretailscience.segmentation.hml import HMLSegmentation
 
 seg = HMLSegmentation(df, zero_value_customers="include_with_light")
 
@@ -818,7 +819,7 @@ Example:
 
 ```python
 import pandas as pd
-from pyretailscience.analysis.segmentation import RFMSegmentation
+from pyretailscience.segmentation.rfm import RFMSegmentation
 
 data = pd.DataFrame({
     "customer_id": [1, 1, 2, 2, 3, 3, 3],

diff --git a/docs/api/segmentation/base.md b/docs/api/segmentation/base.md
@@ -0,0 +1,3 @@
+# Base Segmentation
+
+::: pyretailscience.segmentation.base
diff --git a/docs/api/segmentation/hml.md b/docs/api/segmentation/hml.md
@@ -0,0 +1,3 @@
+# HML Segmentation
+
+::: pyretailscience.segmentation.hml
diff --git a/docs/api/segmentation/rfm.md b/docs/api/segmentation/rfm.md
@@ -0,0 +1,3 @@
+# RFM Segmentation
+
+::: pyretailscience.segmentation.rfm
diff --git a/docs/api/segmentation/segstats.md b/docs/api/segmentation/segstats.md
@@ -0,0 +1,3 @@
+# SegTransactionStats Segmentation
+
+::: pyretailscience.segmentation.segstats
diff --git a/docs/api/segmentation/threshold.md b/docs/api/segmentation/threshold.md
@@ -0,0 +1,3 @@
+# Threshold Segmentation
+
+::: pyretailscience.segmentation.threshold
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -26,7 +26,12 @@ nav:
           - Haversine Distance: api/analysis/haversine.md
           - Product Association: api/analysis/product_association.md
           - Revenue Tree: api/analysis/revenue_tree.md
-          - Segmentation: api/analysis/segmentation.md
+      - Segmentation:
+          - Base Segmentation: api/segmentation/base.md
+          - HML Segmentation: api/segmentation/hml.md
+          - RFM Segmentation: api/segmentation/rfm.md
+          - SegTransactionStats Segmentation: api/segmentation/segstats.md
+          - Threshold Segmentation: api/segmentation/threshold.md
       - Plots:
           - Area Plot: api/plots/area.md
           - Bar Plot: api/plots/bar.md

diff --git a/pyretailscience/segmentation/__init__.py b/pyretailscience/segmentation/__init__.py
diff --git a/pyretailscience/segmentation/base.py b/pyretailscience/segmentation/base.py
@@ -0,0 +1,34 @@
+"""This module provides a base class for segmenting customers based on their spend and transaction statistics."""
+
+import pandas as pd
+
+from pyretailscience.options import get_option
+
+
+class BaseSegmentation:
+    """Shared behaviour for segmentation classes that expose a `df` with a `segment_name` column."""
+
+    def add_segment(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Left-joins this segmentation's `segment_name` onto the given dataframe.
+
+        The join matches the configured customer id column of `df` (`get_option("column.customer_id")`)
+        against the index of `self.df`.
+
+        Args:
+            df (pd.DataFrame): The dataframe to label. It must contain the configured customer id column.
+
+        Returns:
+            pd.DataFrame: The merged dataframe, i.e. `df` with the `segment_name` column added.
+
+        Raises:
+            ValueError: If the merge changes the number of rows.
+        """
+        segment_names = self.df["segment_name"]
+        customer_id_col = get_option("column.customer_id")
+        merged = df.merge(segment_names, how="left", left_on=customer_id_col, right_index=True)
+
+        # A left join only changes the row count when the lookup index holds duplicate keys, so treat it as a bug.
+        if len(merged) != len(df):
+            raise ValueError("The number of rows before and after the merge do not match. This should not happen.")
+
+        return merged
diff --git a/pyretailscience/segmentation/hml.py b/pyretailscience/segmentation/hml.py
@@ -0,0 +1,49 @@
+"""This module provides the `HMLSegmentation` class for categorizing customers into spend-based segments.
+
+HMLSegmentation extends `ThresholdSegmentation` and classifies customers into Heavy, Medium, Light,
+and optionally Zero spenders based on the Pareto principle (80/20 rule). It is commonly used in retail
+to analyze customer spending behavior and optimize marketing strategies.
+"""
+
+from typing import Literal
+
+import ibis
+import pandas as pd
+
+from pyretailscience.segmentation.threshold import ThresholdSegmentation
+
+
+class HMLSegmentation(ThresholdSegmentation):
+    """Heavy / Medium / Light (and optionally Zero) customer segmentation built on `ThresholdSegmentation`."""
+
+    def __init__(
+        self,
+        df: pd.DataFrame | ibis.Table,
+        value_col: str | None = None,
+        agg_func: str = "sum",
+        zero_value_customers: Literal["separate_segment", "exclude", "include_with_light"] = "separate_segment",
+    ) -> None:
+        """Builds the segmentation by delegating to `ThresholdSegmentation` with fixed HML cut-offs.
+
+        The cut-offs follow an industry standard definition based on the pareto distribution, commonly known as
+        the 80/20 rule: Heavy is the top 20%, Medium the next 30% and Light the bottom 50%. The segmentation is
+        typically used in retail on customer spend, transaction volume or quantities purchased.
+
+        Args:
+            df (pd.DataFrame | ibis.Table): The transaction data. It must contain a customer_id column.
+            value_col (str, optional): The column to segment on. Defaults to None, which is forwarded unchanged
+                to ThresholdSegmentation (expected to fall back to get_option("column.unit_spend") there).
+            agg_func (str, optional): The aggregation function to use when grouping by customer_id. Defaults to "sum".
+            zero_value_customers (Literal["separate_segment", "exclude", "include_with_light"], optional): How to
+                handle customers with zero spend. Defaults to "separate_segment".
+        """
+        super().__init__(
+            df=df,
+            value_col=value_col,
+            agg_func=agg_func,
+            # Presumably cumulative percentile bounds, paired by position with `segments`
+            # (0.5 -> Light, 0.8 -> Medium, 1 -> Heavy) - see ThresholdSegmentation for the exact semantics.
+            thresholds=[0.500, 0.800, 1],
+            segments=["Light", "Medium", "Heavy"],
+            zero_value_customers=zero_value_customers,
+        )
diff --git a/pyretailscience/segmentation/rfm.py b/pyretailscience/segmentation/rfm.py
@@ -0,0 +1,138 @@
+"""Customer Segmentation Using RFM Analysis.
+
+This module implements RFM (Recency, Frequency, Monetary) segmentation, a widely used technique in customer analytics
+to categorize customers based on their purchasing behavior.
+
+RFM segmentation assigns scores to customers based on:
+1. Recency (R): How recently a customer made a purchase.
+2. Frequency (F): How often a customer makes purchases.
+3. Monetary (M): The total amount spent by a customer.
+
+### Benefits of RFM Segmentation:
+- **Customer Value Analysis**: Identifies high-value customers who contribute the most revenue.
+- **Personalized Marketing**: Enables targeted campaigns based on customer purchasing behavior.
+- **Customer Retention Strategies**: Helps recognize at-risk customers and develop engagement strategies.
+- **Sales Forecasting**: Provides insights into future revenue trends based on past spending behavior.
+
+### Scoring Methodology:
+- Each metric (R, F, M) is divided into 10 bins (0-9) using the NTILE(10) function.
+- A higher score indicates a better customer (e.g., lower recency, higher frequency, and monetary value).
+- The final RFM segment is computed as `R*100 + F*10 + M`, providing a unique customer classification.
+
+This module leverages `pandas` and `ibis` for efficient data processing and integrates with retail analytics workflows
+to enhance customer insights and business decision-making.
+"""
+
+import datetime
+
+import ibis
+import pandas as pd
+
+from pyretailscience.options import ColumnHelper, get_option
+
+
+class RFMSegmentation:
+    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.
+
+    Customers are scored on three dimensions:
+    - Recency (R): Days between the reference date and the customer's latest transaction date.
+    - Frequency (F): Number of unique transactions.
+    - Monetary (M): Total amount spent.
+
+    Each metric is ranked into 10 bins (0-9) using NTILE(10) over the metric sorted in ascending order, with the
+    customer id as a tie-breaker. `rfm_segment` is the 3-digit number R*100 + F*10 + M and `fm_segment` is
+    F*10 + M.
+
+    NOTE(review): recency is binned in ascending order of days, so the most recent customers appear to receive
+    the lowest r_score, while the highest f_score / m_score go to the largest values - confirm this is intended.
+    """
+
+    _df: pd.DataFrame | None = None
+
+    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | datetime.date | None = None) -> None:
+        """Validates the inputs and builds the RFM table expression.
+
+        Args:
+            df (pd.DataFrame | ibis.Table): Transaction data. It must include the configured customer_id,
+                transaction_date, unit_spend and transaction_id columns (as named by `ColumnHelper`).
+            current_date (str | datetime.date | None): The reference date for calculating recency. Can be a
+                string (format: "YYYY-MM-DD"), a date object, or None (defaults to the current UTC date).
+
+        Raises:
+            ValueError: If the dataframe is missing required columns, or `current_date` is a string that
+                `datetime.date.fromisoformat` cannot parse.
+            TypeError: If `df` is not a pandas DataFrame or an Ibis Table, or `current_date` has another type.
+        """
+        cols = ColumnHelper()
+        required_cols = [
+            cols.customer_id,
+            cols.transaction_date,
+            cols.unit_spend,
+            cols.transaction_id,
+        ]
+        if isinstance(df, pd.DataFrame):
+            df = ibis.memtable(df)
+        elif not isinstance(df, ibis.Table):
+            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")
+
+        missing_cols = set(required_cols) - set(df.columns)
+        if missing_cols:
+            error_message = f"Missing required columns: {missing_cols}"
+            raise ValueError(error_message)
+
+        if isinstance(current_date, str):
+            current_date = datetime.date.fromisoformat(current_date)
+        elif current_date is None:
+            current_date = datetime.datetime.now(datetime.UTC).date()
+        elif not isinstance(current_date, datetime.date):
+            raise TypeError("current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None")
+
+        self.table = self._compute_rfm(df, current_date)
+
+    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
+        """Computes the RFM metrics and segments customers accordingly.
+
+        Args:
+            df (ibis.Table): The transaction data table.
+            current_date (datetime.date): The reference date for calculating recency.
+
+        Returns:
+            ibis.Table: One row per customer with recency_days, frequency, monetary, r_score, f_score, m_score,
+                rfm_segment and fm_segment columns.
+        """
+        cols = ColumnHelper()
+        current_date_expr = ibis.literal(current_date)
+
+        customer_metrics = df.group_by(cols.customer_id).aggregate(
+            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
+            frequency=df[cols.transaction_id].nunique(),
+            monetary=df[cols.unit_spend].sum(),
+        )
+
+        # Look the key up by its configured name: the grouped column is called `cols.customer_id`, which need not
+        # be the literal attribute `customer_id`. It is used as a tie-breaker so the binning is deterministic.
+        customer_key = ibis.asc(customer_metrics[cols.customer_id])
+        window_recency = ibis.window(order_by=[ibis.asc(customer_metrics.recency_days), customer_key])
+        window_frequency = ibis.window(order_by=[ibis.asc(customer_metrics.frequency), customer_key])
+        window_monetary = ibis.window(order_by=[ibis.asc(customer_metrics.monetary), customer_key])
+
+        rfm_scores = customer_metrics.mutate(
+            r_score=(ibis.ntile(10).over(window_recency)),
+            f_score=(ibis.ntile(10).over(window_frequency)),
+            m_score=(ibis.ntile(10).over(window_monetary)),
+        )
+
+        return rfm_scores.mutate(
+            rfm_segment=(rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score),
+            fm_segment=(rfm_scores.f_score * 10 + rfm_scores.m_score),
+        )
+
+    @property
+    def df(self) -> pd.DataFrame:
+        """Returns the RFM table as a pandas DataFrame indexed by the configured customer id column.
+
+        The table expression is executed on first access and the result is cached on the instance.
+        """
+        if self._df is None:
+            self._df = self.table.execute().set_index(get_option("column.customer_id"))
+        return self._df
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Base Segmentation

		::: pyretailscience.segmentation.base
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# HML Segmentation

		::: pyretailscience.segmentation.hml
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# RFM Segmentation

		::: pyretailscience.segmentation.rfm
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# SegTransactionStats Segmentation

		::: pyretailscience.segmentation.segstats
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Threshold Segmentation

		::: pyretailscience.segmentation.threshold