|
| 1 | +"""Customer Segmentation Using RFM Analysis. |
| 2 | +
|
| 3 | +This module implements RFM (Recency, Frequency, Monetary) segmentation, a widely used technique in customer analytics |
| 4 | +to categorize customers based on their purchasing behavior. |
| 5 | +
|
| 6 | +RFM segmentation assigns scores to customers based on: |
| 7 | +1. Recency (R): How recently a customer made a purchase. |
| 8 | +2. Frequency (F): How often a customer makes purchases. |
| 9 | +3. Monetary (M): The total amount spent by a customer. |
| 10 | +
|
| 11 | +### Benefits of RFM Segmentation: |
| 12 | +- **Customer Value Analysis**: Identifies high-value customers who contribute the most revenue. |
| 13 | +- **Personalized Marketing**: Enables targeted campaigns based on customer purchasing behavior. |
| 14 | +- **Customer Retention Strategies**: Helps recognize at-risk customers and develop engagement strategies. |
| 15 | +- **Sales Forecasting**: Provides insights into future revenue trends based on past spending behavior. |
| 16 | +
|
| 17 | +### Scoring Methodology: |
| 18 | +- Each metric (R, F, M) is divided into 10 bins (0-9) using the NTILE(10) function. |
| 19 | +- A higher score indicates a better customer (e.g., lower recency, higher frequency, and monetary value). |
| 20 | +- The final RFM segment is computed as `R*100 + F*10 + M`, providing a unique customer classification. |
| 21 | +
|
| 22 | +This module leverages `pandas` and `ibis` for efficient data processing and integrates with retail analytics workflows |
| 23 | +to enhance customer insights and business decision-making. |
| 24 | +""" |
| 25 | + |
| 26 | +import datetime |
| 27 | + |
| 28 | +import ibis |
| 29 | +import pandas as pd |
| 30 | + |
| 31 | +from pyretailscience.options import ColumnHelper, get_option |
| 32 | + |
| 33 | + |
class RFMSegmentation:
    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.

    Customers are scored on three dimensions:
    - Recency (R): Days since the last transaction (lower is better).
    - Frequency (F): Number of unique transactions (higher is better).
    - Monetary (M): Total amount spent (higher is better).

    Each metric is ranked into 10 bins (0-9) using NTILE(10) where,
    - 9 represents the best score (top 10% of customers).
    - 0 represents the lowest score (bottom 10% of customers).
    The RFM segment is a 3-digit number (R*100 + F*10 + M), representing customer value.
    A two-digit FM segment (F*10 + M) is computed alongside it.

    Attributes:
        table (ibis.Table): Lazy Ibis expression holding one row per customer with the
            raw metrics, the three scores, and the combined segment values.
    """

    # Cache for the executed result of `table`; populated on first access to `df`.
    _df: pd.DataFrame | None = None

    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | datetime.date | None = None) -> None:
        """Initializes the RFM segmentation process.

        Args:
            df (pd.DataFrame | ibis.Table): A DataFrame or Ibis table containing transaction data.
                Must include the following columns (names are resolved through ``ColumnHelper``):
                - customer_id
                - transaction_date
                - unit_spend
                - transaction_id
            current_date (str | datetime.date | None): The reference date for calculating recency.
                Can be a string (format: "YYYY-MM-DD"), a date object, or None (defaults to the current UTC date).
                A ``datetime.datetime`` is accepted and truncated to its calendar date.

        Raises:
            ValueError: If the dataframe is missing required columns.
            TypeError: If the input data is not a pandas DataFrame or an Ibis Table, or if
                current_date is not a string, a date, or None.
        """
        cols = ColumnHelper()
        required_cols = [
            cols.customer_id,
            cols.transaction_date,
            cols.unit_spend,
            cols.transaction_id,
        ]
        if isinstance(df, pd.DataFrame):
            df = ibis.memtable(df)
        elif not isinstance(df, ibis.Table):
            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")

        missing_cols = set(required_cols) - set(df.columns)
        if missing_cols:
            error_message = f"Missing required columns: {missing_cols}"
            raise ValueError(error_message)

        if isinstance(current_date, str):
            current_date = datetime.date.fromisoformat(current_date)
        elif current_date is None:
            current_date = datetime.datetime.now(datetime.UTC).date()
        elif isinstance(current_date, datetime.datetime):
            # datetime is a subclass of date; drop the time part so the literal
            # built from it is a date rather than a timestamp.
            current_date = current_date.date()
        elif not isinstance(current_date, datetime.date):
            raise TypeError("current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None")

        self.table = self._compute_rfm(df, current_date)

    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
        """Computes the RFM metrics and segments customers accordingly.

        Args:
            df (ibis.Table): The transaction data table.
            current_date (datetime.date): The reference date for calculating recency.

        Returns:
            ibis.Table: One row per customer with ``recency_days``, ``frequency``, ``monetary``,
                ``r_score``, ``f_score``, ``m_score``, ``rfm_segment`` and ``fm_segment``.
        """
        cols = ColumnHelper()
        current_date_expr = ibis.literal(current_date)

        customer_metrics = df.group_by(cols.customer_id).aggregate(
            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
            frequency=df[cols.transaction_id].nunique(),
            monetary=df[cols.unit_spend].sum(),
        )

        # Look the customer column up by its configured name so a customized
        # column option keeps working; it is the tie-breaker that makes the
        # bucket assignment deterministic.
        customer_key = customer_metrics[cols.customer_id]

        # Recency is sorted descending so that the most recent customers (fewest
        # days since last purchase) fall into the highest bucket, matching the
        # "higher score is better" convention used for frequency and monetary.
        window_recency = ibis.window(
            order_by=[ibis.desc(customer_metrics.recency_days), ibis.asc(customer_key)],
        )
        window_frequency = ibis.window(
            order_by=[ibis.asc(customer_metrics.frequency), ibis.asc(customer_key)],
        )
        window_monetary = ibis.window(
            order_by=[ibis.asc(customer_metrics.monetary), ibis.asc(customer_key)],
        )

        rfm_scores = customer_metrics.mutate(
            r_score=(ibis.ntile(10).over(window_recency)),
            f_score=(ibis.ntile(10).over(window_frequency)),
            m_score=(ibis.ntile(10).over(window_monetary)),
        )

        return rfm_scores.mutate(
            rfm_segment=(rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score),
            fm_segment=(rfm_scores.f_score * 10 + rfm_scores.m_score),
        )

    @property
    def df(self) -> pd.DataFrame:
        """Returns the RFM metrics, scores and segment values as a pandas DataFrame.

        The Ibis expression is executed on first access and the result, indexed by the
        configured customer id column, is cached for subsequent calls.
        """
        if self._df is None:
            self._df = self.table.execute().set_index(get_option("column.customer_id"))
        return self._df
0 commit comments