feat: Create purchase path analysis module

mayurkmmt · mayurkmmt · commit 0cc9d7906ba9 · 2025-06-10T13:20:26.000+05:30
diff --git a/docs/analysis_modules.md b/docs/analysis_modules.md
@@ -1301,6 +1301,87 @@ cr.df.sort_values("composite_rank")
 | 3          | 75    | 15        | 5.0                | 5         | 5             | 1                      | 3.67           |
 <!-- markdownlint-enable MD013 -->
 
+### Purchase Path Analysis
+
+<div class="clear" markdown>
+
+The Purchase Path Analysis module tracks customer journeys through product categories over time,
+providing insights into sequential purchasing behavior and category transitions.
+This analysis helps retailers understand how customers navigate between different product categories
+during their shopping journey.
+
+Key applications include:
+
+- **Customer Journey Mapping**: Understanding the sequence of categories customers purchase from
+- **Cross-Selling Optimization**: Identifying natural category progressions for targeted recommendations
+- **Category Management**: Planning product placement and promotional strategies based on purchase paths
+- **Customer Segmentation**: Grouping customers based on their shopping patterns across categories
+- **Inventory Planning**: Predicting demand patterns based on typical purchase sequences
+
+The module provides flexible options for handling multiple categories within the same transaction,
+aggregation methods, and filtering criteria to focus on meaningful patterns.
+
+</div>
+
+Example:
+
+```python
+import pandas as pd
+from pyretailscience.analysis.purchase_path import purchase_path_analysis
+
+# Six customers; every transaction holds two line items from a single category.
+# Customers 1, 2, 3 and 5 have three transactions each, customers 4 and 6 only two.
+sample_data = pd.DataFrame({
+    'customer_id': [
+        1, 1, 1, 1, 1, 1,
+        2, 2, 2, 2, 2, 2,
+        3, 3, 3, 3, 3, 3,
+        4, 4, 4, 4,
+        5, 5, 5, 5, 5, 5,
+        6, 6, 6, 6
+    ],
+    'transaction_id': [
+        101, 101, 102, 102, 103, 103,
+        201, 201, 202, 202, 203, 203,
+        301, 301, 302, 302, 303, 303,
+        401, 401, 402, 402,
+        501, 501, 502, 502, 503, 503,
+        601, 601, 602, 602
+    ],
+    'transaction_date': [
+        '2024-01-01', '2024-01-01', '2024-01-10', '2024-01-10', '2024-01-20', '2024-01-20',
+        '2024-01-02', '2024-01-02', '2024-01-11', '2024-01-11', '2024-01-21', '2024-01-21',
+        '2024-01-03', '2024-01-03', '2024-01-12', '2024-01-12', '2024-01-22', '2024-01-22',
+        '2024-01-04', '2024-01-04', '2024-01-13', '2024-01-13',
+        '2024-01-05', '2024-01-05', '2024-01-14', '2024-01-14', '2024-01-23', '2024-01-23',
+        '2024-01-06', '2024-01-06', '2024-01-15', '2024-01-15'
+    ],
+    'product_id': range(1, 33),
+    'product_category': [
+        'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing', 'mens_clothing', 'mens_clothing',
+        'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing', 'kids_clothing', 'kids_clothing',
+        'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing', 'mens_clothing', 'mens_clothing',
+        'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing',
+        'mens_clothing', 'mens_clothing', 'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing',
+        'mens_clothing', 'mens_clothing', 'womens_clothing', 'womens_clothing'
+    ],
+    'revenue': [50] * 32
+})
+
+# min_transactions=3 keeps only the customers with three qualifying baskets (1, 2, 3 and 5),
+# and min_customers=1 reports every path, even one followed by a single customer.
+# A category counts only in the basket where a customer first buys it, so customer 2,
+# who buys kids_clothing again in the third basket, ends with an empty basket_3.
+result = purchase_path_analysis(
+    sample_data,
+    category_column='product_category',
+    min_customers=1,
+    min_transactions=3,
+    multi_category_handling='concatenate'
+)
+
+```
+
+| basket_1        | basket_2        | basket_3      | customer_count | transition_probability |
+|:----------------|:----------------|:--------------|---------------:|-----------------------:|
+| womens_clothing | kids_clothing   | mens_clothing |              2 |                   0.50 |
+| mens_clothing   | womens_clothing | kids_clothing |              1 |                   0.25 |
+| womens_clothing | kids_clothing   |               |              1 |                   0.25 |
+
 ## Utils
 
 ### Filter and Label by Periods
diff --git a/docs/api/analysis/purchase_path.md b/docs/api/analysis/purchase_path.md
@@ -0,0 +1,3 @@
+# Purchase Path
+
+::: pyretailscience.analysis.purchase_path
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -28,6 +28,7 @@ nav:
           - Customer Decision Hierarchy: api/analysis/customer_decision_hierarchy.md
           - Revenue Tree: api/analysis/revenue_tree.md
           - Composite Rank: api/analysis/composite_rank.md
+          - Purchase Path: api/analysis/purchase_path.md
       - Segmentation:
           - Base Segmentation: api/segmentation/base.md
           - HML Segmentation: api/segmentation/hml.md
diff --git a/pyretailscience/analysis/purchase_path.py b/pyretailscience/analysis/purchase_path.py
@@ -0,0 +1,161 @@
+"""Module for analyzing customer purchase paths from transaction data.
+
+This module defines the `purchase_path_analysis` function that tracks
+customer journeys through product categories over time.
+"""
+
+import ibis
+import pandas as pd
+
+from pyretailscience.options import ColumnHelper
+
+
+def _build_category_group_df(
+    first_df: pd.DataFrame,
+    category_column: str,
+    sort_by_metric: bool,
+    multi_category_handling: str,
+) -> pd.DataFrame:
+    """Maps each customer and basket position to the categories first bought there.
+
+    Args:
+        first_df (pd.DataFrame): One row per customer and category with the columns
+            `customer_id`, `first_basket_number`, the category column and, when
+            `sort_by_metric` is true, `metric_value`.
+        category_column (str): Name of the column holding the category labels.
+        sort_by_metric (bool): Order the categories joined for one basket by `metric_value`
+            (largest first) instead of by category label.
+        multi_category_handling (str): "concatenate" joins all categories sharing a basket
+            into one comma-separated string; any other value returns the rows unjoined.
+
+    Returns:
+        pd.DataFrame: Columns `customer_id`, `first_basket_number` and `categories`.
+    """
+    key_cols = ["customer_id", "first_basket_number"]
+
+    if multi_category_handling != "concatenate":
+        return first_df[[*key_cols, category_column]].rename(columns={category_column: "categories"})
+
+    if sort_by_metric:
+        sort_cols, ascending = [*key_cols, "metric_value"], [True, True, False]
+    else:
+        sort_cols, ascending = [*key_cols, category_column], [True, True, True]
+
+    return (
+        first_df.sort_values(sort_cols, ascending=ascending)
+        .groupby(key_cols)[category_column]
+        # Cast to str so numeric or otherwise non-string category labels can be joined too.
+        .apply(lambda labels: ",".join(map(str, labels)))
+        .reset_index()
+        .rename(columns={category_column: "categories"})
+    )
+
+
+def _build_paths_df(category_groups_df: pd.DataFrame) -> pd.DataFrame:
+    """Pivots per-basket category rows into one purchase-path row per customer.
+
+    Args:
+        category_groups_df (pd.DataFrame): Rows with the columns `customer_id`,
+            `first_basket_number` and `categories`.
+
+    Returns:
+        pd.DataFrame: One row per customer with `customer_id` plus `basket_1`, `basket_2`, ...
+            columns in ascending basket order; missing positions hold an empty string.
+    """
+    if category_groups_df.empty:
+        basket_numbers = []
+    else:
+        basket_numbers = sorted(category_groups_df["first_basket_number"].unique())
+
+    wide_df = category_groups_df.pivot_table(
+        index="customer_id",
+        columns="first_basket_number",
+        values="categories",
+        aggfunc="first",
+    ).reset_index()
+
+    # Basket numbers may have gaps, so columns are relabelled by their rank, not their value.
+    renames = {"customer_id": "customer_id"}
+    renames.update(
+        {
+            number: f"basket_{position}"
+            for position, number in enumerate(basket_numbers, start=1)
+            if number in wide_df.columns
+        },
+    )
+    return wide_df.rename(columns=renames).fillna("")
+
+
+def purchase_path_analysis(
+    transactions_df: pd.DataFrame,
+    category_column: str = "product_category",
+    min_transactions: int = 3,
+    min_basket_size: int = 2,
+    min_basket_value: float = 10.0,
+    max_depth: int = 10,
+    min_customers: int = 5,
+    exclude_negative_revenue: bool = True,
+    multi_category_handling: str = "concatenate",
+    sort_by: str = "alphabetical",
+    aggregation_column: str | None = None,
+    aggregation_function: str = "sum",
+) -> pd.DataFrame:
+    """Analyzes customer purchase paths through product categories over time.
+
+    Each qualifying transaction (basket) of a customer is numbered in date order. For every
+    customer and category only the basket in which the category is first bought is kept, so a
+    category appears at most once in a customer's path. Identical paths are then counted
+    across customers.
+
+    Args:
+        transactions_df (pd.DataFrame): Transaction line items as a pandas DataFrame (an ibis
+            table is used as-is). Must contain the customer id, transaction id and transaction
+            date columns, `category_column`, `product_id` and `revenue`.
+        category_column (str): Column holding the product category. Defaults to "product_category".
+        min_transactions (int): Minimum number of qualifying baskets a customer needs to be
+            included. Defaults to 3.
+        min_basket_size (int): Minimum number of distinct `product_id` values in a basket.
+            Defaults to 2.
+        min_basket_value (float): Minimum summed `revenue` of a basket. Defaults to 10.0.
+        max_depth (int): Maximum number of baskets per customer that are analysed. Defaults to 10.
+        min_customers (int): Minimum number of customers sharing a path for it to be reported.
+            Defaults to 5.
+        exclude_negative_revenue (bool): If True, rows whose `revenue` is not strictly positive
+            (zero included) are dropped before the analysis. Defaults to True.
+        multi_category_handling (str): "concatenate" joins categories first bought in the same
+            basket into one comma-separated label; any other value keeps a single category per
+            basket position. Defaults to "concatenate".
+        sort_by (str): "aggregation" orders concatenated categories by the aggregated metric
+            (largest first) and needs `aggregation_column`; otherwise they are ordered by
+            label. Defaults to "alphabetical".
+        aggregation_column (str | None): Column aggregated for metric ordering. Defaults to None.
+        aggregation_function (str): One of "sum", "max", "min" or "avg". Defaults to "sum".
+
+    Returns:
+        pd.DataFrame: One row per path with `basket_1` ... `basket_n`, `customer_count` and
+            `transition_probability` (the path's share of customers among the reported paths,
+            rounded to 3 decimals), sorted by `customer_count` descending. When no data
+            survives the filters only the two metric columns are returned.
+
+    Raises:
+        ValueError: If a required column is missing, or if metric ordering is requested with
+            an unsupported `aggregation_function`.
+    """
+    cols = ColumnHelper()
+    agg_methods = {"sum": "sum", "max": "max", "min": "min", "avg": "mean"}
+    use_agg_sort = bool(
+        multi_category_handling == "concatenate"
+        and sort_by == "aggregation"
+        and aggregation_column
+        and aggregation_function
+    )
+    if use_agg_sort and aggregation_function not in agg_methods:
+        msg = f"aggregation_function must be one of {sorted(agg_methods)}, got {aggregation_function!r}"
+        raise ValueError(msg)
+
+    # product_id and revenue feed the basket size/value aggregates, so they are always needed.
+    required_cols = [
+        cols.customer_id,
+        cols.transaction_id,
+        cols.transaction_date,
+        category_column,
+        "product_id",
+        "revenue",
+    ]
+    if use_agg_sort:
+        required_cols.append(aggregation_column)
+    missing_cols = set(required_cols) - set(transactions_df.columns)
+    if missing_cols:
+        msg = f"The following columns are required but missing: {missing_cols}"
+        raise ValueError(msg)
+
+    # NOTE(review): validation uses the ColumnHelper names while the queries below use the
+    # literal "customer_id"/"transaction_id"/"transaction_date" - confirm they always agree.
+    transactions_table = (
+        ibis.memtable(transactions_df) if isinstance(transactions_df, pd.DataFrame) else transactions_df
+    )
+    if exclude_negative_revenue:
+        transactions_table = transactions_table.filter(transactions_table.revenue > 0)
+
+    customer_baskets = (
+        transactions_table.group_by(["customer_id", "transaction_id", "transaction_date"])
+        .aggregate(
+            item_count=ibis._.product_id.nunique(),
+            basket_value=ibis._.revenue.sum(),
+        )
+        .filter(
+            (ibis._.item_count >= min_basket_size) & (ibis._.basket_value >= min_basket_value),
+        )
+        .mutate(
+            # ibis.row_number() is 0-based; shift to 1-based so that `<= max_depth` keeps
+            # exactly max_depth baskets. transaction_id breaks ties between same-date baskets.
+            basket_number=ibis.row_number().over(
+                ibis.window(group_by="customer_id", order_by=["transaction_date", "transaction_id"]),
+            )
+            + 1,
+        )
+        .filter(ibis._.basket_number <= max_depth)
+    )
+    eligible_customers = (
+        customer_baskets.group_by("customer_id")
+        .aggregate(transaction_count=ibis._.basket_number.count())
+        .filter(ibis._.transaction_count >= min_transactions)
+        .select("customer_id")
+    )
+
+    transactions_with_baskets = transactions_table.inner_join(
+        customer_baskets.inner_join(eligible_customers, "customer_id").select(
+            ["customer_id", "transaction_id", "basket_number"],
+        ),
+        ["customer_id", "transaction_id"],
+    )
+
+    # Keep, per customer and category, the earliest basket in which the category was bought.
+    if use_agg_sort:
+        agg_func = getattr(transactions_with_baskets[aggregation_column], agg_methods[aggregation_function])
+        first_df = transactions_with_baskets.group_by(["customer_id", category_column]).aggregate(
+            first_basket_number=ibis._.basket_number.min(),
+            metric_value=agg_func(),
+        )
+    else:
+        first_df = transactions_with_baskets.group_by(["customer_id", category_column]).aggregate(
+            first_basket_number=ibis._.basket_number.min(),
+        )
+    first_df = first_df.execute()
+
+    if first_df.empty:
+        return pd.DataFrame(columns=["customer_count", "transition_probability"])
+
+    category_groups_df = _build_category_group_df(first_df, category_column, use_agg_sort, multi_category_handling)
+    paths_df = _build_paths_df(category_groups_df)
+
+    basket_cols = sorted(
+        [col for col in paths_df.columns if col.startswith("basket_")],
+        key=lambda x: int(x.split("_")[1]),
+    )
+    # Drop customers whose path is empty in every basket position.
+    paths_df = paths_df[paths_df[basket_cols].ne("").any(axis=1)]
+
+    if paths_df.empty:
+        return pd.DataFrame(columns=["customer_count", "transition_probability"])
+
+    pattern_counts = paths_df.groupby(basket_cols).size().reset_index(name="customer_count")
+    pattern_counts = pattern_counts[pattern_counts.customer_count >= min_customers]
+
+    if not pattern_counts.empty:
+        # The share is computed over the customers of the paths that passed min_customers.
+        total_customers = pattern_counts.customer_count.sum()
+        pattern_counts["transition_probability"] = (pattern_counts.customer_count / total_customers).round(3)
+        return pattern_counts.sort_values("customer_count", ascending=False).reset_index(drop=True)
+
+    return pd.DataFrame(columns=[*basket_cols, "customer_count", "transition_probability"])
diff --git a/tests/analysis/test_purchase_path.py b/tests/analysis/test_purchase_path.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Purchase Path`
	`2`	`+`
	`3`	`+::: pyretailscience.analysis.purchase_path`