Skip to content

Commit 0cc9d79

Browse files
committed
feat: Create purchase path analysis module
1 parent d8c65ca commit 0cc9d79

5 files changed

Lines changed: 583 additions & 0 deletions

File tree

docs/analysis_modules.md

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,87 @@ cr.df.sort_values("composite_rank")
13011301
| 3 | 75 | 15 | 5.0 | 5 | 5 | 1 | 3.67 |
13021302
<!-- markdownlint-enable MD013 -->
13031303

1304+
### Purchase Path Analysis
1305+
1306+
<div class="clear" markdown>
1307+
1308+
The Purchase Path Analysis module tracks customer journeys through product categories over time,
1309+
providing insights into sequential purchasing behavior and category transitions.
1310+
This analysis helps retailers understand how customers navigate between different product categories
1311+
during their shopping journey.
1312+
1313+
Key applications include:
1314+
1315+
- **Customer Journey Mapping**: Understanding the sequence of categories customers purchase from
1316+
- **Cross-Selling Optimization**: Identifying natural category progressions for targeted recommendations
1317+
- **Category Management**: Planning product placement and promotional strategies based on purchase paths
1318+
- **Customer Segmentation**: Grouping customers based on their shopping patterns across categories
1319+
- **Inventory Planning**: Predicting demand patterns based on typical purchase sequences
1320+
1321+
The module provides flexible options for handling multiple categories within the same transaction,
1322+
aggregation methods, and filtering criteria to focus on meaningful patterns.
1323+
1324+
</div>
1325+
1326+
Example:
1327+
1328+
```python
1329+
import pandas as pd
1330+
from pyretailscience.analysis.purchase_path import purchase_path_analysis
1331+
1332+
sample_data = pd.DataFrame({
1333+
'customer_id': [
1334+
1, 1, 1, 1, 1, 1,
1335+
2, 2, 2, 2, 2, 2,
1336+
3, 3, 3, 3, 3, 3,
1337+
4, 4, 4, 4,
1338+
5, 5, 5, 5, 5, 5,
1339+
6, 6, 6, 6
1340+
],
1341+
'transaction_id': [
1342+
101, 101, 102, 102, 103, 103,
1343+
201, 201, 202, 202, 203, 203,
1344+
301, 301, 302, 302, 303, 303,
1345+
401, 401, 402, 402,
1346+
501, 501, 502, 502, 503, 503,
1347+
601, 601, 602, 602
1348+
],
1349+
'transaction_date': [
1350+
'2024-01-01', '2024-01-01', '2024-01-10', '2024-01-10', '2024-01-20', '2024-01-20',
1351+
'2024-01-02', '2024-01-02', '2024-01-11', '2024-01-11', '2024-01-21', '2024-01-21',
1352+
'2024-01-03', '2024-01-03', '2024-01-12', '2024-01-12', '2024-01-22', '2024-01-22',
1353+
'2024-01-04', '2024-01-04', '2024-01-13', '2024-01-13',
1354+
'2024-01-05', '2024-01-05', '2024-01-14', '2024-01-14', '2024-01-23', '2024-01-23',
1355+
'2024-01-06', '2024-01-06', '2024-01-15', '2024-01-15'
1356+
],
1357+
'product_id': range(1, 33),
1358+
'product_category': [
1359+
'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing', 'mens_clothing', 'mens_clothing',
1360+
'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing', 'kids_clothing', 'kids_clothing',
1361+
'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing', 'mens_clothing', 'mens_clothing',
1362+
'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing',
1363+
'mens_clothing', 'mens_clothing', 'womens_clothing', 'womens_clothing', 'kids_clothing', 'kids_clothing',
1364+
'mens_clothing', 'mens_clothing', 'womens_clothing', 'womens_clothing'
1365+
],
1366+
'revenue': [50] * 32
1367+
})
1368+
1369+
result = purchase_path_analysis(
1370+
sample_data,
1371+
category_column='product_category',
1372+
min_customers=1,
1373+
min_transactions=3,
1374+
multi_category_handling='concatenate'
1375+
)
1376+
1377+
```
1378+
1379+
| basket_1 | basket_2 | basket_3 | customer_count | transition_probability |
1380+
|:----------------|:----------------|:--------------|---------------:|-----------------------:|
1381+
| womens_clothing | kids_clothing | mens_clothing | 2 | 0.50 |
1382+
| mens_clothing | womens_clothing | kids_clothing | 1 | 0.25 |
1383+
| womens_clothing | kids_clothing | | 1 | 0.25 |
1384+
13041385
## Utils
13051386

13061387
### Filter and Label by Periods

docs/api/analysis/purchase_path.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Purchase Path
2+
3+
::: pyretailscience.analysis.purchase_path

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ nav:
2828
- Customer Decision Hierarchy: api/analysis/customer_decision_hierarchy.md
2929
- Revenue Tree: api/analysis/revenue_tree.md
3030
- Composite Rank: api/analysis/composite_rank.md
31+
- Purchase Path: api/analysis/purchase_path.md
3132
- Segmentation:
3233
- Base Segmentation: api/segmentation/base.md
3334
- HML Segmentation: api/segmentation/hml.md
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
"""Module for analyzing customer purchase paths from transaction data.
2+
3+
This module defines the `purchase_path_analysis` function that tracks
4+
customer journeys through product categories over time.
5+
"""
6+
7+
import ibis
8+
import pandas as pd
9+
10+
from pyretailscience.options import ColumnHelper
11+
12+
13+
def _build_category_group_df(
14+
first_df: pd.DataFrame,
15+
category_column: str,
16+
sort_by_metric: bool,
17+
multi_category_handling: str,
18+
) -> pd.DataFrame:
19+
"""Creates a DataFrame mapping customers to concatenated or individual categories."""
20+
if multi_category_handling == "concatenate":
21+
sort_cols = ["customer_id", "first_basket_number"]
22+
if sort_by_metric:
23+
sort_cols.append("metric_value")
24+
ascending = [True, True, False]
25+
else:
26+
sort_cols.append(category_column)
27+
ascending = [True, True, True]
28+
29+
return (
30+
first_df.sort_values(sort_cols, ascending=ascending)
31+
.groupby(["customer_id", "first_basket_number"])[category_column]
32+
.apply(lambda x: ",".join(x))
33+
.reset_index()
34+
.rename(columns={category_column: "categories"})
35+
)
36+
return first_df[["customer_id", "first_basket_number", category_column]].rename(
37+
columns={category_column: "categories"},
38+
)
39+
40+
41+
def _build_paths_df(category_groups_df: pd.DataFrame) -> pd.DataFrame:
42+
"""Constructs a pivoted DataFrame representing customer purchase paths."""
43+
actual_baskets = sorted(category_groups_df["first_basket_number"].unique()) if not category_groups_df.empty else []
44+
paths_df = category_groups_df.pivot_table(
45+
index="customer_id",
46+
columns="first_basket_number",
47+
values="categories",
48+
aggfunc="first",
49+
).reset_index()
50+
51+
column_mapping = {"customer_id": "customer_id"}
52+
for i, basket_num in enumerate(sorted(actual_baskets), 1):
53+
if basket_num in paths_df.columns:
54+
column_mapping[basket_num] = f"basket_{i}"
55+
return paths_df.rename(columns=column_mapping).fillna("")
56+
57+
58+
def purchase_path_analysis(
    transactions_df: pd.DataFrame,
    category_column: str = "product_category",
    min_transactions: int = 3,
    min_basket_size: int = 2,
    min_basket_value: float = 10.0,
    max_depth: int = 10,
    min_customers: int = 5,
    exclude_negative_revenue: bool = True,
    multi_category_handling: str = "concatenate",
    sort_by: str = "alphabetical",
    aggregation_column: str | None = None,
    aggregation_function: str = "sum",
) -> pd.DataFrame:
    """Analyze customer purchase paths through product categories over time.

    For every qualifying customer, each category is placed in the basket where the
    customer first bought it. The resulting per-customer sequences are then counted.

    Args:
        transactions_df: Line-level transactions, as a pandas DataFrame or an ibis
            table. Must contain the customer id, transaction id and transaction date
            columns named by ``ColumnHelper``, plus ``category_column``,
            ``product_id`` and ``revenue``.
        category_column: Column holding the product category.
        min_transactions: Minimum number of qualifying baskets a customer needs.
        min_basket_size: Minimum number of distinct ``product_id`` values in a basket.
        min_basket_value: Minimum summed ``revenue`` of a basket.
        max_depth: Maximum number of baskets (in chronological order) kept per
            customer.
        min_customers: Minimum number of customers a path needs to be reported.
        exclude_negative_revenue: When True, only rows with ``revenue > 0`` are kept
            (zero-revenue rows are dropped as well).
        multi_category_handling: ``"concatenate"`` joins the categories first bought
            in the same basket into one comma-separated label; with any other value
            the categories stay separate and one of them is kept per basket.
        sort_by: ``"aggregation"`` orders concatenated categories by the aggregated
            metric (descending); it only takes effect in concatenate mode with an
            ``aggregation_column``. Any other value orders them alphabetically.
        aggregation_column: Column aggregated per customer/category for metric
            ordering.
        aggregation_function: One of ``"sum"``, ``"max"``, ``"min"``, ``"avg"``.

    Returns:
        A DataFrame with ``basket_<i>`` columns, ``customer_count`` and
        ``transition_probability`` (``customer_count`` divided by the total count over
        the reported paths, rounded to 3 decimals), sorted by ``customer_count``
        descending. When no transaction or path qualifies at all, only the
        ``customer_count`` and ``transition_probability`` columns are returned.

    Raises:
        ValueError: If a required column is missing.
        KeyError: If metric ordering is active and ``aggregation_function`` is not
            one of the supported names.
    """
    cols = ColumnHelper()
    customer_col = cols.customer_id
    transaction_col = cols.transaction_id
    date_col = cols.transaction_date

    # product_id and revenue are always read below, so check them up front to
    # fail with a clear message instead of a backend error.
    required_cols = [customer_col, transaction_col, date_col, category_column, "product_id", "revenue"]
    missing_cols = set(required_cols) - set(transactions_df.columns)
    if missing_cols:
        msg = f"The following columns are required but missing: {missing_cols}"
        raise ValueError(msg)

    transactions_table = (
        ibis.memtable(transactions_df) if isinstance(transactions_df, pd.DataFrame) else transactions_df
    )
    if exclude_negative_revenue:
        transactions_table = transactions_table.filter(transactions_table.revenue > 0)

    customer_baskets = (
        transactions_table.group_by([customer_col, transaction_col, date_col])
        .aggregate(
            item_count=ibis._.product_id.nunique(),
            basket_value=ibis._.revenue.sum(),
        )
        .filter(
            (ibis._.item_count >= min_basket_size) & (ibis._.basket_value >= min_basket_value),
        )
        .mutate(
            # ibis.row_number() starts at 0; shift to 1-based so that
            # `basket_number <= max_depth` keeps exactly max_depth baskets.
            # The transaction id breaks ties between same-day transactions so
            # the numbering is deterministic.
            basket_number=ibis.row_number().over(
                ibis.window(group_by=customer_col, order_by=[date_col, transaction_col]),
            )
            + 1,
        )
        .filter(ibis._.basket_number <= max_depth)
    )
    eligible_customers = (
        customer_baskets.group_by(customer_col)
        .aggregate(transaction_count=ibis._.basket_number.count())
        .filter(ibis._.transaction_count >= min_transactions)
        .select(customer_col)
    )

    transactions_with_baskets = transactions_table.inner_join(
        customer_baskets.inner_join(eligible_customers, customer_col).select(
            [customer_col, transaction_col, "basket_number"],
        ),
        [customer_col, transaction_col],
    )

    use_agg_sort = bool(
        multi_category_handling == "concatenate"
        and sort_by == "aggregation"
        and aggregation_column
        and aggregation_function,
    )

    # Each category is attributed to the first basket in which the customer bought it.
    aggregations = {"first_basket_number": ibis._.basket_number.min()}
    if use_agg_sort:
        method_name = {"sum": "sum", "max": "max", "min": "min", "avg": "mean"}[aggregation_function]
        aggregations["metric_value"] = getattr(transactions_with_baskets[aggregation_column], method_name)()

    first_df = (
        transactions_with_baskets.group_by([customer_col, category_column]).aggregate(**aggregations).execute()
    )
    # The path-building helpers expect the canonical "customer_id" column name.
    first_df = first_df.rename(columns={customer_col: "customer_id"})

    if first_df.empty:
        return pd.DataFrame(columns=["customer_count", "transition_probability"])

    category_groups_df = _build_category_group_df(first_df, category_column, use_agg_sort, multi_category_handling)
    paths_df = _build_paths_df(category_groups_df)

    basket_cols = sorted(
        [col for col in paths_df.columns if col.startswith("basket_")],
        key=lambda x: int(x.split("_")[1]),
    )
    # Drop customers whose path is empty in every basket column.
    paths_df = paths_df[paths_df[basket_cols].ne("").any(axis=1)]

    if paths_df.empty:
        return pd.DataFrame(columns=["customer_count", "transition_probability"])

    pattern_counts = paths_df.groupby(basket_cols).size().reset_index(name="customer_count")
    pattern_counts = pattern_counts[pattern_counts["customer_count"] >= min_customers]

    if pattern_counts.empty:
        return pd.DataFrame(columns=[*basket_cols, "customer_count", "transition_probability"])

    total_customers = pattern_counts["customer_count"].sum()
    # assign() builds a new frame, avoiding assignment onto a filtered slice.
    return (
        pattern_counts.assign(
            transition_probability=(pattern_counts["customer_count"] / total_customers).round(3),
        )
        .sort_values("customer_count", ascending=False)
        .reset_index(drop=True)
    )

0 commit comments

Comments
 (0)