Skip to content

Commit a749668

Browse files
committed
feat: first coming of revenue tree code
1 parent d8c9965 commit a749668

1 file changed

Lines changed: 226 additions & 0 deletions

File tree

pyretailscience/revenue_tree.py

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
"""Revenue Tree Analysis Module.
2+
3+
This module implements a Revenue Tree analysis for retail businesses. The Revenue Tree
4+
is a hierarchical breakdown of factors contributing to overall revenue, allowing for
5+
detailed analysis of sales performance and identification of areas for improvement.
6+
7+
8+
Key Components of the Revenue Tree:
9+
10+
1. Revenue: The top-level metric, calculated as Customers * Revenue per Customer.
11+
12+
2. Customers: Total number of customers, broken down into:
13+
- Returning Customers: Existing customers making repeat purchases.
14+
- New Customers: First-time buyers.
15+
16+
3. Revenue per Customer: Average revenue generated per customer, calculated as:
17+
Orders per Customer * Average Order Value.
18+
19+
4. Orders per Customer: Average number of orders placed by each customer.
20+
21+
5. Average Order Value: Average monetary value of each order, calculated as:
22+
Items per Order * Price per Item.
23+
24+
6. Items per Order: Average number of items in each order.
25+
26+
7. Price per Item: Average price of each item sold.
27+
28+
This module can be used to create, update, and analyze Revenue Tree data structures
29+
for retail businesses, helping to identify key drivers of revenue changes and
30+
inform strategic decision-making.
31+
"""
32+
33+
import pandas as pd
34+
35+
from pyretailscience.data.contracts import CustomContract, build_expected_columns, build_non_null_columns
36+
37+
38+
class RevenueTree:
39+
"""Revenue Tree Analysis Class."""
40+
41+
def __init__(
42+
self,
43+
df: pd.DataFrame,
44+
p1_index: list[bool] | pd.Series,
45+
p2_index: list[bool] | pd.Series,
46+
group_col: str | None = None,
47+
pre_aggregated: bool = False,
48+
) -> None:
49+
"""Initialize the Revenue Tree Analysis Class.
50+
51+
Args:
52+
df (pd.DataFrame): The input DataFrame containing transaction data.
53+
p1_index (list[bool] | pd.Series): A boolean index for the first period.
54+
p2_index (list[bool] | pd.Series): A boolean index for the second period.
55+
group_col (str, optional): The column to group the data by. Defaults to None.
56+
pre_aggregated (bool, optional): Whether the data is pre-aggregated. Defaults to False.
57+
58+
Raises:
59+
ValueError: If the required columns are not present in the DataFrame.
60+
ValueError: If the lengths of p1_index, p2_index, and df are not equal.
61+
62+
Example:
63+
>>> import pandas as pd
64+
>>> from pyretailscience import RevenueTree
65+
>>> data = {
66+
... "customer_id": [1, 2, 3, 4, 5, 6],
67+
... "transaction_id": [1, 2, 3, 4, 5, 6],
68+
... "total_price": [100, 200, 300, 400, 500, 600],
69+
... "quantity": [1, 2, 3, 4, 5, 6],
70+
... }
71+
>>> df = pd.DataFrame(data)
72+
>>> p1_index = [True, False, True, False, True, False]
73+
>>> p2_index = [False, True, False, True, False, True]
74+
>>> rev_tree = RevenueTree(df=df, p1_index=p1_index, p2_index=p2_index)
75+
"""
76+
if pre_aggregated:
77+
required_cols = ["customers", "tranactions", "total_price"]
78+
else:
79+
required_cols = ["customer_id", "transaction_id", "total_price"]
80+
81+
if "quantity" in df.columns:
82+
required_cols.append("quantity")
83+
if group_col is not None:
84+
required_cols.append(group_col)
85+
86+
contract = CustomContract(
87+
df,
88+
basic_expectations=build_expected_columns(columns=required_cols),
89+
extended_expectations=build_non_null_columns(columns=required_cols),
90+
)
91+
if contract.validate() is False:
92+
msg = f"The dataframe requires the columns {required_cols} and they must be non-null"
93+
raise ValueError(msg)
94+
95+
if not len(p1_index) == len(p2_index) == len(df):
96+
raise ValueError("p1_index, p2_index, and df should have the same length")
97+
98+
if pre_aggregated is False:
99+
df, p1_index, p2_index = self._agg_data(df=df, p1_index=p1_index, p2_index=p2_index)
100+
101+
self.revenue_tree_df = self._calc_tree_kpis(
102+
df=df,
103+
p1_index=p1_index,
104+
p2_index=p2_index,
105+
)
106+
107+
@staticmethod
108+
def _agg_data(
109+
df: pd.DataFrame,
110+
p1_index: list[bool] | pd.Series,
111+
p2_index: list[bool] | pd.Series,
112+
group_col: str | None = None,
113+
) -> tuple[pd.DataFrame, list[bool], list[bool]]:
114+
if group_col is not None:
115+
p1_group = df[p1_index].groupby(group_col)
116+
p2_group = df[p2_index].groupby(group_col)
117+
p1_df = p1_group.agg(
118+
customers=("customer_id", "nunique"),
119+
transactions=("transaction_id", "nunique"),
120+
total_price=("total_price", "sum"),
121+
)
122+
p2_df = p2_group.agg(
123+
customers=("customer_id", "nunique"),
124+
transactions=("transaction_id", "nunique"),
125+
total_price=("total_price", "sum"),
126+
)
127+
if "quantity" in df.columns:
128+
p1_df["quantity"] = p1_group["quantity"].sum()
129+
p2_df["quantity"] = p2_group["quantity"].sum()
130+
else:
131+
p1_df = df[p1_index]
132+
p2_df = df[p2_index]
133+
p1_df = pd.DataFrame(
134+
{
135+
"customers": p1_df["customer_id"].nunique(),
136+
"transactions": p1_df["transaction_id"].nunique(),
137+
"total_price": p1_df["total_price"].sum(),
138+
},
139+
index=["p1"],
140+
)
141+
p2_df = pd.DataFrame(
142+
{
143+
"customers": p2_df["customer_id"].nunique(),
144+
"transactions": p2_df["transaction_id"].nunique(),
145+
"total_price": p2_df["total_price"].sum(),
146+
},
147+
index=["p2"],
148+
)
149+
if "quantity" in df.columns:
150+
p1_df["quantity"] = df[p1_index]["quantity"].sum()
151+
p2_df["quantity"] = df[p2_index]["quantity"].sum()
152+
153+
new_p1_index = [True] * len(p1_df) + [False] * len(p2_df)
154+
new_p2_index = [not i for i in new_p1_index]
155+
156+
return pd.concat([p1_df, p2_df]), new_p1_index, new_p2_index
157+
158+
@staticmethod
159+
def _calc_tree_kpis(
160+
df: pd.DataFrame,
161+
p1_index: list[bool] | pd.Series,
162+
p2_index: list[bool] | pd.Series,
163+
) -> pd.DataFrame:
164+
df["total_price_per_cust"] = df["total_price"] / df["customers"]
165+
df["total_price_per_transaction"] = df["total_price"] / df["transactions"]
166+
df["frequency"] = df["transactions"] / df["customers"]
167+
168+
p1_df = df[p1_index]
169+
p1_df.columns = [f"{col}_p1" for col in p1_df.columns]
170+
p2_df = df[p2_index]
171+
p2_df.columns = [f"{col}_p2" for col in p2_df.columns]
172+
173+
if set(df.index.to_list()) == {"p1", "p2"}:
174+
p1_df = p1_df.reset_index(drop=True)
175+
p2_df = p2_df.reset_index(drop=True)
176+
177+
df = pd.concat([p1_df, p2_df], axis=1)
178+
179+
# Calculations
180+
df["customers_diff"] = df["customers_p2"] - df["customers_p1"]
181+
df["transactions_diff"] = df["transactions_p2"] - df["transactions_p1"]
182+
df["total_price_diff"] = df["total_price_p2"] - df["total_price_p1"]
183+
df["total_price_per_cust_diff"] = df["total_price_per_cust_p2"] - df["total_price_per_cust_p1"]
184+
df["total_price_per_transaction_diff"] = (
185+
df["total_price_per_transaction_p2"] - df["total_price_per_transaction_p1"]
186+
)
187+
df["frequency_diff"] = df["frequency_p2"] - df["frequency_p1"]
188+
189+
df["customers_pc"] = df["customers_diff"] / df["customers_p1"]
190+
df["transactions_pc"] = df["transactions_diff"] / df["transactions_p1"]
191+
df["total_price_pc"] = df["total_price_diff"] / df["total_price_p1"]
192+
df["total_price_per_cust_pc"] = df["total_price_per_cust_diff"] / df["total_price_per_cust_p1"]
193+
df["total_price_per_transaction_pc"] = (
194+
df["total_price_per_transaction_diff"] / df["total_price_per_transaction_p1"]
195+
)
196+
df["frequency_pc"] = df["frequency_diff"] / df["frequency_p1"]
197+
198+
df["customers_contrib"] = (
199+
df["total_price_p2"]
200+
- (df["customers_p1"] * df["total_price_per_cust_p2"])
201+
- ((df["customers_diff"] * df["total_price_per_cust_diff"]) / 2)
202+
)
203+
df["total_price_per_cust_contrib"] = (
204+
df["total_price_p2"]
205+
- (df["total_price_per_cust_p1"] * df["customers_p2"])
206+
- ((df["customers_diff"] * df["total_price_per_cust_diff"]) / 2)
207+
)
208+
209+
df["frequency_contrib"] = (
210+
(
211+
df["total_price_per_cust_p2"]
212+
- (df["frequency_p1"] * df["total_price_per_transaction_p2"])
213+
- ((df["frequency_diff"] * df["total_price_per_transaction_diff"]) / 2)
214+
)
215+
* df["customers_p2"]
216+
) - ((df["customers_diff"] * df["total_price_per_cust_diff"]) / 4)
217+
df["total_price_per_transaction_contrib"] = (
218+
(
219+
df["total_price_per_cust_p2"]
220+
- (df["total_price_per_transaction_p1"] * df["frequency_p2"])
221+
- ((df["frequency_diff"] * df["total_price_per_transaction_diff"]) / 2)
222+
)
223+
* df["customers_p2"]
224+
) - ((df["customers_diff"] * df["total_price_per_cust_diff"]) / 4)
225+
226+
return df

0 commit comments

Comments
 (0)