|
| 1 | +"""Revenue Tree Analysis Module. |
| 2 | +
|
| 3 | +This module implements a Revenue Tree analysis for retail businesses. The Revenue Tree |
| 4 | +is a hierarchical breakdown of factors contributing to overall revenue, allowing for |
| 5 | +detailed analysis of sales performance and identification of areas for improvement. |
| 6 | +
|
| 7 | +
|
| 8 | +Key Components of the Revenue Tree: |
| 9 | +
|
| 10 | +1. Revenue: The top-level metric, calculated as Customers * Revenue per Customer. |
| 11 | +
|
| 12 | +2. Customers: Total number of customers, broken down into: |
| 13 | + - Returning Customers: Existing customers making repeat purchases. |
| 14 | + - New Customers: First-time buyers. |
| 15 | +
|
| 16 | +3. Revenue per Customer: Average revenue generated per customer, calculated as: |
| 17 | + Orders per Customer * Average Order Value. |
| 18 | +
|
| 19 | +4. Orders per Customer: Average number of orders placed by each customer. |
| 20 | +
|
| 21 | +5. Average Order Value: Average monetary value of each order, calculated as: |
| 22 | + Items per Order * Price per Item. |
| 23 | +
|
| 24 | +6. Items per Order: Average number of items in each order. |
| 25 | +
|
| 26 | +7. Price per Item: Average price of each item sold. |
| 27 | +
|
| 28 | +This module can be used to create, update, and analyze Revenue Tree data structures |
| 29 | +for retail businesses, helping to identify key drivers of revenue changes and |
| 30 | +inform strategic decision-making. |
| 31 | +""" |
| 32 | + |
| 33 | +import pandas as pd |
| 34 | + |
| 35 | +from pyretailscience.data.contracts import CustomContract, build_expected_columns, build_non_null_columns |
| 36 | + |
| 37 | + |
| 38 | +class RevenueTree: |
| 39 | + """Revenue Tree Analysis Class.""" |
| 40 | + |
| 41 | + def __init__( |
| 42 | + self, |
| 43 | + df: pd.DataFrame, |
| 44 | + p1_index: list[bool] | pd.Series, |
| 45 | + p2_index: list[bool] | pd.Series, |
| 46 | + group_col: str | None = None, |
| 47 | + pre_aggregated: bool = False, |
| 48 | + ) -> None: |
| 49 | + """Initialize the Revenue Tree Analysis Class. |
| 50 | +
|
| 51 | + Args: |
| 52 | + df (pd.DataFrame): The input DataFrame containing transaction data. |
| 53 | + p1_index (list[bool] | pd.Series): A boolean index for the first period. |
| 54 | + p2_index (list[bool] | pd.Series): A boolean index for the second period. |
| 55 | + group_col (str, optional): The column to group the data by. Defaults to None. |
| 56 | + pre_aggregated (bool, optional): Whether the data is pre-aggregated. Defaults to False. |
| 57 | +
|
| 58 | + Raises: |
| 59 | + ValueError: If the required columns are not present in the DataFrame. |
| 60 | + ValueError: If the lengths of p1_index, p2_index, and df are not equal. |
| 61 | +
|
| 62 | + Example: |
| 63 | + >>> import pandas as pd |
| 64 | + >>> from pyretailscience import RevenueTree |
| 65 | + >>> data = { |
| 66 | + ... "customer_id": [1, 2, 3, 4, 5, 6], |
| 67 | + ... "transaction_id": [1, 2, 3, 4, 5, 6], |
| 68 | + ... "total_price": [100, 200, 300, 400, 500, 600], |
| 69 | + ... "quantity": [1, 2, 3, 4, 5, 6], |
| 70 | + ... } |
| 71 | + >>> df = pd.DataFrame(data) |
| 72 | + >>> p1_index = [True, False, True, False, True, False] |
| 73 | + >>> p2_index = [False, True, False, True, False, True] |
| 74 | + >>> rev_tree = RevenueTree(df=df, p1_index=p1_index, p2_index=p2_index) |
| 75 | + """ |
| 76 | + if pre_aggregated: |
| 77 | + required_cols = ["customers", "tranactions", "total_price"] |
| 78 | + else: |
| 79 | + required_cols = ["customer_id", "transaction_id", "total_price"] |
| 80 | + |
| 81 | + if "quantity" in df.columns: |
| 82 | + required_cols.append("quantity") |
| 83 | + if group_col is not None: |
| 84 | + required_cols.append(group_col) |
| 85 | + |
| 86 | + contract = CustomContract( |
| 87 | + df, |
| 88 | + basic_expectations=build_expected_columns(columns=required_cols), |
| 89 | + extended_expectations=build_non_null_columns(columns=required_cols), |
| 90 | + ) |
| 91 | + if contract.validate() is False: |
| 92 | + msg = f"The dataframe requires the columns {required_cols} and they must be non-null" |
| 93 | + raise ValueError(msg) |
| 94 | + |
| 95 | + if not len(p1_index) == len(p2_index) == len(df): |
| 96 | + raise ValueError("p1_index, p2_index, and df should have the same length") |
| 97 | + |
| 98 | + if pre_aggregated is False: |
| 99 | + df, p1_index, p2_index = self._agg_data(df=df, p1_index=p1_index, p2_index=p2_index) |
| 100 | + |
| 101 | + self.revenue_tree_df = self._calc_tree_kpis( |
| 102 | + df=df, |
| 103 | + p1_index=p1_index, |
| 104 | + p2_index=p2_index, |
| 105 | + ) |
| 106 | + |
| 107 | + @staticmethod |
| 108 | + def _agg_data( |
| 109 | + df: pd.DataFrame, |
| 110 | + p1_index: list[bool] | pd.Series, |
| 111 | + p2_index: list[bool] | pd.Series, |
| 112 | + group_col: str | None = None, |
| 113 | + ) -> tuple[pd.DataFrame, list[bool], list[bool]]: |
| 114 | + if group_col is not None: |
| 115 | + p1_group = df[p1_index].groupby(group_col) |
| 116 | + p2_group = df[p2_index].groupby(group_col) |
| 117 | + p1_df = p1_group.agg( |
| 118 | + customers=("customer_id", "nunique"), |
| 119 | + transactions=("transaction_id", "nunique"), |
| 120 | + total_price=("total_price", "sum"), |
| 121 | + ) |
| 122 | + p2_df = p2_group.agg( |
| 123 | + customers=("customer_id", "nunique"), |
| 124 | + transactions=("transaction_id", "nunique"), |
| 125 | + total_price=("total_price", "sum"), |
| 126 | + ) |
| 127 | + if "quantity" in df.columns: |
| 128 | + p1_df["quantity"] = p1_group["quantity"].sum() |
| 129 | + p2_df["quantity"] = p2_group["quantity"].sum() |
| 130 | + else: |
| 131 | + p1_df = df[p1_index] |
| 132 | + p2_df = df[p2_index] |
| 133 | + p1_df = pd.DataFrame( |
| 134 | + { |
| 135 | + "customers": p1_df["customer_id"].nunique(), |
| 136 | + "transactions": p1_df["transaction_id"].nunique(), |
| 137 | + "total_price": p1_df["total_price"].sum(), |
| 138 | + }, |
| 139 | + index=["p1"], |
| 140 | + ) |
| 141 | + p2_df = pd.DataFrame( |
| 142 | + { |
| 143 | + "customers": p2_df["customer_id"].nunique(), |
| 144 | + "transactions": p2_df["transaction_id"].nunique(), |
| 145 | + "total_price": p2_df["total_price"].sum(), |
| 146 | + }, |
| 147 | + index=["p2"], |
| 148 | + ) |
| 149 | + if "quantity" in df.columns: |
| 150 | + p1_df["quantity"] = df[p1_index]["quantity"].sum() |
| 151 | + p2_df["quantity"] = df[p2_index]["quantity"].sum() |
| 152 | + |
| 153 | + new_p1_index = [True] * len(p1_df) + [False] * len(p2_df) |
| 154 | + new_p2_index = [not i for i in new_p1_index] |
| 155 | + |
| 156 | + return pd.concat([p1_df, p2_df]), new_p1_index, new_p2_index |
| 157 | + |
| 158 | + @staticmethod |
| 159 | + def _calc_tree_kpis( |
| 160 | + df: pd.DataFrame, |
| 161 | + p1_index: list[bool] | pd.Series, |
| 162 | + p2_index: list[bool] | pd.Series, |
| 163 | + ) -> pd.DataFrame: |
| 164 | + df["total_price_per_cust"] = df["total_price"] / df["customers"] |
| 165 | + df["total_price_per_transaction"] = df["total_price"] / df["transactions"] |
| 166 | + df["frequency"] = df["transactions"] / df["customers"] |
| 167 | + |
| 168 | + p1_df = df[p1_index] |
| 169 | + p1_df.columns = [f"{col}_p1" for col in p1_df.columns] |
| 170 | + p2_df = df[p2_index] |
| 171 | + p2_df.columns = [f"{col}_p2" for col in p2_df.columns] |
| 172 | + |
| 173 | + if set(df.index.to_list()) == {"p1", "p2"}: |
| 174 | + p1_df = p1_df.reset_index(drop=True) |
| 175 | + p2_df = p2_df.reset_index(drop=True) |
| 176 | + |
| 177 | + df = pd.concat([p1_df, p2_df], axis=1) |
| 178 | + |
| 179 | + # Calculations |
| 180 | + df["customers_diff"] = df["customers_p2"] - df["customers_p1"] |
| 181 | + df["transactions_diff"] = df["transactions_p2"] - df["transactions_p1"] |
| 182 | + df["total_price_diff"] = df["total_price_p2"] - df["total_price_p1"] |
| 183 | + df["total_price_per_cust_diff"] = df["total_price_per_cust_p2"] - df["total_price_per_cust_p1"] |
| 184 | + df["total_price_per_transaction_diff"] = ( |
| 185 | + df["total_price_per_transaction_p2"] - df["total_price_per_transaction_p1"] |
| 186 | + ) |
| 187 | + df["frequency_diff"] = df["frequency_p2"] - df["frequency_p1"] |
| 188 | + |
| 189 | + df["customers_pc"] = df["customers_diff"] / df["customers_p1"] |
| 190 | + df["transactions_pc"] = df["transactions_diff"] / df["transactions_p1"] |
| 191 | + df["total_price_pc"] = df["total_price_diff"] / df["total_price_p1"] |
| 192 | + df["total_price_per_cust_pc"] = df["total_price_per_cust_diff"] / df["total_price_per_cust_p1"] |
| 193 | + df["total_price_per_transaction_pc"] = ( |
| 194 | + df["total_price_per_transaction_diff"] / df["total_price_per_transaction_p1"] |
| 195 | + ) |
| 196 | + df["frequency_pc"] = df["frequency_diff"] / df["frequency_p1"] |
| 197 | + |
| 198 | + df["customers_contrib"] = ( |
| 199 | + df["total_price_p2"] |
| 200 | + - (df["customers_p1"] * df["total_price_per_cust_p2"]) |
| 201 | + - ((df["customers_diff"] * df["total_price_per_cust_diff"]) / 2) |
| 202 | + ) |
| 203 | + df["total_price_per_cust_contrib"] = ( |
| 204 | + df["total_price_p2"] |
| 205 | + - (df["total_price_per_cust_p1"] * df["customers_p2"]) |
| 206 | + - ((df["customers_diff"] * df["total_price_per_cust_diff"]) / 2) |
| 207 | + ) |
| 208 | + |
| 209 | + df["frequency_contrib"] = ( |
| 210 | + ( |
| 211 | + df["total_price_per_cust_p2"] |
| 212 | + - (df["frequency_p1"] * df["total_price_per_transaction_p2"]) |
| 213 | + - ((df["frequency_diff"] * df["total_price_per_transaction_diff"]) / 2) |
| 214 | + ) |
| 215 | + * df["customers_p2"] |
| 216 | + ) - ((df["customers_diff"] * df["total_price_per_cust_diff"]) / 4) |
| 217 | + df["total_price_per_transaction_contrib"] = ( |
| 218 | + ( |
| 219 | + df["total_price_per_cust_p2"] |
| 220 | + - (df["total_price_per_transaction_p1"] * df["frequency_p2"]) |
| 221 | + - ((df["frequency_diff"] * df["total_price_per_transaction_diff"]) / 2) |
| 222 | + ) |
| 223 | + * df["customers_p2"] |
| 224 | + ) - ((df["customers_diff"] * df["total_price_per_cust_diff"]) / 4) |
| 225 | + |
| 226 | + return df |
0 commit comments