diff --git a/docs/analysis_modules.md b/docs/analysis_modules.md index b1cf84d2..fb591dda 100644 --- a/docs/analysis_modules.md +++ b/docs/analysis_modules.md @@ -92,3 +92,60 @@ time_plot( move_legend_outside=True, ) ``` + +## Analysis Modules + +### Product Association Rules + +The product association module implements functionality for generating product association rules, a powerful technique +in retail analytics and market basket analysis. + +Product association rules are used to uncover relationships between different products that customers tend to purchase +together. These rules provide valuable insights into consumer behavior and purchasing patterns, which can be leveraged +by retail businesses in various ways: + +1. Cross-selling and upselling: By identifying products frequently bought together, retailers can make targeted product + recommendations to increase sales and average order value. + +2. Store layout optimization: Understanding product associations helps in strategic product placement within stores, + potentially increasing impulse purchases and overall sales. + +3. Inventory management: Knowing which products are often bought together aids in maintaining appropriate stock levels + and predicting demand. + +4. Marketing and promotions: Association rules can guide the creation of effective bundle offers and promotional + campaigns. + +5. Customer segmentation: Patterns in product associations can reveal distinct customer segments with specific + preferences. + +6. New product development: Insights from association rules can inform decisions about new product lines or features. + +The module uses metrics such as support, confidence, and uplift to quantify the strength and significance of product +associations: + +- Support: The frequency of items appearing together in transactions. +- Confidence: The likelihood of buying one product given the purchase of another. +- Uplift: The increase in purchase probability of one product when another is bought. 
+ +Example: + +```python +from pyretailscience.product_association import ProductAssociation + +pa = ProductAssociation( + df, + value_col="product_name", + group_col="transaction_id", +) +pa.df.head() +``` + +| product_name_1 | product_name_2 | occurrences_1 | occurrences_2 | cooccurrences | support | confidence | uplift | +|:-----------------|:-----------------------------|---------------:|---------------:|---------------:|---------:|-----------:|-------:| +| 100 Animals Book | 100% Organic Cold-Pressed... | 78 | 78 | 1 | 0.000039 | 0.0128205 | 4.18 | +| 100 Animals Book | 20K Sousaphone | 78 | 81 | 3 | 0.000117 | 0.0384615 | 12.10 | +| 100 Animals Book | 360 Sport 2.0 Boxer Briefs | 78 | 79 | 1 | 0.000039 | 0.0128205 | 4.13 | +| 100 Animals Book | 4-Series 4K UHD | 78 | 82 | 1 | 0.000039 | 0.0128205 | 3.98 | +| 100 Animals Book | 700S Eterna Trumpet | 78 | 71 | 1 | 0.000039 | 0.0128205 | 4.60 | + diff --git a/docs/api/product_association.md b/docs/api/product_association.md new file mode 100644 index 00000000..24192991 --- /dev/null +++ b/docs/api/product_association.md @@ -0,0 +1,3 @@ +# Product Associations + +::: pyretailscience.product_association diff --git a/docs/examples/product_association.ipynb b/docs/examples/product_association.ipynb new file mode 100644 index 00000000..25799f74 --- /dev/null +++ b/docs/examples/product_association.ipynb @@ -0,0 +1,679 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The product association module implements functionality for generating product association rules, a powerful technique \n", + "in retail analytics and market basket analysis.\n", + "\n", + "Product association rules are used to uncover relationships between different products that customers tend to purchase\n", + "together. These rules provide valuable insights into consumer behavior and purchasing patterns, which can be leveraged\n", + "by retail businesses in various ways:\n", + "\n", + "1. 
Cross-selling and upselling: By identifying products frequently bought together, retailers can make targeted product\n", + " recommendations to increase sales and average order value.\n", + "\n", + "2. Store layout optimization: Understanding product associations helps in strategic product placement within stores,\n", + " potentially increasing impulse purchases and overall sales.\n", + "\n", + "3. Inventory management: Knowing which products are often bought together aids in maintaining appropriate stock levels\n", + " and predicting demand.\n", + "\n", + "4. Marketing and promotions: Association rules can guide the creation ofeffective bundle offers and promotional\n", + " campaigns.\n", + "\n", + "5. Customer segmentation: Patterns in product associations can reveal distinct customer segments with specific\n", + " preferences.\n", + "\n", + "6. New product development: Insights from association rules can inform decisions about new product lines or features.\n", + "\n", + "The module uses metrics such as support, confidence, and uplift to quantifythe strength and significance of product\n", + "associations:\n", + "\n", + "- Support: The frequency of items appearing together in transactions.\n", + "- Confidence: The likelihood of buying one product given the purchase of another.\n", + "- Uplift: The increase in purchase probability of one product when another is bought.\n", + "\n", + "### Setup\n", + "\n", + "We'll start by loading some simulated data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_idtransaction_datetimecustomer_idproduct_idproduct_namecategory_0_namecategory_0_idcategory_1_namecategory_1_idbrand_namebrand_idunit_pricequantitytotal_pricestore_id
071082023-01-12 17:44:29115Spawn FigureToys1Action Figures1McFarlane Toys327.99255.986
171082023-01-12 17:44:2911317Gone GirlBooks8Mystery & Thrillers53Alfred A. Knopf26410.49110.496
245532023-02-05 09:31:421509Ryzen 3 3300XElectronics3Computer Components21AMD102120.003360.004
345532023-02-05 09:31:421735Linden Wood Paneled MirrorHome5Home Decor30Pottery Barn147599.001599.004
445532023-02-05 09:31:4211107Pro-V Daily Moisture Renewal ConditionerBeauty7Hair Care45Pantene2224.9914.994
\n", + "
" + ], + "text/plain": [ + " transaction_id transaction_datetime customer_id product_id \\\n", + "0 7108 2023-01-12 17:44:29 1 15 \n", + "1 7108 2023-01-12 17:44:29 1 1317 \n", + "2 4553 2023-02-05 09:31:42 1 509 \n", + "3 4553 2023-02-05 09:31:42 1 735 \n", + "4 4553 2023-02-05 09:31:42 1 1107 \n", + "\n", + " product_name category_0_name category_0_id \\\n", + "0 Spawn Figure Toys 1 \n", + "1 Gone Girl Books 8 \n", + "2 Ryzen 3 3300X Electronics 3 \n", + "3 Linden Wood Paneled Mirror Home 5 \n", + "4 Pro-V Daily Moisture Renewal Conditioner Beauty 7 \n", + "\n", + " category_1_name category_1_id brand_name brand_id unit_price \\\n", + "0 Action Figures 1 McFarlane Toys 3 27.99 \n", + "1 Mystery & Thrillers 53 Alfred A. Knopf 264 10.49 \n", + "2 Computer Components 21 AMD 102 120.00 \n", + "3 Home Decor 30 Pottery Barn 147 599.00 \n", + "4 Hair Care 45 Pantene 222 4.99 \n", + "\n", + " quantity total_price store_id \n", + "0 2 55.98 6 \n", + "1 1 10.49 6 \n", + "2 3 360.00 4 \n", + "3 1 599.00 4 \n", + "4 1 4.99 4 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_parquet(\"../../data/transactions.parquet\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique customers: 4250\n", + "Number of unique transactions: 25490\n" + ] + } + ], + "source": [ + "print(f\"Number of unique customers: {df['customer_id'].nunique()}\")\n", + "print(f\"Number of unique transactions: {df['transaction_id'].nunique()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we'll see simple example to generate the production association rules." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
product_name_1product_name_2occurrences_1occurrences_2cooccurrencessupportconfidenceuplift
0100 Animals Book100% Organic Cold-Pressed Rose Hip Seed Oil787810.0000390.0128214.189678
1100 Animals Book20K Sousaphone788130.0001180.03846212.103514
2100 Animals Book360 Sport 2.0 Boxer Briefs787910.0000390.0128214.136644
3100 Animals Book4-Series 4K UHD788210.0000390.0128213.985303
4100 Animals Book700S Eterna Trumpet787110.0000390.0128214.602745
\n", + "
" + ], + "text/plain": [ + " product_name_1 product_name_2 \\\n", + "0 100 Animals Book 100% Organic Cold-Pressed Rose Hip Seed Oil \n", + "1 100 Animals Book 20K Sousaphone \n", + "2 100 Animals Book 360 Sport 2.0 Boxer Briefs \n", + "3 100 Animals Book 4-Series 4K UHD \n", + "4 100 Animals Book 700S Eterna Trumpet \n", + "\n", + " occurrences_1 occurrences_2 cooccurrences support confidence \\\n", + "0 78 78 1 0.000039 0.012821 \n", + "1 78 81 3 0.000118 0.038462 \n", + "2 78 79 1 0.000039 0.012821 \n", + "3 78 82 1 0.000039 0.012821 \n", + "4 78 71 1 0.000039 0.012821 \n", + "\n", + " uplift \n", + "0 4.189678 \n", + "1 12.103514 \n", + "2 4.136644 \n", + "3 3.985303 \n", + "4 4.602745 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyretailscience.product_association import ProductAssociation\n", + "\n", + "pa = ProductAssociation(\n", + " df,\n", + " value_col=\"product_name\",\n", + " group_col=\"transaction_id\",\n", + ")\n", + "pa.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also limit the returned items to those that include a specific item." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
product_name_1product_name_2occurrences_1occurrences_2cooccurrencessupportconfidenceuplift
04-Series 4K UHD100 Animals Book827810.0000390.0121953.985303
14-Series 4K UHD122HD45 Gas Hedge Trimmer829210.0000390.0121953.378844
24-Series 4K UHD2-in-1 Touch & Learn Tablet828120.0000780.0243907.675399
34-Series 4K UHD20K Sousaphone828110.0000390.0121953.837699
44-Series 4K UHD3 Minute Miracle Deep Conditioner827010.0000390.0121954.440767
\n", + "
" + ], + "text/plain": [ + " product_name_1 product_name_2 occurrences_1 \\\n", + "0 4-Series 4K UHD 100 Animals Book 82 \n", + "1 4-Series 4K UHD 122HD45 Gas Hedge Trimmer 82 \n", + "2 4-Series 4K UHD 2-in-1 Touch & Learn Tablet 82 \n", + "3 4-Series 4K UHD 20K Sousaphone 82 \n", + "4 4-Series 4K UHD 3 Minute Miracle Deep Conditioner 82 \n", + "\n", + " occurrences_2 cooccurrences support confidence uplift \n", + "0 78 1 0.000039 0.012195 3.985303 \n", + "1 92 1 0.000039 0.012195 3.378844 \n", + "2 81 2 0.000078 0.024390 7.675399 \n", + "3 81 1 0.000039 0.012195 3.837699 \n", + "4 70 1 0.000039 0.012195 4.440767 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pa_specific_item = ProductAssociation(\n", + " df,\n", + " value_col=\"product_name\",\n", + " group_col=\"transaction_id\",\n", + " target_item=\"4-Series 4K UHD\",\n", + ")\n", + "pa_specific_item.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can filter the returned results by,\n", + "- Mininum occurrences of an item\n", + "- Mininum cooccurrences of pair of items\n", + "- Mininum support of a pair of items\n", + "- Mininum confidence of a pair of items\n", + "- Mininum uplift of a pair of items" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
product_name_1product_name_2occurrences_1occurrences_2cooccurrencessupportconfidenceuplift
0100 Animals Book20K Sousaphone788130.0001180.03846212.103514
1100 Animals BookActivia Probiotic Yogurt785720.0000780.02564111.466487
2100 Animals BookAether AG 70 Pack787220.0000780.0256419.077635
3100 Animals BookAll Natural Plain Yogurt786210.0000390.0128215.270885
4100 Animals BookAmerican Ultra Jazz Bass785920.0000780.02564111.077792
\n", + "
" + ], + "text/plain": [ + " product_name_1 product_name_2 occurrences_1 occurrences_2 \\\n", + "0 100 Animals Book 20K Sousaphone 78 81 \n", + "1 100 Animals Book Activia Probiotic Yogurt 78 57 \n", + "2 100 Animals Book Aether AG 70 Pack 78 72 \n", + "3 100 Animals Book All Natural Plain Yogurt 78 62 \n", + "4 100 Animals Book American Ultra Jazz Bass 78 59 \n", + "\n", + " cooccurrences support confidence uplift \n", + "0 3 0.000118 0.038462 12.103514 \n", + "1 2 0.000078 0.025641 11.466487 \n", + "2 2 0.000078 0.025641 9.077635 \n", + "3 1 0.000039 0.012821 5.270885 \n", + "4 2 0.000078 0.025641 11.077792 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pa_min_uplift = ProductAssociation(\n", + " df,\n", + " value_col=\"product_name\",\n", + " group_col=\"transaction_id\",\n", + " min_uplift=5,\n", + ")\n", + "pa_min_uplift.df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mkdocs.yml b/mkdocs.yml index fb71bc76..be7a11ad 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -15,6 +15,7 @@ nav: - Cross Shop Analysis: examples/cross_shop.ipynb - Data Contracts: examples/data_contracts.ipynb - Segmentation: examples/segmentation.ipynb + - Product Association: examples/product_association.ipynb - Reference: - Gain Loss: api/gain_loss.md - Range Planning: api/range_planning.md @@ -22,6 +23,7 @@ nav: - Customer Retention: api/customer.md - Standard Graphs: api/standard_graphs.md - Cross Shop Analysis: api/cross_shop.md + - Product Association: api/product_association.md - Contracts: api/contracts.md theme: diff --git 
a/pyretailscience/product_association.py b/pyretailscience/product_association.py new file mode 100644 index 00000000..1f39d10b --- /dev/null +++ b/pyretailscience/product_association.py @@ -0,0 +1,304 @@ +"""Product Association Rules Generation. + +This module implements functionality for generating product association rules, a powerful technique in retail analytics +and market basket analysis. + +Product association rules are used to uncover relationships between different products that customers tend to purchase +together. These rules provide valuable insights into consumer behavior and purchasing patterns, which can be leveraged +by retail businesses in various ways: + +1. Cross-selling and upselling: By identifying products frequently bought together, retailers can make targeted product + recommendations to increase sales and average order value. + +2. Store layout optimization: Understanding product associations helps in strategic product placement within stores, + potentially increasing impulse purchases and overall sales. + +3. Inventory management: Knowing which products are often bought together aids in maintaining appropriate stock levels + and predicting demand. + +4. Marketing and promotions: Association rules can guide the creation of effective bundle offers and promotional + campaigns. + +5. Customer segmentation: Patterns in product associations can reveal distinct customer segments with specific + preferences. + +6. New product development: Insights from association rules can inform decisions about new product lines or features. + +The module uses metrics such as support, confidence, and uplift to quantify the strength and significance of product +associations: + +- Support: The frequency of items appearing together in transactions. +- Confidence: The likelihood of buying one product given the purchase of another. +- Uplift: The increase in purchase probability of one product when another is bought. 
+ +By leveraging these association rules, retailers can make data-driven decisions to enhance customer experience, optimize +operations, and drive business growth. +""" + +from itertools import combinations +from typing import Literal + +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix + +from pyretailscience.data.contracts import CustomContract, build_expected_columns, build_non_null_columns + + +class ProductAssociation: + """A class for generating and analyzing product association rules. + + This class calculates association rules between products based on transaction data, + helping to identify patterns in customer purchasing behavior. + + Args: + df (pandas.DataFrame): The input DataFrame containing transaction data. + value_col (str): The name of the column in the input DataFrame that contains + the product identifiers. + group_col (str, optional): The name of the column that identifies unique + transactions or customers. Defaults to "customer_id". + target_item (str or None, optional): A specific product to focus the + association analysis on. If None, associations for all products are + calculated. Defaults to None. + + Attributes: + df (pandas.DataFrame): A DataFrame containing the calculated association + rules and their metrics. + + Example: + >>> import pandas as pd + >>> transaction_df = pd.DataFrame({ + ... 'customer_id': [1, 1, 2, 2, 3], + ... 'product_id': ['A', 'B', 'B', 'C', 'A'] + ... }) + >>> pa = ProductAssociation(df=transaction_df, value_col='product_id', group_col='customer_id') + >>> print(pa.df) # View the calculated association rules + + Note: + The resulting DataFrame (pa.df) contains the following columns: + - product_1, product_2: The pair of products for which the association is calculated. + - occurrences_1, occurrences_2: The number of transactions containing each product. + - cooccurrences: The number of transactions containing both products. 
+ - support: The proportion of transactions containing both products. + - confidence: The probability of buying product_2 given that product_1 was bought. + - uplift: The ratio of the observed support to the expected support if the products were independent. + + The class uses efficient sparse matrix operations to handle large datasets and + calculates associations for either pairs (2) or triples (3) of products, depending + on the 'number_of_combinations' parameter in _calc_association. + """ + + def __init__( + self, + df: pd.DataFrame, + value_col: str, + group_col: str = "customer_id", + target_item: str | None = None, + number_of_combinations: Literal[2, 3] = 2, + min_occurrences: int = 1, + min_cooccurrences: int = 1, + min_support: float = 0.0, + min_confidence: float = 0.0, + min_uplift: float = 0.0, + ) -> None: + """Initialize the ProductAssociation object. + + Args: + df (pandas.DataFrame): The input DataFrame containing transaction data. + value_col (str): The name of the column in the input DataFrame that contains the product identifiers. + group_col (str, optional): The name of the column that identifies unique transactions or customers. Defaults + to "customer_id". + target_item (str or None, optional): A specific product to focus the association analysis on. If None, + associations for all products are calculated. Defaults to None. + number_of_combinations (int, optional): The number of products to consider in the association analysis. Can + be either 2 or 3. Defaults to 2. + min_occurrences (int, optional): The minimum number of occurrences required for each product in the + association analysis. Defaults to 1. Must be at least 1. + min_cooccurrences (int, optional): The minimum number of co-occurrences required for the product pairs in + the association analysis. Defaults to 1. Must be at least 1. + min_support (float, optional): The minimum support value required for the association rules. Defaults to + 0.0. Must be between 0 and 1. 
+ min_confidence (float, optional): The minimum confidence value required for the association rules. Defaults + to 0.0. Must be between 0 and 1. + min_uplift (float, optional): The minimum uplift value required for the association rules. Defaults to 0.0. + Must be greater or equal to 0. + + Raises: + ValueError: If the number of combinations is not 2 or 3, or if any of the minimum values are invalid. + ValueError: If the minimum support, confidence, or uplift values are outside the valid range. + ValueError: If the minimum occurrences or cooccurrences are less than 1. + ValueError: If the input DataFrame does not contain the required columns or if they have null values. + """ + required_cols = [group_col, value_col] + contract = CustomContract( + df, + basic_expectations=build_expected_columns(columns=required_cols), + extended_expectations=build_non_null_columns(columns=required_cols), + ) + if contract.validate() is False: + msg = f"The dataframe requires the columns {required_cols} and they must be non-null" + raise ValueError(msg) + + self.df = self._calc_association( + df=df, + value_col=value_col, + group_col=group_col, + target_item=target_item, + number_of_combinations=number_of_combinations, + min_occurrences=min_occurrences, + min_cooccurrences=min_cooccurrences, + min_support=min_support, + min_confidence=min_confidence, + min_uplift=min_uplift, + ) + + @staticmethod + def _calc_association( # noqa: C901 (ignore complexity) - Excluded due to min_* arguments checks + df: pd.DataFrame, + value_col: str, + group_col: str = "customer_id", + target_item: str | None = None, + number_of_combinations: Literal[2, 3] = 2, + min_occurrences: int = 1, + min_cooccurrences: int = 1, + min_support: float = 0.0, + min_confidence: float = 0.0, + min_uplift: float = 0.0, + ) -> pd.DataFrame: + """Calculate product association rules based on transaction data. 
+ + This method calculates association rules between products based on transaction data, + helping to identify patterns in customer purchasing behavior. + + Args: + df (pandas.DataFrame): The input DataFrame containing transaction data. + value_col (str): The name of the column in the input DataFrame that contains the product identifiers. + group_col (str, optional): The name of the column that identifies unique transactions or customers. Defaults + to "customer_id". + target_item (str or None, optional): A specific product to focus the association analysis on. If None, + associations for all products are calculated. Defaults to None. + number_of_combinations (int, optional): The number of products to consider in the association analysis. Can + be either 2 or 3. Defaults to 2. + min_occurrences (int, optional): The minimum number of occurrences required for each product in the + association analysis. Defaults to 1. Must be at least 1. + min_cooccurrences (int, optional): The minimum number of co-occurrences required for the product pairs in + the association analysis. Defaults to 1. Must be at least 1. + min_support (float, optional): The minimum support value required for the association rules. Defaults to + 0.0. Must be between 0 and 1. + min_confidence (float, optional): The minimum confidence value required for the association rules. Defaults + to 0.0. Must be between 0 and 1. + min_uplift (float, optional): The minimum uplift value required for the association rules. Defaults to 0.0. + Must be greater or equal to 0. + + Returns: + pandas.DataFrame: A DataFrame containing the calculated association rules and their metrics. + + Raises: + ValueError: If the number of combinations is not 2 or 3, or if any of the minimum values are invalid. + ValueError: If the minimum support, confidence, or uplift values are outside the valid range. + ValueError: If the minimum occurrences or cooccurrences are less than 1. 
+ + Note: + The resulting DataFrame contains the following columns: + - product_1, product_2: The pair of products for which the association is calculated. + - occurrences_1, occurrences_2: The number of transactions containing each product. + - cooccurrences: The number of transactions containing both products. + - support: The proportion of transactions containing both products. + - confidence: The probability of buying product_2 given that product_1 was bought. + - uplift: The ratio of the observed support to the expected support if the products were independent. + + The method uses efficient sparse matrix operations to handle large datasets and + calculates associations for either pairs (2) or triples (3) of products, depending + on the 'number_of_combinations' parameter. + """ + if number_of_combinations not in [2, 3]: + raise ValueError("Number of combinations must be either 2 or 3.") + if min_occurrences < 1: + raise ValueError("Minimum occurrences must be at least 1.") + if min_cooccurrences < 1: + raise ValueError("Minimum cooccurrences must be at least 1.") + if min_support < 0.0 or min_support > 1.0: + raise ValueError("Minimum support must be between 0 and 1.") + if min_confidence < 0.0 or min_confidence > 1.0: + raise ValueError("Minimum confidence must be between 0 and 1.") + if min_uplift < 0.0: + raise ValueError("Minimum uplift must be greater or equal to 0.") + + unique_combo_df = df[[group_col, value_col]].drop_duplicates() + unique_combo_df[value_col] = pd.Categorical(unique_combo_df[value_col], ordered=True) + unique_combo_df[group_col] = pd.Categorical(unique_combo_df[group_col], ordered=True) + + sparse_matrix = csr_matrix( + ( + [1] * len(unique_combo_df), + ( + unique_combo_df[group_col].cat.codes, + unique_combo_df[value_col].cat.codes, + ), + ), + ) + + row_count = sparse_matrix.shape[0] + + results = [] + + occurrences = np.array(sparse_matrix.sum(axis=0)).flatten() + occurence_prob = occurrences / row_count + + items = [target_item] + 
if target_item is None: + if number_of_combinations == 2: # noqa: PLR2004 + items = unique_combo_df[value_col].cat.categories + elif number_of_combinations == 3: # noqa: PLR2004 + items = sorted(combinations(unique_combo_df[value_col].cat.categories, 2)) + + for item_2 in items: + if isinstance(item_2, tuple): + target_item_col_index = [unique_combo_df[value_col].cat.categories.get_loc(i) for i in item_2] + rows_with_target_item = (sparse_matrix[:, target_item_col_index].toarray() == 1).all(axis=1) + else: + target_item_col_index = unique_combo_df[value_col].cat.categories.get_loc(item_2) + rows_with_target_item = sparse_matrix[:, target_item_col_index].toarray().ravel() == 1 + + rows_with_target_item_sum = rows_with_target_item.sum() + + cooccurrences = np.array(sparse_matrix[rows_with_target_item, :].sum(axis=0)).flatten() + if (cooccurrences == 0).all(): + continue + + coocurrence_prob = cooccurrences / row_count + + target_prob = rows_with_target_item_sum / row_count + expected_prob = target_prob * occurence_prob + + pa_df = pd.DataFrame( + { + f"{value_col}_1": [item_2] * sparse_matrix.shape[1], + f"{value_col}_2": unique_combo_df[value_col].cat.categories, + "occurrences_1": rows_with_target_item_sum, + "occurrences_2": occurrences, + "cooccurrences": cooccurrences, + "support": coocurrence_prob, + "confidence": cooccurrences / rows_with_target_item_sum, + "uplift": coocurrence_prob / expected_prob, + }, + ) + + if isinstance(item_2, tuple): + dupe_pairs_idx = pa_df.apply(lambda x: x[f"{value_col}_2"] in x[f"{value_col}_1"], axis=1) + else: + dupe_pairs_idx = pa_df[f"{value_col}_1"] == pa_df[f"{value_col}_2"] + + excl_pairs_idx = ( + dupe_pairs_idx + | (pa_df["occurrences_1"] < min_occurrences) + | (pa_df["occurrences_2"] < min_occurrences) + | (pa_df["cooccurrences"] < min_cooccurrences) + | (pa_df["support"] < min_support) + | (pa_df["confidence"] < min_confidence) + | (pa_df["uplift"] < min_uplift) + ) + + results.append(pa_df[~excl_pairs_idx]) + + return 
pd.concat(results).sort_values([f"{value_col}_1", f"{value_col}_2"]).reset_index(drop=True) diff --git a/tests/test_product_association.py b/tests/test_product_association.py new file mode 100644 index 00000000..15b03935 --- /dev/null +++ b/tests/test_product_association.py @@ -0,0 +1,330 @@ +"""Tests for the ProductAssociation module.""" + +import pandas as pd +import pytest + +from pyretailscience.product_association import ProductAssociation + + +class TestProductAssociations: + """Tests for the ProductAssociations class.""" + + @pytest.fixture() + def transactions_df(self) -> pd.DataFrame: + """Return a sample DataFrame for testing.""" + # fmt: off + return pd.DataFrame({ + "transaction_id": [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5], + "product": ["milk", "bread", "fruit", "butter", "eggs", "fruit", "beer", "diapers", + "milk", "bread", "butter", "eggs", "fruit", "bread"], + }) + # fmt: on + + @pytest.fixture() + def expected_results_single_items_df(self) -> pd.DataFrame: + """Return the expected results for the single items association analysis.""" + # fmt: off + return pd.DataFrame( + { + "product_1": [ + "beer", "bread", "bread", "bread", "bread", "butter", "butter", "butter", "butter", "diapers", + "eggs", "eggs", "eggs", "eggs", "fruit", "fruit", "fruit", "fruit", "milk", "milk", "milk", + "milk", + ], + "product_2": [ + "diapers", "butter", "eggs", "fruit", "milk", "bread", "eggs", "fruit", "milk", "beer", "bread", + "butter", "fruit", "milk", "bread", "butter", "eggs", "milk", "bread", "butter", "eggs", + "fruit", + ], + "occurrences_1": [1, 3, 3, 3, 3, 2, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2], + "occurrences_2": [1, 2, 2, 3, 2, 3, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 2, 2, 3, 2, 2, 3], + "cooccurrences": [1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2], + "support": [ + 0.2, 0.2, 0.2, 0.4, 0.4, 0.2, 0.4, 0.4, 0.2, 0.2, 0.2, 0.4, 0.4, 0.2, 0.4, 0.4, 0.4, 0.4, 0.4, + 0.2, 0.2, 0.4, + ], + "confidence": [ + 1.0, 0.333333, 0.333333, 
0.666667, 0.666667, 0.5, 1.0, 1.0, 0.5, 1.0, 0.5, 1.0, 1.0, 0.5, + 0.666667, 0.666667, 0.666667, 0.666667, 1.0, 0.5, 0.5, 1.0, + ], + "uplift": [ + 5.0, 0.833333, 0.833333, 1.111111, 1.666667, 0.833333, 2.5, 1.666667, 1.25, 5.0, 0.833333, 2.5, + 1.666667, 1.25, 1.111111, 1.666667, 1.666667, 1.666667, 1.666667, 1.25, 1.25, 1.666667, + ], + }, + ) + # fmt: on + + @pytest.fixture() + def expected_results_pair_items_df(self) -> pd.DataFrame: + """Return the expected results for the pair items association analysis.""" + # fmt: off + return pd.DataFrame( + { + "product_1": [ + ("bread", "butter"), ("bread", "butter"), ("bread", "butter"), ("bread", "eggs"), ("bread", "eggs"), + ("bread", "eggs"), ("bread", "fruit"), ("bread", "fruit"), ("bread", "fruit"), ("bread", "milk"), + ("bread", "milk"), ("bread", "milk"), ("butter", "eggs"), ("butter", "eggs"), ("butter", "eggs"), + ("butter", "fruit"), ("butter", "fruit"), ("butter", "fruit"), ("butter", "milk"), + ("butter", "milk"), ("butter", "milk"), ("eggs", "fruit"), ("eggs", "fruit"), ("eggs", "fruit"), + ("eggs", "milk"), ("eggs", "milk"), ("eggs", "milk"), ("fruit", "milk"), ("fruit", "milk"), + ("fruit", "milk"), + ], + "product_2": [ + "eggs", "fruit", "milk", "butter", "fruit", "milk", "butter", "eggs", "milk", "butter", "eggs", + "fruit", "bread", "fruit", "milk", "bread", "eggs", "milk", "bread", "eggs", "fruit", "bread", + "butter", "milk", "bread", "butter", "fruit", "bread", "butter", "eggs", + ], + "occurrences_1": [ + 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, + ], + "occurrences_2": [ + 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 2, 2, 3, 2, 3, 3, 2, 2, 3, 2, 3, 3, 2, 2, + ], + "cooccurrences": [ + 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, + ], + "support": [ + 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.2, 0.2, 0.4, 0.2, 0.4, 0.2, 0.2, 0.4, 0.2, 0.2, 0.2, + 0.2, 0.2, 0.4, 0.2, 0.2, 0.2, 0.2, 0.4, 0.2, 0.2, + 
], + "confidence": [ + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 0.5, 0.5, 1.0, 0.5, 1.0, 0.5, 0.5, 1.0, 0.5, 1.0, 1.0, + 1.0, 0.5, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5, + ], + "uplift": [ + 2.5, 1.666667, 2.5, 2.5, 1.666667, 2.5, 1.25, 1.25, 2.5, 1.25, 1.25, 1.666667, 0.833333, 1.666667, + 1.25, 0.833333, 2.5, 1.25, 1.666667, 2.5, 1.666667, 0.833333, 2.5, 1.25, 1.666667, 2.5, 1.666667, + 1.666667, 1.25,1.25, + ], + }, + ) + # fmt: on + + def test_calc_association_all_single_items(self, transactions_df, expected_results_single_items_df): + """Test calculating association rules for a single item versus another of item for all items.""" + associations_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + ) + + pd.testing.assert_frame_equal(associations_df, expected_results_single_items_df) + + def test_calc_association_target_single_items(self, transactions_df, expected_results_single_items_df): + """Test calculating association rules for target single item versus another of item.""" + target_item = "bread" + + calc_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + target_item=target_item, + ) + + pd.testing.assert_frame_equal( + calc_df, + expected_results_single_items_df[expected_results_single_items_df["product_1"] == target_item].reset_index( + drop=True, + ), + ) + + def test_calc_association_all_pair_items(self, transactions_df, expected_results_pair_items_df): + """Test calculating association rules for a pairs of items versus another item for all items.""" + calc_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + number_of_combinations=3, + ) + + pd.testing.assert_frame_equal(calc_df, expected_results_pair_items_df) + + def test_calc_association_target_pair_items(self, transactions_df, expected_results_pair_items_df): + """Test calculating association 
rules for a target pair of items versus another item.""" + calc_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + number_of_combinations=3, + target_item=("bread", "butter"), + ) + + pd.testing.assert_frame_equal( + calc_df, + expected_results_pair_items_df[ + expected_results_pair_items_df["product_1"] == ("bread", "butter") + ].reset_index(drop=True), + ) + + def test_calc_association_min_occurrences(self, transactions_df, expected_results_single_items_df): + """Test calculating association rules with a min occurrences level.""" + min_occurrences = 2 + + calc_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_occurrences=min_occurrences, + ) + + pd.testing.assert_frame_equal( + calc_df, + expected_results_single_items_df[ + (expected_results_single_items_df["occurrences_1"] >= min_occurrences) + & (expected_results_single_items_df["occurrences_2"] >= min_occurrences) + ].reset_index(drop=True), + ) + + def test_calc_association_min_cooccurrences(self, transactions_df, expected_results_single_items_df): + """Test calculating association rules with a min cooccurrences level.""" + min_cooccurrences = 2 + + calc_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_cooccurrences=min_cooccurrences, + ) + + pd.testing.assert_frame_equal( + calc_df, + expected_results_single_items_df[ + (expected_results_single_items_df["cooccurrences"] >= min_cooccurrences) + ].reset_index(drop=True), + ) + + def test_calc_association_min_support(self, transactions_df, expected_results_single_items_df): + """Test calculating association rules with a min support level.""" + min_support = 0.25 + + calc_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_support=min_support, + ) + + 
pd.testing.assert_frame_equal( + calc_df, + expected_results_single_items_df[(expected_results_single_items_df["support"] >= min_support)].reset_index( + drop=True, + ), + ) + + def test_calc_association_min_confidence(self, transactions_df, expected_results_single_items_df): + """Test calculating association rules with a min confidence level.""" + min_confidence = 0.25 + + calc_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_confidence=min_confidence, + ) + + pd.testing.assert_frame_equal( + calc_df, + expected_results_single_items_df[ + (expected_results_single_items_df["confidence"] >= min_confidence) + ].reset_index(drop=True), + ) + + def test_calc_association_min_uplift(self, transactions_df, expected_results_single_items_df): + """Test calculating association rules with a min uplift level.""" + min_uplift = 1 + + calc_df = ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_uplift=min_uplift, + ) + + pd.testing.assert_frame_equal( + calc_df, + expected_results_single_items_df[(expected_results_single_items_df["uplift"] >= min_uplift)].reset_index( + drop=True, + ), + ) + + def test_calc_association_invalid_number_of_combinations(self, transactions_df): + """Test calculating association rules with an invalid number of combinations.""" + with pytest.raises(ValueError, match="Number of combinations must be either 2 or 3."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + number_of_combinations=4, + ) + with pytest.raises(ValueError, match="Number of combinations must be either 2 or 3."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + number_of_combinations=1, + ) + + def test_calc_association_invalid_min_occurrences(self, transactions_df): + """Test calculating association rules 
with an invalid minimum occurrences value.""" + with pytest.raises(ValueError, match="Minimum occurrences must be at least 1."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_occurrences=0, + ) + + def test_calc_association_invalid_min_cooccurrences(self, transactions_df): + """Test calculating association rules with an invalid minimum cooccurrences value.""" + with pytest.raises(ValueError, match="Minimum cooccurrences must be at least 1."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_cooccurrences=0, + ) + + def test_calc_association_min_support_invalid_range(self, transactions_df): + """Test calculating association rules with an invalid minimum support range.""" + with pytest.raises(ValueError, match="Minimum support must be between 0 and 1."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_support=-0.1, + ) + with pytest.raises(ValueError, match="Minimum support must be between 0 and 1."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_support=1.1, + ) + + def test_calc_association_min_confidence_invalid_range(self, transactions_df): + """Test calculating association rules with an invalid minimum confidence range.""" + with pytest.raises(ValueError, match="Minimum confidence must be between 0 and 1."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_confidence=-0.1, + ) + with pytest.raises(ValueError, match="Minimum confidence must be between 0 and 1."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_confidence=1.1, + ) + + def test_calc_association_min_uplift_invalid_range(self, transactions_df): + """Test calculating 
association rules with an invalid minimum uplift range.""" + with pytest.raises(ValueError, match="Minimum uplift must be greater or equal to 0."): + ProductAssociation._calc_association( + df=transactions_df, + value_col="product", + group_col="transaction_id", + min_uplift=-0.1, + )