diff --git a/docs/analysis_modules.md b/docs/analysis_modules.md
index b1cf84d2..fb591dda 100644
--- a/docs/analysis_modules.md
+++ b/docs/analysis_modules.md
@@ -92,3 +92,60 @@ time_plot(
move_legend_outside=True,
)
```
+
+## Analysis Modules
+
+### Product Association Rules
+
+The product association module implements functionality for generating product association rules, a powerful technique
+in retail analytics and market basket analysis.
+
+Product association rules are used to uncover relationships between different products that customers tend to purchase
+together. These rules provide valuable insights into consumer behavior and purchasing patterns, which can be leveraged
+by retail businesses in various ways:
+
+1. Cross-selling and upselling: By identifying products frequently bought together, retailers can make targeted product
+ recommendations to increase sales and average order value.
+
+2. Store layout optimization: Understanding product associations helps in strategic product placement within stores,
+ potentially increasing impulse purchases and overall sales.
+
+3. Inventory management: Knowing which products are often bought together aids in maintaining appropriate stock levels
+ and predicting demand.
+
+4. Marketing and promotions: Association rules can guide the creation of effective bundle offers and promotional
+ campaigns.
+
+5. Customer segmentation: Patterns in product associations can reveal distinct customer segments with specific
+ preferences.
+
+6. New product development: Insights from association rules can inform decisions about new product lines or features.
+
+The module uses metrics such as support, confidence, and uplift to quantify the strength and significance of product
+associations:
+
+- Support: The frequency of items appearing together in transactions.
+- Confidence: The likelihood of buying one product given the purchase of another.
+- Uplift: The increase in purchase probability of one product when another is bought.
+
+Example:
+
+```python
+from pyretailscience.product_association import ProductAssociation
+
+pa = ProductAssociation(
+ df,
+ value_col="product_name",
+ group_col="transaction_id",
+)
+pa.df.head()
+```
+
+| product_name_1 | product_name_2 | occurrences_1 | occurrences_2 | cooccurrences | support | confidence | uplift |
+|:-----------------|:-----------------------------|---------------:|---------------:|---------------:|---------:|-----------:|-------:|
+| 100 Animals Book | 100% Organic Cold-Pressed... | 78 | 78 | 1 | 0.000039 | 0.0128205 | 4.18 |
+| 100 Animals Book | 20K Sousaphone | 78 | 81 | 3 | 0.000117 | 0.0384615 | 12.10 |
+| 100 Animals Book | 360 Sport 2.0 Boxer Briefs | 78 | 79 | 1 | 0.000039 | 0.0128205 | 4.13 |
+| 100 Animals Book | 4-Series 4K UHD | 78 | 82 | 1 | 0.000039 | 0.0128205 | 3.98 |
+| 100 Animals Book | 700S Eterna Trumpet | 78 | 71 | 1 | 0.000039 | 0.0128205 | 4.60 |
+
diff --git a/docs/api/product_association.md b/docs/api/product_association.md
new file mode 100644
index 00000000..24192991
--- /dev/null
+++ b/docs/api/product_association.md
@@ -0,0 +1,3 @@
+# Product Associations
+
+::: pyretailscience.product_association
diff --git a/docs/examples/product_association.ipynb b/docs/examples/product_association.ipynb
new file mode 100644
index 00000000..25799f74
--- /dev/null
+++ b/docs/examples/product_association.ipynb
@@ -0,0 +1,679 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The product association module implements functionality for generating product association rules, a powerful technique \n",
+ "in retail analytics and market basket analysis.\n",
+ "\n",
+ "Product association rules are used to uncover relationships between different products that customers tend to purchase\n",
+ "together. These rules provide valuable insights into consumer behavior and purchasing patterns, which can be leveraged\n",
+ "by retail businesses in various ways:\n",
+ "\n",
+ "1. Cross-selling and upselling: By identifying products frequently bought together, retailers can make targeted product\n",
+ " recommendations to increase sales and average order value.\n",
+ "\n",
+ "2. Store layout optimization: Understanding product associations helps in strategic product placement within stores,\n",
+ " potentially increasing impulse purchases and overall sales.\n",
+ "\n",
+ "3. Inventory management: Knowing which products are often bought together aids in maintaining appropriate stock levels\n",
+ " and predicting demand.\n",
+ "\n",
+    "4. Marketing and promotions: Association rules can guide the creation of effective bundle offers and promotional\n",
+ " campaigns.\n",
+ "\n",
+ "5. Customer segmentation: Patterns in product associations can reveal distinct customer segments with specific\n",
+ " preferences.\n",
+ "\n",
+ "6. New product development: Insights from association rules can inform decisions about new product lines or features.\n",
+ "\n",
+    "The module uses metrics such as support, confidence, and uplift to quantify the strength and significance of product\n",
+ "associations:\n",
+ "\n",
+ "- Support: The frequency of items appearing together in transactions.\n",
+ "- Confidence: The likelihood of buying one product given the purchase of another.\n",
+ "- Uplift: The increase in purchase probability of one product when another is bought.\n",
+ "\n",
+ "### Setup\n",
+ "\n",
+ "We'll start by loading some simulated data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " transaction_id | \n",
+ " transaction_datetime | \n",
+ " customer_id | \n",
+ " product_id | \n",
+ " product_name | \n",
+ " category_0_name | \n",
+ " category_0_id | \n",
+ " category_1_name | \n",
+ " category_1_id | \n",
+ " brand_name | \n",
+ " brand_id | \n",
+ " unit_price | \n",
+ " quantity | \n",
+ " total_price | \n",
+ " store_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7108 | \n",
+ " 2023-01-12 17:44:29 | \n",
+ " 1 | \n",
+ " 15 | \n",
+ " Spawn Figure | \n",
+ " Toys | \n",
+ " 1 | \n",
+ " Action Figures | \n",
+ " 1 | \n",
+ " McFarlane Toys | \n",
+ " 3 | \n",
+ " 27.99 | \n",
+ " 2 | \n",
+ " 55.98 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 7108 | \n",
+ " 2023-01-12 17:44:29 | \n",
+ " 1 | \n",
+ " 1317 | \n",
+ " Gone Girl | \n",
+ " Books | \n",
+ " 8 | \n",
+ " Mystery & Thrillers | \n",
+ " 53 | \n",
+ " Alfred A. Knopf | \n",
+ " 264 | \n",
+ " 10.49 | \n",
+ " 1 | \n",
+ " 10.49 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4553 | \n",
+ " 2023-02-05 09:31:42 | \n",
+ " 1 | \n",
+ " 509 | \n",
+ " Ryzen 3 3300X | \n",
+ " Electronics | \n",
+ " 3 | \n",
+ " Computer Components | \n",
+ " 21 | \n",
+ " AMD | \n",
+ " 102 | \n",
+ " 120.00 | \n",
+ " 3 | \n",
+ " 360.00 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4553 | \n",
+ " 2023-02-05 09:31:42 | \n",
+ " 1 | \n",
+ " 735 | \n",
+ " Linden Wood Paneled Mirror | \n",
+ " Home | \n",
+ " 5 | \n",
+ " Home Decor | \n",
+ " 30 | \n",
+ " Pottery Barn | \n",
+ " 147 | \n",
+ " 599.00 | \n",
+ " 1 | \n",
+ " 599.00 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4553 | \n",
+ " 2023-02-05 09:31:42 | \n",
+ " 1 | \n",
+ " 1107 | \n",
+ " Pro-V Daily Moisture Renewal Conditioner | \n",
+ " Beauty | \n",
+ " 7 | \n",
+ " Hair Care | \n",
+ " 45 | \n",
+ " Pantene | \n",
+ " 222 | \n",
+ " 4.99 | \n",
+ " 1 | \n",
+ " 4.99 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " transaction_id transaction_datetime customer_id product_id \\\n",
+ "0 7108 2023-01-12 17:44:29 1 15 \n",
+ "1 7108 2023-01-12 17:44:29 1 1317 \n",
+ "2 4553 2023-02-05 09:31:42 1 509 \n",
+ "3 4553 2023-02-05 09:31:42 1 735 \n",
+ "4 4553 2023-02-05 09:31:42 1 1107 \n",
+ "\n",
+ " product_name category_0_name category_0_id \\\n",
+ "0 Spawn Figure Toys 1 \n",
+ "1 Gone Girl Books 8 \n",
+ "2 Ryzen 3 3300X Electronics 3 \n",
+ "3 Linden Wood Paneled Mirror Home 5 \n",
+ "4 Pro-V Daily Moisture Renewal Conditioner Beauty 7 \n",
+ "\n",
+ " category_1_name category_1_id brand_name brand_id unit_price \\\n",
+ "0 Action Figures 1 McFarlane Toys 3 27.99 \n",
+ "1 Mystery & Thrillers 53 Alfred A. Knopf 264 10.49 \n",
+ "2 Computer Components 21 AMD 102 120.00 \n",
+ "3 Home Decor 30 Pottery Barn 147 599.00 \n",
+ "4 Hair Care 45 Pantene 222 4.99 \n",
+ "\n",
+ " quantity total_price store_id \n",
+ "0 2 55.98 6 \n",
+ "1 1 10.49 6 \n",
+ "2 3 360.00 4 \n",
+ "3 1 599.00 4 \n",
+ "4 1 4.99 4 "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_parquet(\"../../data/transactions.parquet\")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of unique customers: 4250\n",
+ "Number of unique transactions: 25490\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"Number of unique customers: {df['customer_id'].nunique()}\")\n",
+ "print(f\"Number of unique transactions: {df['transaction_id'].nunique()}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Here we'll see a simple example to generate the product association rules."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_name_1 | \n",
+ " product_name_2 | \n",
+ " occurrences_1 | \n",
+ " occurrences_2 | \n",
+ " cooccurrences | \n",
+ " support | \n",
+ " confidence | \n",
+ " uplift | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100 Animals Book | \n",
+ " 100% Organic Cold-Pressed Rose Hip Seed Oil | \n",
+ " 78 | \n",
+ " 78 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012821 | \n",
+ " 4.189678 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100 Animals Book | \n",
+ " 20K Sousaphone | \n",
+ " 78 | \n",
+ " 81 | \n",
+ " 3 | \n",
+ " 0.000118 | \n",
+ " 0.038462 | \n",
+ " 12.103514 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100 Animals Book | \n",
+ " 360 Sport 2.0 Boxer Briefs | \n",
+ " 78 | \n",
+ " 79 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012821 | \n",
+ " 4.136644 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100 Animals Book | \n",
+ " 4-Series 4K UHD | \n",
+ " 78 | \n",
+ " 82 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012821 | \n",
+ " 3.985303 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100 Animals Book | \n",
+ " 700S Eterna Trumpet | \n",
+ " 78 | \n",
+ " 71 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012821 | \n",
+ " 4.602745 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_name_1 product_name_2 \\\n",
+ "0 100 Animals Book 100% Organic Cold-Pressed Rose Hip Seed Oil \n",
+ "1 100 Animals Book 20K Sousaphone \n",
+ "2 100 Animals Book 360 Sport 2.0 Boxer Briefs \n",
+ "3 100 Animals Book 4-Series 4K UHD \n",
+ "4 100 Animals Book 700S Eterna Trumpet \n",
+ "\n",
+ " occurrences_1 occurrences_2 cooccurrences support confidence \\\n",
+ "0 78 78 1 0.000039 0.012821 \n",
+ "1 78 81 3 0.000118 0.038462 \n",
+ "2 78 79 1 0.000039 0.012821 \n",
+ "3 78 82 1 0.000039 0.012821 \n",
+ "4 78 71 1 0.000039 0.012821 \n",
+ "\n",
+ " uplift \n",
+ "0 4.189678 \n",
+ "1 12.103514 \n",
+ "2 4.136644 \n",
+ "3 3.985303 \n",
+ "4 4.602745 "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from pyretailscience.product_association import ProductAssociation\n",
+ "\n",
+ "pa = ProductAssociation(\n",
+ " df,\n",
+ " value_col=\"product_name\",\n",
+ " group_col=\"transaction_id\",\n",
+ ")\n",
+ "pa.df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also limit the returned items to those that include a specific item."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_name_1 | \n",
+ " product_name_2 | \n",
+ " occurrences_1 | \n",
+ " occurrences_2 | \n",
+ " cooccurrences | \n",
+ " support | \n",
+ " confidence | \n",
+ " uplift | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4-Series 4K UHD | \n",
+ " 100 Animals Book | \n",
+ " 82 | \n",
+ " 78 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012195 | \n",
+ " 3.985303 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4-Series 4K UHD | \n",
+ " 122HD45 Gas Hedge Trimmer | \n",
+ " 82 | \n",
+ " 92 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012195 | \n",
+ " 3.378844 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4-Series 4K UHD | \n",
+ " 2-in-1 Touch & Learn Tablet | \n",
+ " 82 | \n",
+ " 81 | \n",
+ " 2 | \n",
+ " 0.000078 | \n",
+ " 0.024390 | \n",
+ " 7.675399 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4-Series 4K UHD | \n",
+ " 20K Sousaphone | \n",
+ " 82 | \n",
+ " 81 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012195 | \n",
+ " 3.837699 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4-Series 4K UHD | \n",
+ " 3 Minute Miracle Deep Conditioner | \n",
+ " 82 | \n",
+ " 70 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012195 | \n",
+ " 4.440767 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_name_1 product_name_2 occurrences_1 \\\n",
+ "0 4-Series 4K UHD 100 Animals Book 82 \n",
+ "1 4-Series 4K UHD 122HD45 Gas Hedge Trimmer 82 \n",
+ "2 4-Series 4K UHD 2-in-1 Touch & Learn Tablet 82 \n",
+ "3 4-Series 4K UHD 20K Sousaphone 82 \n",
+ "4 4-Series 4K UHD 3 Minute Miracle Deep Conditioner 82 \n",
+ "\n",
+ " occurrences_2 cooccurrences support confidence uplift \n",
+ "0 78 1 0.000039 0.012195 3.985303 \n",
+ "1 92 1 0.000039 0.012195 3.378844 \n",
+ "2 81 2 0.000078 0.024390 7.675399 \n",
+ "3 81 1 0.000039 0.012195 3.837699 \n",
+ "4 70 1 0.000039 0.012195 4.440767 "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pa_specific_item = ProductAssociation(\n",
+ " df,\n",
+ " value_col=\"product_name\",\n",
+ " group_col=\"transaction_id\",\n",
+ " target_item=\"4-Series 4K UHD\",\n",
+ ")\n",
+ "pa_specific_item.df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "You can filter the returned results by:\n",
+    "- Minimum occurrences of an item\n",
+    "- Minimum cooccurrences of a pair of items\n",
+    "- Minimum support of a pair of items\n",
+    "- Minimum confidence of a pair of items\n",
+    "- Minimum uplift of a pair of items"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_name_1 | \n",
+ " product_name_2 | \n",
+ " occurrences_1 | \n",
+ " occurrences_2 | \n",
+ " cooccurrences | \n",
+ " support | \n",
+ " confidence | \n",
+ " uplift | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100 Animals Book | \n",
+ " 20K Sousaphone | \n",
+ " 78 | \n",
+ " 81 | \n",
+ " 3 | \n",
+ " 0.000118 | \n",
+ " 0.038462 | \n",
+ " 12.103514 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100 Animals Book | \n",
+ " Activia Probiotic Yogurt | \n",
+ " 78 | \n",
+ " 57 | \n",
+ " 2 | \n",
+ " 0.000078 | \n",
+ " 0.025641 | \n",
+ " 11.466487 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100 Animals Book | \n",
+ " Aether AG 70 Pack | \n",
+ " 78 | \n",
+ " 72 | \n",
+ " 2 | \n",
+ " 0.000078 | \n",
+ " 0.025641 | \n",
+ " 9.077635 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100 Animals Book | \n",
+ " All Natural Plain Yogurt | \n",
+ " 78 | \n",
+ " 62 | \n",
+ " 1 | \n",
+ " 0.000039 | \n",
+ " 0.012821 | \n",
+ " 5.270885 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100 Animals Book | \n",
+ " American Ultra Jazz Bass | \n",
+ " 78 | \n",
+ " 59 | \n",
+ " 2 | \n",
+ " 0.000078 | \n",
+ " 0.025641 | \n",
+ " 11.077792 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_name_1 product_name_2 occurrences_1 occurrences_2 \\\n",
+ "0 100 Animals Book 20K Sousaphone 78 81 \n",
+ "1 100 Animals Book Activia Probiotic Yogurt 78 57 \n",
+ "2 100 Animals Book Aether AG 70 Pack 78 72 \n",
+ "3 100 Animals Book All Natural Plain Yogurt 78 62 \n",
+ "4 100 Animals Book American Ultra Jazz Bass 78 59 \n",
+ "\n",
+ " cooccurrences support confidence uplift \n",
+ "0 3 0.000118 0.038462 12.103514 \n",
+ "1 2 0.000078 0.025641 11.466487 \n",
+ "2 2 0.000078 0.025641 9.077635 \n",
+ "3 1 0.000039 0.012821 5.270885 \n",
+ "4 2 0.000078 0.025641 11.077792 "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pa_min_uplift = ProductAssociation(\n",
+ " df,\n",
+ " value_col=\"product_name\",\n",
+ " group_col=\"transaction_id\",\n",
+ " min_uplift=5,\n",
+ ")\n",
+ "pa_min_uplift.df.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/mkdocs.yml b/mkdocs.yml
index fb71bc76..be7a11ad 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -15,6 +15,7 @@ nav:
- Cross Shop Analysis: examples/cross_shop.ipynb
- Data Contracts: examples/data_contracts.ipynb
- Segmentation: examples/segmentation.ipynb
+ - Product Association: examples/product_association.ipynb
- Reference:
- Gain Loss: api/gain_loss.md
- Range Planning: api/range_planning.md
@@ -22,6 +23,7 @@ nav:
- Customer Retention: api/customer.md
- Standard Graphs: api/standard_graphs.md
- Cross Shop Analysis: api/cross_shop.md
+ - Product Association: api/product_association.md
- Contracts: api/contracts.md
theme:
diff --git a/pyretailscience/product_association.py b/pyretailscience/product_association.py
new file mode 100644
index 00000000..1f39d10b
--- /dev/null
+++ b/pyretailscience/product_association.py
@@ -0,0 +1,304 @@
+"""Product Association Rules Generation.
+
+This module implements functionality for generating product association rules, a powerful technique in retail analytics
+and market basket analysis.
+
+Product association rules are used to uncover relationships between different products that customers tend to purchase
+together. These rules provide valuable insights into consumer behavior and purchasing patterns, which can be leveraged
+by retail businesses in various ways:
+
+1. Cross-selling and upselling: By identifying products frequently bought together, retailers can make targeted product
+ recommendations to increase sales and average order value.
+
+2. Store layout optimization: Understanding product associations helps in strategic product placement within stores,
+ potentially increasing impulse purchases and overall sales.
+
+3. Inventory management: Knowing which products are often bought together aids in maintaining appropriate stock levels
+ and predicting demand.
+
+4. Marketing and promotions: Association rules can guide the creation of effective bundle offers and promotional
+ campaigns.
+
+5. Customer segmentation: Patterns in product associations can reveal distinct customer segments with specific
+ preferences.
+
+6. New product development: Insights from association rules can inform decisions about new product lines or features.
+
+The module uses metrics such as support, confidence, and uplift to quantify the strength and significance of product
+associations:
+
+- Support: The frequency of items appearing together in transactions.
+- Confidence: The likelihood of buying one product given the purchase of another.
+- Uplift: The increase in purchase probability of one product when another is bought.
+
+By leveraging these association rules, retailers can make data-driven decisions to enhance customer experience, optimize
+operations, and drive business growth.
+"""
+
+from itertools import combinations
+from typing import Literal
+
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+
+from pyretailscience.data.contracts import CustomContract, build_expected_columns, build_non_null_columns
+
+
+class ProductAssociation:
+ """A class for generating and analyzing product association rules.
+
+ This class calculates association rules between products based on transaction data,
+ helping to identify patterns in customer purchasing behavior.
+
+ Args:
+ df (pandas.DataFrame): The input DataFrame containing transaction data.
+ value_col (str): The name of the column in the input DataFrame that contains
+ the product identifiers.
+ group_col (str, optional): The name of the column that identifies unique
+ transactions or customers. Defaults to "customer_id".
+ target_item (str or None, optional): A specific product to focus the
+ association analysis on. If None, associations for all products are
+ calculated. Defaults to None.
+
+ Attributes:
+ df (pandas.DataFrame): A DataFrame containing the calculated association
+ rules and their metrics.
+
+ Example:
+ >>> import pandas as pd
+ >>> transaction_df = pd.DataFrame({
+ ... 'customer_id': [1, 1, 2, 2, 3],
+ ... 'product_id': ['A', 'B', 'B', 'C', 'A']
+ ... })
+ >>> pa = ProductAssociation(df=transaction_df, value_col='product_id', group_col='customer_id')
+ >>> print(pa.df) # View the calculated association rules
+
+ Note:
+ The resulting DataFrame (pa.df) contains the following columns:
+ - product_1, product_2: The pair of products for which the association is calculated.
+ - occurrences_1, occurrences_2: The number of transactions containing each product.
+ - cooccurrences: The number of transactions containing both products.
+ - support: The proportion of transactions containing both products.
+ - confidence: The probability of buying product_2 given that product_1 was bought.
+ - uplift: The ratio of the observed support to the expected support if the products were independent.
+
+ The class uses efficient sparse matrix operations to handle large datasets and
+ calculates associations for either pairs (2) or triples (3) of products, depending
+ on the 'number_of_combinations' parameter in _calc_association.
+ """
+
+ def __init__(
+ self,
+ df: pd.DataFrame,
+ value_col: str,
+ group_col: str = "customer_id",
+ target_item: str | None = None,
+ number_of_combinations: Literal[2, 3] = 2,
+ min_occurrences: int = 1,
+ min_cooccurrences: int = 1,
+ min_support: float = 0.0,
+ min_confidence: float = 0.0,
+ min_uplift: float = 0.0,
+ ) -> None:
+ """Initialize the ProductAssociation object.
+
+ Args:
+ df (pandas.DataFrame): The input DataFrame containing transaction data.
+ value_col (str): The name of the column in the input DataFrame that contains the product identifiers.
+ group_col (str, optional): The name of the column that identifies unique transactions or customers. Defaults
+ to "customer_id".
+ target_item (str or None, optional): A specific product to focus the association analysis on. If None,
+ associations for all products are calculated. Defaults to None.
+ number_of_combinations (int, optional): The number of products to consider in the association analysis. Can
+ be either 2 or 3. Defaults to 2.
+ min_occurrences (int, optional): The minimum number of occurrences required for each product in the
+ association analysis. Defaults to 1. Must be at least 1.
+ min_cooccurrences (int, optional): The minimum number of co-occurrences required for the product pairs in
+ the association analysis. Defaults to 1. Must be at least 1.
+ min_support (float, optional): The minimum support value required for the association rules. Defaults to
+ 0.0. Must be between 0 and 1.
+ min_confidence (float, optional): The minimum confidence value required for the association rules. Defaults
+ to 0.0. Must be between 0 and 1.
+ min_uplift (float, optional): The minimum uplift value required for the association rules. Defaults to 0.0.
+ Must be greater or equal to 0.
+
+ Raises:
+ ValueError: If the number of combinations is not 2 or 3, or if any of the minimum values are invalid.
+ ValueError: If the minimum support, confidence, or uplift values are outside the valid range.
+ ValueError: If the minimum occurrences or cooccurrences are less than 1.
+ ValueError: If the input DataFrame does not contain the required columns or if they have null values.
+ """
+ required_cols = [group_col, value_col]
+ contract = CustomContract(
+ df,
+ basic_expectations=build_expected_columns(columns=required_cols),
+ extended_expectations=build_non_null_columns(columns=required_cols),
+ )
+ if contract.validate() is False:
+ msg = f"The dataframe requires the columns {required_cols} and they must be non-null"
+ raise ValueError(msg)
+
+ self.df = self._calc_association(
+ df=df,
+ value_col=value_col,
+ group_col=group_col,
+ target_item=target_item,
+ number_of_combinations=number_of_combinations,
+ min_occurrences=min_occurrences,
+ min_cooccurrences=min_cooccurrences,
+ min_support=min_support,
+ min_confidence=min_confidence,
+ min_uplift=min_uplift,
+ )
+
+ @staticmethod
+ def _calc_association( # noqa: C901 (ignore complexity) - Excluded due to min_* arguments checks
+ df: pd.DataFrame,
+ value_col: str,
+ group_col: str = "customer_id",
+ target_item: str | None = None,
+ number_of_combinations: Literal[2, 3] = 2,
+ min_occurrences: int = 1,
+ min_cooccurrences: int = 1,
+ min_support: float = 0.0,
+ min_confidence: float = 0.0,
+ min_uplift: float = 0.0,
+ ) -> pd.DataFrame:
+ """Calculate product association rules based on transaction data.
+
+ This method calculates association rules between products based on transaction data,
+ helping to identify patterns in customer purchasing behavior.
+
+ Args:
+ df (pandas.DataFrame): The input DataFrame containing transaction data.
+ value_col (str): The name of the column in the input DataFrame that contains the product identifiers.
+ group_col (str, optional): The name of the column that identifies unique transactions or customers. Defaults
+ to "customer_id".
+ target_item (str or None, optional): A specific product to focus the association analysis on. If None,
+ associations for all products are calculated. Defaults to None.
+ number_of_combinations (int, optional): The number of products to consider in the association analysis. Can
+ be either 2 or 3. Defaults to 2.
+ min_occurrences (int, optional): The minimum number of occurrences required for each product in the
+ association analysis. Defaults to 1. Must be at least 1.
+ min_cooccurrences (int, optional): The minimum number of co-occurrences required for the product pairs in
+ the association analysis. Defaults to 1. Must be at least 1.
+ min_support (float, optional): The minimum support value required for the association rules. Defaults to
+ 0.0. Must be between 0 and 1.
+ min_confidence (float, optional): The minimum confidence value required for the association rules. Defaults
+ to 0.0. Must be between 0 and 1.
+ min_uplift (float, optional): The minimum uplift value required for the association rules. Defaults to 0.0.
+ Must be greater or equal to 0.
+
+ Returns:
+ pandas.DataFrame: A DataFrame containing the calculated association rules and their metrics.
+
+ Raises:
+ ValueError: If the number of combinations is not 2 or 3, or if any of the minimum values are invalid.
+ ValueError: If the minimum support, confidence, or uplift values are outside the valid range.
+ ValueError: If the minimum occurrences or cooccurrences are less than 1.
+
+ Note:
+ The resulting DataFrame contains the following columns:
+ - product_1, product_2: The pair of products for which the association is calculated.
+ - occurrences_1, occurrences_2: The number of transactions containing each product.
+ - cooccurrences: The number of transactions containing both products.
+ - support: The proportion of transactions containing both products.
+ - confidence: The probability of buying product_2 given that product_1 was bought.
+ - uplift: The ratio of the observed support to the expected support if the products were independent.
+
+ The method uses efficient sparse matrix operations to handle large datasets and
+ calculates associations for either pairs (2) or triples (3) of products, depending
+ on the 'number_of_combinations' parameter.
+ """
+ if number_of_combinations not in [2, 3]:
+ raise ValueError("Number of combinations must be either 2 or 3.")
+ if min_occurrences < 1:
+ raise ValueError("Minimum occurrences must be at least 1.")
+ if min_cooccurrences < 1:
+ raise ValueError("Minimum cooccurrences must be at least 1.")
+ if min_support < 0.0 or min_support > 1.0:
+ raise ValueError("Minimum support must be between 0 and 1.")
+ if min_confidence < 0.0 or min_confidence > 1.0:
+ raise ValueError("Minimum confidence must be between 0 and 1.")
+ if min_uplift < 0.0:
+ raise ValueError("Minimum uplift must be greater or equal to 0.")
+
+ unique_combo_df = df[[group_col, value_col]].drop_duplicates()
+ unique_combo_df[value_col] = pd.Categorical(unique_combo_df[value_col], ordered=True)
+ unique_combo_df[group_col] = pd.Categorical(unique_combo_df[group_col], ordered=True)
+
+ sparse_matrix = csr_matrix(
+ (
+ [1] * len(unique_combo_df),
+ (
+ unique_combo_df[group_col].cat.codes,
+ unique_combo_df[value_col].cat.codes,
+ ),
+ ),
+ )
+
+ row_count = sparse_matrix.shape[0]
+
+ results = []
+
+ occurrences = np.array(sparse_matrix.sum(axis=0)).flatten()
+ occurence_prob = occurrences / row_count
+
+ items = [target_item]
+ if target_item is None:
+ if number_of_combinations == 2: # noqa: PLR2004
+ items = unique_combo_df[value_col].cat.categories
+ elif number_of_combinations == 3: # noqa: PLR2004
+ items = sorted(combinations(unique_combo_df[value_col].cat.categories, 2))
+
+ for item_2 in items:
+ if isinstance(item_2, tuple):
+ target_item_col_index = [unique_combo_df[value_col].cat.categories.get_loc(i) for i in item_2]
+ rows_with_target_item = (sparse_matrix[:, target_item_col_index].toarray() == 1).all(axis=1)
+ else:
+ target_item_col_index = unique_combo_df[value_col].cat.categories.get_loc(item_2)
+ rows_with_target_item = sparse_matrix[:, target_item_col_index].toarray().ravel() == 1
+
+ rows_with_target_item_sum = rows_with_target_item.sum()
+
+ cooccurrences = np.array(sparse_matrix[rows_with_target_item, :].sum(axis=0)).flatten()
+ if (cooccurrences == 0).all():
+ continue
+
+ coocurrence_prob = cooccurrences / row_count
+
+ target_prob = rows_with_target_item_sum / row_count
+ expected_prob = target_prob * occurence_prob
+
+ pa_df = pd.DataFrame(
+ {
+ f"{value_col}_1": [item_2] * sparse_matrix.shape[1],
+ f"{value_col}_2": unique_combo_df[value_col].cat.categories,
+ "occurrences_1": rows_with_target_item_sum,
+ "occurrences_2": occurrences,
+ "cooccurrences": cooccurrences,
+ "support": coocurrence_prob,
+ "confidence": cooccurrences / rows_with_target_item_sum,
+ "uplift": coocurrence_prob / expected_prob,
+ },
+ )
+
+ if isinstance(item_2, tuple):
+ dupe_pairs_idx = pa_df.apply(lambda x: x[f"{value_col}_2"] in x[f"{value_col}_1"], axis=1)
+ else:
+ dupe_pairs_idx = pa_df[f"{value_col}_1"] == pa_df[f"{value_col}_2"]
+
+ excl_pairs_idx = (
+ dupe_pairs_idx
+ | (pa_df["occurrences_1"] < min_occurrences)
+ | (pa_df["occurrences_2"] < min_occurrences)
+ | (pa_df["cooccurrences"] < min_cooccurrences)
+ | (pa_df["support"] < min_support)
+ | (pa_df["confidence"] < min_confidence)
+ | (pa_df["uplift"] < min_uplift)
+ )
+
+ results.append(pa_df[~excl_pairs_idx])
+
+ return pd.concat(results).sort_values([f"{value_col}_1", f"{value_col}_2"]).reset_index(drop=True)
diff --git a/tests/test_product_association.py b/tests/test_product_association.py
new file mode 100644
index 00000000..15b03935
--- /dev/null
+++ b/tests/test_product_association.py
@@ -0,0 +1,330 @@
+"""Tests for the ProductAssociation module."""
+
+import pandas as pd
+import pytest
+
+from pyretailscience.product_association import ProductAssociation
+
+
+class TestProductAssociations:
+ """Tests for the ProductAssociations class."""
+
+ @pytest.fixture()
+ def transactions_df(self) -> pd.DataFrame:
+ """Return a sample DataFrame for testing."""
+ # fmt: off
+ return pd.DataFrame({
+ "transaction_id": [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5],
+ "product": ["milk", "bread", "fruit", "butter", "eggs", "fruit", "beer", "diapers",
+ "milk", "bread", "butter", "eggs", "fruit", "bread"],
+ })
+ # fmt: on
+
+ @pytest.fixture()
+ def expected_results_single_items_df(self) -> pd.DataFrame:
+ """Return the expected results for the single items association analysis."""
+ # fmt: off
+ return pd.DataFrame(
+ {
+ "product_1": [
+ "beer", "bread", "bread", "bread", "bread", "butter", "butter", "butter", "butter", "diapers",
+ "eggs", "eggs", "eggs", "eggs", "fruit", "fruit", "fruit", "fruit", "milk", "milk", "milk",
+ "milk",
+ ],
+ "product_2": [
+ "diapers", "butter", "eggs", "fruit", "milk", "bread", "eggs", "fruit", "milk", "beer", "bread",
+ "butter", "fruit", "milk", "bread", "butter", "eggs", "milk", "bread", "butter", "eggs",
+ "fruit",
+ ],
+ "occurrences_1": [1, 3, 3, 3, 3, 2, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2],
+ "occurrences_2": [1, 2, 2, 3, 2, 3, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 2, 2, 3, 2, 2, 3],
+ "cooccurrences": [1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2],
+ "support": [
+ 0.2, 0.2, 0.2, 0.4, 0.4, 0.2, 0.4, 0.4, 0.2, 0.2, 0.2, 0.4, 0.4, 0.2, 0.4, 0.4, 0.4, 0.4, 0.4,
+ 0.2, 0.2, 0.4,
+ ],
+ "confidence": [
+ 1.0, 0.333333, 0.333333, 0.666667, 0.666667, 0.5, 1.0, 1.0, 0.5, 1.0, 0.5, 1.0, 1.0, 0.5,
+ 0.666667, 0.666667, 0.666667, 0.666667, 1.0, 0.5, 0.5, 1.0,
+ ],
+ "uplift": [
+ 5.0, 0.833333, 0.833333, 1.111111, 1.666667, 0.833333, 2.5, 1.666667, 1.25, 5.0, 0.833333, 2.5,
+ 1.666667, 1.25, 1.111111, 1.666667, 1.666667, 1.666667, 1.666667, 1.25, 1.25, 1.666667,
+ ],
+ },
+ )
+ # fmt: on
+
+ @pytest.fixture()
+ def expected_results_pair_items_df(self) -> pd.DataFrame:
+ """Return the expected results for the pair items association analysis."""
+ # fmt: off
+ return pd.DataFrame(
+ {
+ "product_1": [
+ ("bread", "butter"), ("bread", "butter"), ("bread", "butter"), ("bread", "eggs"), ("bread", "eggs"),
+ ("bread", "eggs"), ("bread", "fruit"), ("bread", "fruit"), ("bread", "fruit"), ("bread", "milk"),
+ ("bread", "milk"), ("bread", "milk"), ("butter", "eggs"), ("butter", "eggs"), ("butter", "eggs"),
+ ("butter", "fruit"), ("butter", "fruit"), ("butter", "fruit"), ("butter", "milk"),
+ ("butter", "milk"), ("butter", "milk"), ("eggs", "fruit"), ("eggs", "fruit"), ("eggs", "fruit"),
+ ("eggs", "milk"), ("eggs", "milk"), ("eggs", "milk"), ("fruit", "milk"), ("fruit", "milk"),
+ ("fruit", "milk"),
+ ],
+ "product_2": [
+ "eggs", "fruit", "milk", "butter", "fruit", "milk", "butter", "eggs", "milk", "butter", "eggs",
+ "fruit", "bread", "fruit", "milk", "bread", "eggs", "milk", "bread", "eggs", "fruit", "bread",
+ "butter", "milk", "bread", "butter", "fruit", "bread", "butter", "eggs",
+ ],
+ "occurrences_1": [
+ 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2,
+ ],
+ "occurrences_2": [
+ 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 2, 2, 3, 2, 3, 3, 2, 2, 3, 2, 3, 3, 2, 2,
+ ],
+ "cooccurrences": [
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1,
+ ],
+ "support": [
+ 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.2, 0.2, 0.4, 0.2, 0.4, 0.2, 0.2, 0.4, 0.2, 0.2, 0.2,
+ 0.2, 0.2, 0.4, 0.2, 0.2, 0.2, 0.2, 0.4, 0.2, 0.2,
+ ],
+ "confidence": [
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 0.5, 0.5, 1.0, 0.5, 1.0, 0.5, 0.5, 1.0, 0.5, 1.0, 1.0,
+ 1.0, 0.5, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5,
+ ],
+ "uplift": [
+ 2.5, 1.666667, 2.5, 2.5, 1.666667, 2.5, 1.25, 1.25, 2.5, 1.25, 1.25, 1.666667, 0.833333, 1.666667,
+ 1.25, 0.833333, 2.5, 1.25, 1.666667, 2.5, 1.666667, 0.833333, 2.5, 1.25, 1.666667, 2.5, 1.666667,
+                    1.666667, 1.25, 1.25,
+ ],
+ },
+ )
+ # fmt: on
+
+ def test_calc_association_all_single_items(self, transactions_df, expected_results_single_items_df):
+        """Test calculating association rules for a single item versus another item, for all items."""
+ associations_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ )
+
+ pd.testing.assert_frame_equal(associations_df, expected_results_single_items_df)
+
+ def test_calc_association_target_single_items(self, transactions_df, expected_results_single_items_df):
+        """Test calculating association rules for a target single item versus another item."""
+ target_item = "bread"
+
+ calc_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ target_item=target_item,
+ )
+
+ pd.testing.assert_frame_equal(
+ calc_df,
+ expected_results_single_items_df[expected_results_single_items_df["product_1"] == target_item].reset_index(
+ drop=True,
+ ),
+ )
+
+ def test_calc_association_all_pair_items(self, transactions_df, expected_results_pair_items_df):
+        """Test calculating association rules for pairs of items versus another item, for all items."""
+ calc_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ number_of_combinations=3,
+ )
+
+ pd.testing.assert_frame_equal(calc_df, expected_results_pair_items_df)
+
+ def test_calc_association_target_pair_items(self, transactions_df, expected_results_pair_items_df):
+        """Test calculating association rules for a target pair of items versus another item."""
+ calc_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ number_of_combinations=3,
+ target_item=("bread", "butter"),
+ )
+
+ pd.testing.assert_frame_equal(
+ calc_df,
+ expected_results_pair_items_df[
+ expected_results_pair_items_df["product_1"] == ("bread", "butter")
+ ].reset_index(drop=True),
+ )
+
+ def test_calc_association_min_occurrences(self, transactions_df, expected_results_single_items_df):
+ """Test calculating association rules with a min occurrences level."""
+ min_occurrences = 2
+
+ calc_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_occurrences=min_occurrences,
+ )
+
+ pd.testing.assert_frame_equal(
+ calc_df,
+ expected_results_single_items_df[
+ (expected_results_single_items_df["occurrences_1"] >= min_occurrences)
+ & (expected_results_single_items_df["occurrences_2"] >= min_occurrences)
+ ].reset_index(drop=True),
+ )
+
+ def test_calc_association_min_cooccurrences(self, transactions_df, expected_results_single_items_df):
+        """Test calculating association rules with a min cooccurrences level."""
+ min_cooccurrences = 2
+
+ calc_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_cooccurrences=min_cooccurrences,
+ )
+
+ pd.testing.assert_frame_equal(
+ calc_df,
+ expected_results_single_items_df[
+ (expected_results_single_items_df["cooccurrences"] >= min_cooccurrences)
+ ].reset_index(drop=True),
+ )
+
+ def test_calc_association_min_support(self, transactions_df, expected_results_single_items_df):
+        """Test calculating association rules with a min support level."""
+ min_support = 0.25
+
+ calc_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_support=min_support,
+ )
+
+ pd.testing.assert_frame_equal(
+ calc_df,
+ expected_results_single_items_df[(expected_results_single_items_df["support"] >= min_support)].reset_index(
+ drop=True,
+ ),
+ )
+
+ def test_calc_association_min_confidence(self, transactions_df, expected_results_single_items_df):
+        """Test calculating association rules with a min confidence level."""
+ min_confidence = 0.25
+
+ calc_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_confidence=min_confidence,
+ )
+
+ pd.testing.assert_frame_equal(
+ calc_df,
+ expected_results_single_items_df[
+ (expected_results_single_items_df["confidence"] >= min_confidence)
+ ].reset_index(drop=True),
+ )
+
+ def test_calc_association_min_uplift(self, transactions_df, expected_results_single_items_df):
+        """Test calculating association rules with a min uplift level."""
+ min_uplift = 1
+
+ calc_df = ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_uplift=min_uplift,
+ )
+
+ pd.testing.assert_frame_equal(
+ calc_df,
+ expected_results_single_items_df[(expected_results_single_items_df["uplift"] >= min_uplift)].reset_index(
+ drop=True,
+ ),
+ )
+
+ def test_calc_association_invalid_number_of_combinations(self, transactions_df):
+ """Test calculating association rules with an invalid number of combinations."""
+ with pytest.raises(ValueError, match="Number of combinations must be either 2 or 3."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ number_of_combinations=4,
+ )
+ with pytest.raises(ValueError, match="Number of combinations must be either 2 or 3."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ number_of_combinations=1,
+ )
+
+ def test_calc_association_invalid_min_occurrences(self, transactions_df):
+ """Test calculating association rules with an invalid minimum occurrences value."""
+ with pytest.raises(ValueError, match="Minimum occurrences must be at least 1."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_occurrences=0,
+ )
+
+ def test_calc_association_invalid_min_cooccurrences(self, transactions_df):
+ """Test calculating association rules with an invalid minimum cooccurrences value."""
+ with pytest.raises(ValueError, match="Minimum cooccurrences must be at least 1."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_cooccurrences=0,
+ )
+
+ def test_calc_association_min_support_invalid_range(self, transactions_df):
+ """Test calculating association rules with an invalid minimum support range."""
+ with pytest.raises(ValueError, match="Minimum support must be between 0 and 1."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_support=-0.1,
+ )
+ with pytest.raises(ValueError, match="Minimum support must be between 0 and 1."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_support=1.1,
+ )
+
+ def test_calc_association_min_confidence_invalid_range(self, transactions_df):
+ """Test calculating association rules with an invalid minimum confidence range."""
+ with pytest.raises(ValueError, match="Minimum confidence must be between 0 and 1."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_confidence=-0.1,
+ )
+ with pytest.raises(ValueError, match="Minimum confidence must be between 0 and 1."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_confidence=1.1,
+ )
+
+ def test_calc_association_min_uplift_invalid_range(self, transactions_df):
+ """Test calculating association rules with an invalid minimum uplift range."""
+ with pytest.raises(ValueError, match="Minimum uplift must be greater or equal to 0."):
+ ProductAssociation._calc_association(
+ df=transactions_df,
+ value_col="product",
+ group_col="transaction_id",
+ min_uplift=-0.1,
+ )