Product Association #131

Merged · 4 commits · Mar 19, 2025
219 changes: 119 additions & 100 deletions pyretailscience/analysis/product_association.py
@@ -35,13 +35,8 @@
operations, and drive business growth.
"""

from itertools import combinations
from typing import Literal

import numpy as np
import ibis
import pandas as pd
from scipy.sparse import csc_matrix
from tqdm import tqdm

from pyretailscience.options import get_option

@@ -83,19 +78,14 @@ class ProductAssociation:
- support: The proportion of transactions containing both products.
- confidence: The probability of buying product_2 given that product_1 was bought.
- uplift: The ratio of the observed support to the expected support if the products were independent.

The class uses efficient sparse matrix operations to handle large datasets and
calculates associations for either pairs (2) or triples (3) of products, depending
on the 'number_of_combinations' parameter in _calc_association.
"""

def __init__(
self,
df: pd.DataFrame,
df: pd.DataFrame | ibis.Table,
value_col: str,
group_col: str = get_option("column.customer_id"),
target_item: str | None = None,
number_of_combinations: Literal[2, 3] = 2,
min_occurrences: int = 1,
min_cooccurrences: int = 1,
min_support: float = 0.0,
@@ -106,14 +96,12 @@ def __init__(
"""Initialize the ProductAssociation object.

Args:
df (pandas.DataFrame): The input DataFrame containing transaction data.
df (pd.DataFrame | ibis.Table): The input DataFrame or ibis Table containing transaction data.
value_col (str): The name of the column in the input DataFrame that contains the product identifiers.
group_col (str, optional): The name of the column that identifies unique transactions or customers. Defaults
to option column.customer_id.
target_item (str or None, optional): A specific product to focus the association analysis on. If None,
associations for all products are calculated. Defaults to None.
number_of_combinations (int, optional): The number of products to consider in the association analysis. Can
be either 2 or 3. Defaults to 2.
min_occurrences (int, optional): The minimum number of occurrences required for each product in the
association analysis. Defaults to 1. Must be at least 1.
min_cooccurrences (int, optional): The minimum number of co-occurrences required for the product pairs in
@@ -143,7 +131,6 @@ def __init__(
value_col=value_col,
group_col=group_col,
target_item=target_item,
number_of_combinations=number_of_combinations,
min_occurrences=min_occurrences,
min_cooccurrences=min_cooccurrences,
min_support=min_support,
@@ -153,33 +140,29 @@
)

@staticmethod
def _calc_association( # noqa: C901 (ignore complexity) - Excluded due to min_* arguments checks
df: pd.DataFrame,
def _calc_association(
df: pd.DataFrame | ibis.Table,
value_col: str,
group_col: str = get_option("column.customer_id"),
target_item: str | None = None,
number_of_combinations: Literal[2, 3] = 2,
min_occurrences: int = 1,
min_cooccurrences: int = 1,
min_support: float = 0.0,
min_confidence: float = 0.0,
min_uplift: float = 0.0,
show_progress: bool = False,
) -> pd.DataFrame:
"""Calculate product association rules based on transaction data.

This method calculates association rules between products based on transaction data,
helping to identify patterns in customer purchasing behavior.

Args:
df (pandas.DataFrame): The input DataFrame containing transaction data.
df (pd.DataFrame | ibis.Table): The input DataFrame or ibis Table containing transaction data.
value_col (str): The name of the column in the input DataFrame that contains the product identifiers.
group_col (str, optional): The name of the column that identifies unique transactions or customers. Defaults
to option column.customer_id.
target_item (str or None, optional): A specific product to focus the association analysis on. If None,
associations for all products are calculated. Defaults to None.
number_of_combinations (int, optional): The number of products to consider in the association analysis. Can
be either 2 or 3. Defaults to 2.
min_occurrences (int, optional): The minimum number of occurrences required for each product in the
association analysis. Defaults to 1. Must be at least 1.
min_cooccurrences (int, optional): The minimum number of co-occurrences required for the product pairs in
@@ -208,13 +191,7 @@ def _calc_association( # noqa: C901 (ignore complexity) - Excluded due to min_*
- support: The proportion of transactions containing both products.
- confidence: The probability of buying product_2 given that product_1 was bought.
- uplift: The ratio of the observed support to the expected support if the products were independent.

The method uses efficient sparse matrix operations to handle large datasets and
calculates associations for either pairs (2) or triples (3) of products, depending
on the 'number_of_combinations' parameter.
"""
if number_of_combinations not in [2, 3]:
raise ValueError("Number of combinations must be either 2 or 3.")
if min_occurrences < 1:
raise ValueError("Minimum occurrences must be at least 1.")
if min_cooccurrences < 1:
@@ -226,83 +203,125 @@ def _calc_association( # noqa: C901 (ignore complexity) - Excluded due to min_*
if min_uplift < 0.0:
raise ValueError("Minimum uplift must be greater or equal to 0.")

unique_combo_df = df[[group_col, value_col]].drop_duplicates()
unique_combo_df[value_col] = pd.Categorical(unique_combo_df[value_col], ordered=True)
unique_combo_df[group_col] = pd.Categorical(unique_combo_df[group_col], ordered=True)

sparse_matrix = csc_matrix(
(
[1] * len(unique_combo_df),
(
unique_combo_df[group_col].cat.codes,
unique_combo_df[value_col].cat.codes,
),
),
if isinstance(df, pd.DataFrame):
df = ibis.memtable(df)

unique_transactions = df.select(df[group_col], df[value_col]).distinct()
total_transactions = unique_transactions.alias("t")[group_col].nunique().name("total_count")

product_occurrences = (
unique_transactions.group_by(value_col)
.aggregate(
occurrences=lambda t: t[group_col].nunique(),
)
.mutate(occurrence_probability=lambda t: t.occurrences / total_transactions)
.filter(lambda t: t.occurrences >= min_occurrences)
)

row_count = sparse_matrix.shape[0]
left_table = unique_transactions.rename({"item_1": value_col})
right_table = unique_transactions.rename({"item_2": value_col})

join_logic = [left_table[group_col] == right_table[group_col]]
if target_item is None:
join_logic.append(left_table["item_1"] < right_table["item_2"])
else:
join_logic.extend(
[
left_table["item_1"] != right_table["item_2"],
left_table["item_1"] == target_item,
],
)
merged_df = left_table.join(
right_table,
predicates=join_logic,
lname="",
rname="{name}_right",
)

results = []
product_occurrences_1 = product_occurrences.rename(
{"item_1": value_col, "occurrences_1": "occurrences", "occurrence_probability_1": "occurrence_probability"},
)
product_occurrences_2 = product_occurrences.rename(
{"item_2": value_col, "occurrences_2": "occurrences", "occurrence_probability_2": "occurrence_probability"},
)

occurrences = np.array(sparse_matrix.sum(axis=0)).flatten()
occurence_prob = occurrences / row_count
merged_df = merged_df.join(
product_occurrences_1,
predicates=[merged_df["item_1"] == product_occurrences_1["item_1"]],
)

base_items = [target_item]
if number_of_combinations == 2: # noqa: PLR2004
if target_item is None:
base_items = unique_combo_df[value_col].cat.categories
items = [([unique_combo_df[value_col].cat.categories.get_loc(cat)], cat) for cat in base_items]
elif number_of_combinations == 3: # noqa: PLR2004
if target_item is None:
base_items = sorted(combinations(unique_combo_df[value_col].cat.categories, 2))
items = [
([unique_combo_df[value_col].cat.categories.get_loc(i) for i in cats], cats) for cats in base_items
]
merged_df = merged_df.join(
product_occurrences_2,
predicates=[merged_df["item_2"] == product_occurrences_2["item_2"]],
)
Review comment on lines +248 to +256:

⚠️ Potential issue

Join predicates reference may not match renamed columns.

The join predicates reference columns like merged_df["item_1"] and product_occurrences_1["item_1"], but given the rename issues above, these columns might not exist as expected. This will need to be fixed after addressing the rename operations in lines 241-246: the join predicates should reference the correctly named columns.
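For context, a minimal self-contained sketch of the rename-then-join pattern under discussion (toy data and column names, not the PR's option-driven ones; assumes current ibis semantics, where Table.rename takes {new_name: old_name} pairs):

import ibis
import pandas as pd

# Toy stand-in for `unique_transactions` (hypothetical column names).
t = ibis.memtable(
    pd.DataFrame(
        {
            "transaction_id": [1, 1, 2, 2, 3],
            "product": ["bread", "butter", "bread", "milk", "bread"],
        },
    ),
)

# rename maps {new_name: old_name}, so these tables expose the product
# column as "item_1" and "item_2" respectively.
left = t.rename({"item_1": "product"})
right = t.rename({"item_2": "product"})

pairs = left.join(
    right,
    predicates=[
        left["transaction_id"] == right["transaction_id"],
        left["item_1"] < right["item_2"],  # keep each unordered pair once
    ],
)
print(pairs.execute())  # both renamed columns resolve inside the predicates

If the predicates fail to resolve, the direction of the mapping in the rename calls is the first thing to check.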


if show_progress:
items = tqdm(items)

cols_mask = np.zeros(sparse_matrix.shape[1], dtype=bool)

for target_item_loc, item_2 in items:
target_item_col_index = cols_mask.copy()
target_item_col_index[target_item_loc] = True
rows_with_target_item = sparse_matrix[:, target_item_col_index].getnnz(axis=1) == len(target_item_loc)

cooccurrences = np.array(sparse_matrix[rows_with_target_item, :].sum(axis=0)).flatten()
if (cooccurrences == 0).all():
continue

rows_with_target_item_sum = rows_with_target_item.sum()
coocurrence_prob = cooccurrences / row_count

target_prob = rows_with_target_item_sum / row_count
expected_prob = target_prob * occurence_prob

# TODO: Try to avoid constructing a pandas Dataframe
pa_df = pd.DataFrame(
{
f"{value_col}_1": [item_2] * sparse_matrix.shape[1],
f"{value_col}_2": unique_combo_df[value_col].cat.categories.values,
"occurrences_1": rows_with_target_item_sum,
"occurrences_2": occurrences,
"cooccurrences": cooccurrences,
"support": coocurrence_prob,
"confidence": cooccurrences / rows_with_target_item_sum,
"uplift": coocurrence_prob / expected_prob,
},
)
cooccurrences = merged_df.group_by(["item_1", "item_2"]).aggregate(cooccurrences=merged_df[group_col].nunique())
cooccurrences = cooccurrences.mutate(
support=cooccurrences.cooccurrences / total_transactions,
)
cooccurrences = cooccurrences.filter(
(cooccurrences.cooccurrences >= min_cooccurrences) & (cooccurrences.support >= min_support),
)

excl_pairs_idx = (
target_item_col_index
| (pa_df["occurrences_1"] < min_occurrences)
| (pa_df["occurrences_2"] < min_occurrences)
| (pa_df["cooccurrences"] < min_cooccurrences)
| (pa_df["support"] < min_support)
| (pa_df["confidence"] < min_confidence)
| (pa_df["uplift"] < min_uplift)
)
product_occurrences_1_rename = product_occurrences.rename(
{"item_1": value_col, "occurrences_1": "occurrences", "prob_1": "occurrence_probability"},
)
product_occurrences_2_rename = product_occurrences.rename(
{"item_2": value_col, "occurrences_2": "occurrences", "prob_2": "occurrence_probability"},
)

results.append(pa_df[~excl_pairs_idx])
product_pairs = cooccurrences.join(
product_occurrences_1_rename,
predicates=[cooccurrences["item_1"] == product_occurrences_1_rename["item_1"]],
)
product_pairs = product_pairs.join(
product_occurrences_2_rename,
predicates=[product_pairs["item_2"] == product_occurrences_2_rename["item_2"]],
)
Review comment on lines +273 to +280:

⚠️ Potential issue

Inconsistency in join predicates with renamed tables.

The join operations use predicates that reference columns like cooccurrences["item_1"] and product_occurrences_1_rename["item_1"], but given the rename issues above, these columns might not match as expected. This will need to be fixed after addressing the rename operations in lines 266-271.
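One way to guard against this class of mismatch is to assert the renamed schema before the joins run; a small sketch with hypothetical data (the names mirror the PR's rename block):

import ibis
import pandas as pd

occ = ibis.memtable(
    pd.DataFrame(
        {
            "product": ["bread", "butter"],
            "occurrences": [3, 1],
            "occurrence_probability": [1.0, 1 / 3],
        },
    ),
)

# rename maps {new_name: old_name}; checking the schema right after the
# rename catches a typo on either side of the mapping before any join.
occ_1 = occ.rename(
    {"item_1": "product", "occurrences_1": "occurrences", "prob_1": "occurrence_probability"},
)
assert set(occ_1.columns) == {"item_1", "occurrences_1", "prob_1"}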


product_pairs = product_pairs.mutate(
confidence=product_pairs["cooccurrences"] / product_pairs["occurrences_1"],
uplift=product_pairs["support"] / (product_pairs["prob_1"] * product_pairs["prob_2"]),
)
(A review comment on lines +282 to +285 was marked as resolved.)


return pd.concat(results).sort_values([f"{value_col}_1", f"{value_col}_2"]).reset_index(drop=True)
result = product_pairs.filter(product_pairs.uplift >= min_uplift)

if target_item is None:
col_order = [
"item_1",
"item_2",
"occurrences_1",
"occurrences_2",
"cooccurrences",
"support",
"confidence",
"uplift",
]
inverse_pairs = result.mutate(
item_1=result["item_2"],
item_2=result["item_1"],
occurrences_1=result["occurrences_2"],
occurrences_2=result["occurrences_1"],
prob_1=result["prob_2"],
prob_2=result["prob_1"],
confidence=result["cooccurrences"] / result["occurrences_2"],
)
result = result[col_order].union(inverse_pairs[col_order])

result = result.filter(result.confidence >= min_confidence)

final_result = result.execute().sort_values(by=["item_1", "item_2"]).reset_index(drop=True)
final_result = final_result.rename(columns={"item_1": f"{value_col}_1", "item_2": f"{value_col}_2"})

return final_result[
[
f"{value_col}_1",
f"{value_col}_2",
"occurrences_1",
"occurrences_2",
"cooccurrences",
"support",
"confidence",
"uplift",
]
]
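End to end, the new ibis-backed path can be exercised the same way the remaining tests do; a minimal sketch with toy data (the column names are illustrative, and the figures in the comment follow the support/confidence/uplift definitions in the docstring above):

import pandas as pd

from pyretailscience.analysis.product_association import ProductAssociation

transactions = pd.DataFrame(
    {
        "transaction_id": [1, 1, 2, 2, 3],
        "product": ["bread", "butter", "bread", "milk", "bread"],
    },
)

calc_df = ProductAssociation._calc_association(
    df=transactions,
    value_col="product",
    group_col="transaction_id",
)

# For the pair (bread, butter): bread appears in 3 of 3 transactions and
# butter in 1, and they co-occur once, so support = 1/3,
# confidence(bread -> butter) = 1/3, and uplift = (1/3) / (1 * 1/3) = 1.0.
print(calc_df)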
45 changes: 0 additions & 45 deletions tests/analysis/test_product_association.py
@@ -132,34 +132,6 @@ def test_calc_association_target_single_items(self, transactions_df, expected_re
),
)

def test_calc_association_all_pair_items(self, transactions_df, expected_results_pair_items_df):
"""Test calculating association rules for a pairs of items versus another item for all items."""
calc_df = ProductAssociation._calc_association(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
number_of_combinations=3,
)

pd.testing.assert_frame_equal(calc_df, expected_results_pair_items_df)

def test_calc_association_target_pair_items(self, transactions_df, expected_results_pair_items_df):
"""Test calculating association rules for a target pairs of items versus another item."""
calc_df = ProductAssociation._calc_association(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
number_of_combinations=3,
target_item=("bread", "butter"),
)

pd.testing.assert_frame_equal(
calc_df,
expected_results_pair_items_df[
expected_results_pair_items_df["product_1"] == ("bread", "butter")
].reset_index(drop=True),
)

def test_calc_association_min_occurrences(self, transactions_df, expected_results_single_items_df):
"""Test calculating association rules with a min occurrences level."""
min_occurrences = 2
@@ -251,23 +223,6 @@ def test_calc_association_min_uplift(self, transactions_df, expected_results_sin
),
)

def test_calc_association_invalid_number_of_combinations(self, transactions_df):
"""Test calculating association rules with an invalid number of combinations."""
with pytest.raises(ValueError, match="Number of combinations must be either 2 or 3."):
ProductAssociation._calc_association(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
number_of_combinations=4,
)
with pytest.raises(ValueError, match="Number of combinations must be either 2 or 3."):
ProductAssociation._calc_association(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
number_of_combinations=1,
)

def test_calc_association_invalid_min_occurrences(self, transactions_df):
"""Test calculating association rules with an invalid minimum occurrences value."""
with pytest.raises(ValueError, match="Minimum occurrences must be at least 1."):