Skip to content

product association #149

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions pyretailscience/analysis/product_association.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ class ProductAssociation:
- uplift: The ratio of the observed support to the expected support if the products were independent.
"""

_df: pd.DataFrame | None = None

def __init__(
self,
df: pd.DataFrame | ibis.Table,
Expand Down Expand Up @@ -124,7 +126,7 @@ def __init__(
msg = f"The following columns are required but missing: {missing_cols}"
raise ValueError(msg)

self.df = self._calc_association(
self.table = self._calc_association(
df=df,
value_col=value_col,
group_col=group_col,
Expand Down Expand Up @@ -305,10 +307,13 @@ def _calc_association(
result = result[col_order].union(inverse_pairs[col_order])

result = result.filter(result.confidence >= min_confidence)

final_result = result.execute().sort_values(by=["item_1", "item_2"]).reset_index(drop=True)
final_result = final_result.rename(columns={"item_1": f"{value_col}_1", "item_2": f"{value_col}_2"})

final_result = result.order_by(["item_1", "item_2"])
final_result = final_result.rename(
{
f"{value_col}_1": "item_1",
f"{value_col}_2": "item_2",
},
)
return final_result[
[
f"{value_col}_1",
Expand All @@ -321,3 +326,10 @@ def _calc_association(
"uplift",
]
]

@property
def df(self) -> pd.DataFrame:
    """Return the association results as a pandas DataFrame.

    The first access executes ``self.table`` and resets the resulting index
    (dropping the old one); the frame is then stored on ``self._df`` so that
    later accesses return the same object without executing again.

    Returns:
        pd.DataFrame: The executed result of ``self.table`` with a fresh index.
    """
    cached = self._df
    if cached is None:
        # Nothing materialised yet: execute once and remember the result.
        cached = self.table.execute().reset_index(drop=True)
        self._df = cached
    return cached
37 changes: 21 additions & 16 deletions tests/analysis/test_product_association.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,27 +106,27 @@ def expected_results_pair_items_df(self) -> pd.DataFrame:

def test_calc_association_all_single_items(self, transactions_df, expected_results_single_items_df):
"""Test calculating association rules for a single item versus another single item, for all items."""
associations_df = ProductAssociation._calc_association(
associations_df = ProductAssociation(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
)

pd.testing.assert_frame_equal(associations_df, expected_results_single_items_df)
result = associations_df.df
pd.testing.assert_frame_equal(result, expected_results_single_items_df)

def test_calc_association_target_single_items(self, transactions_df, expected_results_single_items_df):
"""Test calculating association rules for a target single item versus another single item."""
target_item = "bread"

calc_df = ProductAssociation._calc_association(
calc_df = ProductAssociation(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
target_item=target_item,
)

result = calc_df.df
pd.testing.assert_frame_equal(
calc_df,
result,
expected_results_single_items_df[expected_results_single_items_df["product_1"] == target_item].reset_index(
drop=True,
),
Expand All @@ -136,15 +136,16 @@ def test_calc_association_min_occurrences(self, transactions_df, expected_result
"""Test calculating association rules with a min occurrences level."""
min_occurrences = 2

calc_df = ProductAssociation._calc_association(
calc_df = ProductAssociation(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
min_occurrences=min_occurrences,
)

result = calc_df.df
pd.testing.assert_frame_equal(
calc_df,
result,
expected_results_single_items_df[
(expected_results_single_items_df["occurrences_1"] >= min_occurrences)
& (expected_results_single_items_df["occurrences_2"] >= min_occurrences)
Expand All @@ -155,15 +156,16 @@ def test_calc_association_min_cooccurrences(self, transactions_df, expected_resu
"""Test calculating association rules with a min cooccurrences level."""
min_cooccurrences = 2

calc_df = ProductAssociation._calc_association(
calc_df = ProductAssociation(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
min_cooccurrences=min_cooccurrences,
)

result = calc_df.df
pd.testing.assert_frame_equal(
calc_df,
result,
expected_results_single_items_df[
(expected_results_single_items_df["cooccurrences"] >= min_cooccurrences)
].reset_index(drop=True),
Expand All @@ -173,15 +175,16 @@ def test_calc_association_min_support(self, transactions_df, expected_results_si
"""Test calculating association rules with a min support level."""
min_support = 0.25

calc_df = ProductAssociation._calc_association(
calc_df = ProductAssociation(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
min_support=min_support,
)

result = calc_df.df
pd.testing.assert_frame_equal(
calc_df,
result,
expected_results_single_items_df[(expected_results_single_items_df["support"] >= min_support)].reset_index(
drop=True,
),
Expand All @@ -191,15 +194,16 @@ def test_calc_association_min_confidence(self, transactions_df, expected_results
"""Test calculating association rules with a min confidence level."""
min_confidence = 0.25

calc_df = ProductAssociation._calc_association(
calc_df = ProductAssociation(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
min_confidence=min_confidence,
)

result = calc_df.df
pd.testing.assert_frame_equal(
calc_df,
result,
expected_results_single_items_df[
(expected_results_single_items_df["confidence"] >= min_confidence)
].reset_index(drop=True),
Expand All @@ -209,15 +213,16 @@ def test_calc_association_min_uplift(self, transactions_df, expected_results_sin
"""Test calculating association rules with a min uplift level."""
min_uplift = 1

calc_df = ProductAssociation._calc_association(
calc_df = ProductAssociation(
df=transactions_df,
value_col="product",
group_col=cols.transaction_id,
min_uplift=min_uplift,
)

result = calc_df.df
pd.testing.assert_frame_equal(
calc_df,
result,
expected_results_single_items_df[(expected_results_single_items_df["uplift"] >= min_uplift)].reset_index(
drop=True,
),
Expand Down