From 8133202cf50a5dff89e50207518d64ef3cfe7c76 Mon Sep 17 00:00:00 2001 From: MayurK Date: Fri, 21 Mar 2025 16:21:40 +0530 Subject: [PATCH] fix: added function to convert ibis table into df --- .../analysis/product_association.py | 22 ++++++++--- tests/analysis/test_product_association.py | 37 +++++++++++-------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/pyretailscience/analysis/product_association.py b/pyretailscience/analysis/product_association.py index 37f8f921..797896b3 100644 --- a/pyretailscience/analysis/product_association.py +++ b/pyretailscience/analysis/product_association.py @@ -80,6 +80,8 @@ class ProductAssociation: - uplift: The ratio of the observed support to the expected support if the products were independent. """ + _df: pd.DataFrame | None = None + def __init__( self, df: pd.DataFrame | ibis.Table, @@ -124,7 +126,7 @@ def __init__( msg = f"The following columns are required but missing: {missing_cols}" raise ValueError(msg) - self.df = self._calc_association( + self.table = self._calc_association( df=df, value_col=value_col, group_col=group_col, @@ -305,10 +307,13 @@ def _calc_association( result = result[col_order].union(inverse_pairs[col_order]) result = result.filter(result.confidence >= min_confidence) - - final_result = result.execute().sort_values(by=["item_1", "item_2"]).reset_index(drop=True) - final_result = final_result.rename(columns={"item_1": f"{value_col}_1", "item_2": f"{value_col}_2"}) - + final_result = result.order_by(["item_1", "item_2"]) + final_result = final_result.rename( + { + f"{value_col}_1": "item_1", + f"{value_col}_2": "item_2", + }, + ) return final_result[ [ f"{value_col}_1", @@ -321,3 +326,10 @@ def _calc_association( "uplift", ] ] + + @property + def df(self) -> pd.DataFrame: + """Returns the executed DataFrame.""" + if self._df is None: + self._df = self.table.execute().reset_index(drop=True) + return self._df diff --git a/tests/analysis/test_product_association.py 
b/tests/analysis/test_product_association.py index 7c0f56dd..f61b73f9 100644 --- a/tests/analysis/test_product_association.py +++ b/tests/analysis/test_product_association.py @@ -106,27 +106,27 @@ def expected_results_pair_items_df(self) -> pd.DataFrame: def test_calc_association_all_single_items(self, transactions_df, expected_results_single_items_df): """Test calculating association rules for a single item versus another of item for all items.""" - associations_df = ProductAssociation._calc_association( + associations_df = ProductAssociation( df=transactions_df, value_col="product", group_col=cols.transaction_id, ) - - pd.testing.assert_frame_equal(associations_df, expected_results_single_items_df) + result = associations_df.df + pd.testing.assert_frame_equal(result, expected_results_single_items_df) def test_calc_association_target_single_items(self, transactions_df, expected_results_single_items_df): """Test calculating association rules for target single item versus another of item.""" target_item = "bread" - calc_df = ProductAssociation._calc_association( + calc_df = ProductAssociation( df=transactions_df, value_col="product", group_col=cols.transaction_id, target_item=target_item, ) - + result = calc_df.df pd.testing.assert_frame_equal( - calc_df, + result, expected_results_single_items_df[expected_results_single_items_df["product_1"] == target_item].reset_index( drop=True, ), @@ -136,15 +136,16 @@ def test_calc_association_min_occurrences(self, transactions_df, expected_result """Test calculating association rules with a min occurrences level.""" min_occurrences = 2 - calc_df = ProductAssociation._calc_association( + calc_df = ProductAssociation( df=transactions_df, value_col="product", group_col=cols.transaction_id, min_occurrences=min_occurrences, ) + result = calc_df.df pd.testing.assert_frame_equal( - calc_df, + result, expected_results_single_items_df[ (expected_results_single_items_df["occurrences_1"] >= min_occurrences) & 
(expected_results_single_items_df["occurrences_2"] >= min_occurrences) @@ -155,15 +156,16 @@ def test_calc_association_min_cooccurrences(self, transactions_df, expected_resu """Test calculating association rules with a min occurrences level.""" min_cooccurrences = 2 - calc_df = ProductAssociation._calc_association( + calc_df = ProductAssociation( df=transactions_df, value_col="product", group_col=cols.transaction_id, min_cooccurrences=min_cooccurrences, ) + result = calc_df.df pd.testing.assert_frame_equal( - calc_df, + result, expected_results_single_items_df[ (expected_results_single_items_df["cooccurrences"] >= min_cooccurrences) ].reset_index(drop=True), @@ -173,15 +175,16 @@ def test_calc_association_min_support(self, transactions_df, expected_results_si """Test calculating association rules with a min occurrences level.""" min_support = 0.25 - calc_df = ProductAssociation._calc_association( + calc_df = ProductAssociation( df=transactions_df, value_col="product", group_col=cols.transaction_id, min_support=min_support, ) + result = calc_df.df pd.testing.assert_frame_equal( - calc_df, + result, expected_results_single_items_df[(expected_results_single_items_df["support"] >= min_support)].reset_index( drop=True, ), @@ -191,15 +194,16 @@ def test_calc_association_min_confidence(self, transactions_df, expected_results """Test calculating association rules with a min occurrences level.""" min_confidence = 0.25 - calc_df = ProductAssociation._calc_association( + calc_df = ProductAssociation( df=transactions_df, value_col="product", group_col=cols.transaction_id, min_confidence=min_confidence, ) + result = calc_df.df pd.testing.assert_frame_equal( - calc_df, + result, expected_results_single_items_df[ (expected_results_single_items_df["confidence"] >= min_confidence) ].reset_index(drop=True), @@ -209,15 +213,16 @@ def test_calc_association_min_uplift(self, transactions_df, expected_results_sin """Test calculating association rules with a min occurrences level.""" 
min_uplift = 1 - calc_df = ProductAssociation._calc_association( + calc_df = ProductAssociation( df=transactions_df, value_col="product", group_col=cols.transaction_id, min_uplift=min_uplift, ) + result = calc_df.df pd.testing.assert_frame_equal( - calc_df, + result, expected_results_single_items_df[(expected_results_single_items_df["uplift"] >= min_uplift)].reset_index( drop=True, ),