Skip to content

Commit 79b8cc6

Browse files
committed
Merge branch 'main' of github.com:data-simply/pyretailscience into feature/product-association
2 parents 2743f04 + c4d0039 commit 79b8cc6

File tree

10 files changed

+692
-105
lines changed

10 files changed

+692
-105
lines changed

docs/api/analysis/options.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
+# Options Analysis
+
+::: pyretailscience.options

pyretailscience/analysis/product_association.py

Lines changed: 17 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,6 @@
4141

4242
from pyretailscience.options import get_option
4343

44-
SUPPORTED_COMBINATIONS = 2
45-
4644

4745
class ProductAssociation:
4846
"""A class for generating and analyzing product association rules.
@@ -143,7 +141,7 @@ def __init__(
143141
)
144142

145143
@staticmethod
146-
def _calc_association( # (ignore complexity) - Excluded due to min_* arguments checks
144+
def _calc_association(
147145
df: pd.DataFrame | ibis.Table,
148146
value_col: str,
149147
group_col: str = get_option("column.customer_id"),
@@ -209,40 +207,34 @@ def _calc_association( # (ignore complexity) - Excluded due to min_* arguments
209207
if isinstance(df, pd.DataFrame):
210208
df = ibis.memtable(df)
211209

212-
unique_transactions = (
213-
df.group_by(group_col).aggregate(products=lambda t, col=value_col: t[col].collect()).order_by(group_col)
214-
)
215-
unique_transactions = unique_transactions.mutate(
216-
item=ibis.expr.operations.Unnest(unique_transactions["products"]),
217-
).drop("products")
210+
unique_transactions = df.select(df[group_col], df[value_col].name("item")).distinct()
218211

219212
total_transactions = unique_transactions[group_col].nunique().execute()
220-
221213
product_occurrences = (
222214
unique_transactions.group_by("item")
223-
.aggregate(occurrences=lambda t, col=group_col: t[col].nunique())
224-
.order_by("item")
225-
)
226-
product_occurrences = product_occurrences.mutate(
227-
occurrence_probability=product_occurrences["occurrences"] / total_transactions,
215+
.aggregate(
216+
occurrences=lambda t, col=group_col: t[col].nunique(),
217+
occurrence_probability=lambda t, col=group_col: t[col].nunique() / total_transactions,
218+
)
219+
.filter(lambda t: t.occurrences >= min_occurrences)
228220
)
229-
product_occurrences = product_occurrences.filter(product_occurrences["occurrences"] >= min_occurrences)
230221

231222
left_table = unique_transactions.rename({"item_1": "item"})
232223
right_table = unique_transactions.rename({"item_2": "item"})
233224

234-
merged_df = ibis.join(
235-
left_table,
225+
merged_df = left_table.join(
236226
right_table,
237-
predicates=[left_table[group_col] == right_table[group_col]],
227+
predicates=[
228+
left_table[group_col] == right_table[group_col],
229+
left_table["item_1"] < right_table["item_2"],
230+
],
238231
)
239-
merged_df = merged_df.filter(merged_df["item_1"] < merged_df["item_2"])
240232

241233
product_occurrences_1 = product_occurrences.rename(
242-
{"item_1": "item", "occurrences_x": "occurrences", "occurrence_probability_x": "occurrence_probability"},
234+
{"item_1": "item", "occurrences_1": "occurrences", "occurrence_probability_1": "occurrence_probability"},
243235
)
244236
product_occurrences_2 = product_occurrences.rename(
245-
{"item_2": "item", "occurrences_y": "occurrences", "occurrence_probability_y": "occurrence_probability"},
237+
{"item_2": "item", "occurrences_2": "occurrences", "occurrence_probability_2": "occurrence_probability"},
246238
)
247239

248240
merged_df = ibis.join(
@@ -255,13 +247,9 @@ def _calc_association( # (ignore complexity) - Excluded due to min_* arguments
255247
merged_df,
256248
product_occurrences_2,
257249
predicates=[merged_df["item_2"] == product_occurrences_2["item_2"]],
258-
).order_by([group_col, "item_1", "item_2"])
259-
260-
cooccurrences = (
261-
merged_df.group_by(["item_1", "item_2"])
262-
.aggregate(cooccurrences=merged_df[group_col].nunique())
263-
.order_by(["item_1", "cooccurrences"])
264250
)
251+
252+
cooccurrences = merged_df.group_by(["item_1", "item_2"]).aggregate(cooccurrences=merged_df[group_col].nunique())
265253
cooccurrences = cooccurrences.mutate(
266254
total_count=total_transactions,
267255
support=cooccurrences.cooccurrences / total_transactions,
@@ -286,7 +274,7 @@ def _calc_association( # (ignore complexity) - Excluded due to min_* arguments
286274
product_pairs,
287275
product_occurrences_2_rename,
288276
predicates=[product_pairs["item_2"] == product_occurrences_2_rename["item_2"]],
289-
).order_by(["item_1", "item_2"])
277+
)
290278

291279
product_pairs = product_pairs.mutate(
292280
confidence=product_pairs["cooccurrences"] / product_pairs["occurrences_1"],

pyretailscience/options.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def __init__(self) -> None:
6262
"column.suffix.difference": "diff",
6363
"column.suffix.percent_difference": "pct_diff",
6464
"column.suffix.contribution": "contrib",
65-
"column.suffix.period_1": "_p1",
66-
"column.suffix.period_2": "_p2",
65+
"column.suffix.period_1": "p1",
66+
"column.suffix.period_2": "p2",
6767
}
6868
self._descriptions: dict[str, str] = {
6969
# Database columns

0 commit comments

Comments
 (0)