Commit 89b66d5

Merge branch 'main' of github.com:data-simply/pyretailscience into feature/product-association

2 parents 35f0677 + 54b8bbf

27 files changed: +314 -219 lines

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,12 +1,12 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.2.2"
+    rev: "v0.11.0"
     hooks:
       - id: ruff
         args: ["--fix"]
       - id: ruff-format
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer

docs/examples/cross_shop.ipynb

Lines changed: 3 additions & 1 deletion

@@ -238,7 +238,9 @@
    "source": [
     "shoes_idx = df[\"category_1_name\"] == \"Shoes\"\n",
     "df.loc[shoes_idx, \"category_1_name\"] = np.random.RandomState(42).choice(\n",
-    "    [\"Shoes\", \"Jeans\"], size=shoes_idx.sum(), p=[0.5, 0.5],\n",
+    "    [\"Shoes\", \"Jeans\"],\n",
+    "    size=shoes_idx.sum(),\n",
+    "    p=[0.5, 0.5],\n",
     ")"
    ]
   },
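
The change is formatting only (one keyword argument per line, likely prompted by the ruff bump above). For context, a self-contained sketch of what the cell does, on hypothetical toy data: each "Shoes" row is relabelled as "Shoes" or "Jeans" with equal probability, with a fixed seed for reproducibility.

    import numpy as np
    import pandas as pd

    # Hypothetical toy frame; the notebook works on a larger simulated dataset.
    df = pd.DataFrame({"category_1_name": ["Shoes", "Shoes", "Shoes", "Shoes", "Socks"]})

    shoes_idx = df["category_1_name"] == "Shoes"
    df.loc[shoes_idx, "category_1_name"] = np.random.RandomState(42).choice(
        ["Shoes", "Jeans"],
        size=shoes_idx.sum(),
        p=[0.5, 0.5],
    )
    print(df["category_1_name"].value_counts())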

docs/examples/gain_loss.ipynb

Lines changed: 3 additions & 1 deletion

@@ -254,7 +254,9 @@
     "# Reasign half the rows to Calvin Klein and leave the other half as Diesel\n",
     "p2_diesel_idx = time_period_2 & (df[\"brand_name\"] == \"Diesel\")\n",
     "df.loc[p2_diesel_idx, \"brand_name\"] = np.random.RandomState(42).choice(\n",
-    "    [\"Calvin Klein\", \"Diesel\"], size=p2_diesel_idx.sum(), p=[0.75, 0.25],\n",
+    "    [\"Calvin Klein\", \"Diesel\"],\n",
+    "    size=p2_diesel_idx.sum(),\n",
+    "    p=[0.75, 0.25],\n",
     ")\n",
     "\n",
     "# Apply a 20% discount to Calvin Klein products and increase the quantity by 50%\n",

docs/examples/segmentation.ipynb

Lines changed: 4 additions & 4 deletions

@@ -701,10 +701,10 @@
     "    },\n",
     "    color=\"black\",\n",
     "    bbox={\n",
-    "        \"facecolor\":\"white\",\n",
-    "        \"edgecolor\":\"white\",\n",
-    "        \"boxstyle\":\"round,rounding_size=0.75\",\n",
-    "        \"pad\":0.75,\n",
+    "        \"facecolor\": \"white\",\n",
+    "        \"edgecolor\": \"white\",\n",
+    "        \"boxstyle\": \"round,rounding_size=0.75\",\n",
+    "        \"pad\": 0.75,\n",
     "    },\n",
     "    linespacing=1.5,\n",
     ")\n",

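Formatting only again (spaces after the dict colons). For context, a minimal sketch of the matplotlib call this cell configures: a text label drawn inside a rounded white box via the `bbox` dict (toy values, not the notebook's full plot).

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.text(
        0.5,
        0.5,
        "Segment label",
        color="black",
        ha="center",
        bbox={
            "facecolor": "white",
            "edgecolor": "white",
            "boxstyle": "round,rounding_size=0.75",  # rounded corners, radius 0.75
            "pad": 0.75,  # padding between the text and the box edge
        },
        linespacing=1.5,
    )
    plt.show()
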
pyproject.toml

Lines changed: 13 additions & 91 deletions

@@ -1,129 +1,51 @@
 [project]
 name = "pyretailscience"
-version = "0.9.0"
+version = "0.10.0"
 description = "Retail Data Science Tools"
 requires-python = ">=3.10,<3.13"
 readme = "README.md"
 license = "Elastic-2.0"
-dependencies = [
-    "pandas>=2.1.4,<3",
-    "pyarrow>=14.0.2,<15",
-    "matplotlib>=3.9.1,<4",
-    "numpy>=1.26.3,<2",
-    "loguru>=0.7.2,<0.8",
-    "tqdm>=4.66.1,<5",
-    "scipy>=1.13.0,<2",
-    "scikit-learn>=1.4.2,<2",
-    "matplotlib-set-diagrams~=0.0.2",
-    "toml>=0.10.2,<0.11",
-    "duckdb>=1.0.0,<2",
-    "graphviz>=0.20.3,<0.21",
-    "ibis-framework[duckdb]>=9.5.0,<10",
-]
+dependencies = [ "pandas>=2.1.4,<3", "pyarrow>=14.0.2,<15", "matplotlib>=3.9.1,<4", "numpy>=1.26.3,<2", "loguru>=0.7.2,<0.8", "tqdm>=4.66.1,<5", "scipy>=1.13.0,<2", "scikit-learn>=1.4.2,<2", "matplotlib-set-diagrams~=0.0.2", "toml>=0.10.2,<0.11", "duckdb>=1.0.0,<2", "graphviz>=0.20.3,<0.21", "ibis-framework[duckdb]>=9.5.0,<10",]
 [[project.authors]]
 name = "Murray Vanwyk"
 
 
 [dependency-groups]
-dev = [
-    "pytest>=8.0.0,<9",
-    "pytest-cov>=4.1.0,<5",
-    "nbstripout>=0.7.1,<0.8",
-    "ruff>=0.9,<0.10",
-    "pre-commit>=3.6.2,<4",
-    "pytest-mock>=3.14.0,<4",
-]
-examples = ["jupyterlab>=4.2.5,<5", "tqdm>=4.66.1,<5"]
-docs = [
-    "mkdocs-material>=9.5.4,<10",
-    "mkdocstrings[python]>=0.24.0,<0.25",
-    "mkdocs>=1.5.3,<2",
-    "mkdocs-jupyter>=0.24.6,<0.25",
-]
+dev = [ "pytest>=8.0.0,<9", "pytest-cov>=4.1.0,<5", "nbstripout>=0.7.1,<0.8", "ruff>=0.9,<0.10", "pre-commit>=3.6.2,<4", "pytest-mock>=3.14.0,<4",]
+examples = [ "jupyterlab>=4.2.5,<5", "tqdm>=4.66.1,<5",]
+docs = [ "mkdocs-material>=9.5.4,<10", "mkdocstrings[python]>=0.24.0,<0.25", "mkdocs>=1.5.3,<2", "mkdocs-jupyter>=0.24.6,<0.25",]
 
 [build-system]
-requires = ["hatchling"]
+requires = [ "hatchling",]
 build-backend = "hatchling.build"
 
 [tool.uv]
-default-groups = ["dev", "examples", "docs"]
+default-groups = [ "dev", "examples", "docs",]
 
 [tool.ruff]
 target-version = "py310"
 line-length = 120
 show-fixes = true
 
 [tool.ruff.lint]
-ignore = ["ANN101", "ANN102", "EM101", "TRY003", "PT011", "PTH123", "SLF001"]
-select = [
-    "A",
-    "ANN",
-    "ARG",
-    "B",
-    "BLE",
-    "C4",
-    "C90",
-    "COM",
-    "D",
-    "D1",
-    "D2",
-    "D3",
-    "D4",
-    "DTZ",
-    "EM",
-    "ERA",
-    "EXE",
-    "F",
-    "FA",
-    "FLY",
-    "G",
-    "I",
-    "ICN",
-    "INP",
-    "INT",
-    "ISC",
-    "N",
-    "NPY",
-    "PERF",
-    "PGH",
-    "PIE",
-    "PL",
-    "PT",
-    "PTH",
-    "PYI",
-    "Q",
-    "RET",
-    "RUF",
-    "RSE",
-    "S",
-    "SIM",
-    "SLF",
-    "SLOT",
-    "T10",
-    "T20",
-    "TCH",
-    "TID",
-    "TRY",
-    "UP",
-    "W",
-    "YTT",
-]
+ignore = [ "ANN101", "ANN102", "EM101", "TRY003", "PT011", "PTH123", "SLF001",]
+select = [ "A", "ANN", "ARG", "B", "BLE", "C4", "C90", "COM", "D", "D1", "D2", "D3", "D4", "DTZ", "EM", "ERA", "EXE", "F", "FA", "FLY", "G", "I", "ICN", "INP", "INT", "ISC", "N", "NPY", "PERF", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "Q", "RET", "RUF", "RSE", "S", "SIM", "SLF", "SLOT", "T10", "T20", "TCH", "TID", "TRY", "UP", "W", "YTT",]
 
 [tool.pytest.ini_options]
 addopts = "--cov=pyretailscience --cov-report=term-missing --cov-branch"
 
 [tool.coverage.run]
 branch = true
-source = ["pyretailscience"]
+source = [ "pyretailscience",]
 
 [tool.coverage.report]
 show_missing = true
 skip_covered = true
 
 [tool.ruff.lint.per-file-ignores]
-"__init__.py" = ["F401", "F403", "F405", "D104"]
-"tests/*" = ["ANN", "ARG", "INP001", "S101", "SLF001"]
-"*.ipynb" = ["T201"]
+"__init__.py" = [ "F401", "F403", "F405", "D104",]
+"tests/*" = [ "ANN", "ARG", "INP001", "S101", "SLF001",]
+"*.ipynb" = [ "T201",]
 
 [tool.ruff.lint.pylint]
 max-args = 15
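
The collapsed single-line arrays with the `[ ...,]` trailing-comma style are the signature of a programmatic TOML round-trip rather than a hand edit. A plausible sketch of how such a rewrite happens, assuming Python's `toml` package (already a project dependency) and an automated version-bump step, neither of which this commit confirms:

    import toml

    # Loading and re-dumping pyproject.toml rewrites multi-line arrays into the
    # single-line "[ ...,]" form seen in this diff (assumed cause, not confirmed).
    with open("pyproject.toml") as f:
        data = toml.load(f)

    data["project"]["version"] = "0.10.0"  # the version bump in this commit

    with open("pyproject.toml", "w") as f:
        toml.dump(data, f)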

pyretailscience/analysis/cross_shop.py

Lines changed: 0 additions & 1 deletion

@@ -1,6 +1,5 @@
 """This module contains the CrossShop class that is used to create a cross-shop diagram."""
 
-
 import ibis
 import matplotlib.pyplot as plt
 import pandas as pd

pyretailscience/analysis/haversine.py

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@
 - **Requires Ibis-Compatible Backend**: Ensure your Ibis backend supports trigonometric functions.
 - **Assumes Spherical Earth**: Uses the Haversine formula, which introduces slight inaccuracies due to Earth's oblate shape.
 """
+
 import ibis
 
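
Aside from the blank line after the docstring, nothing changes here. For reference, the Haversine great-circle distance the module is named for, as a plain-Python sketch (not the module's Ibis implementation):

    import math

    def haversine_km(lat1: float, lon1: float, lat2: float, lon2: float, radius_km: float = 6371.0) -> float:
        """Great-circle distance between two points on a sphere, via the Haversine formula."""
        phi1, phi2 = math.radians(lat1), math.radians(lat2)
        dphi = math.radians(lat2 - lat1)
        dlambda = math.radians(lon2 - lon1)
        a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2
        return 2 * radius_km * math.asin(math.sqrt(a))

    # New York to London: roughly 5,570 km on a spherical Earth.
    print(round(haversine_km(40.7128, -74.0060, 51.5074, -0.1278)))
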
pyretailscience/analysis/product_association.py

Lines changed: 38 additions & 76 deletions

@@ -35,7 +35,6 @@
 operations, and drive business growth.
 """
 
-
 import ibis
 import pandas as pd
 
@@ -163,7 +162,7 @@ def _calc_association(
         group_col (str, optional): The name of the column that identifies unique transactions or customers. Defaults
             to option column.unit_spend.
         target_item (str or None, optional): A specific product to focus the association analysis on. If None,
-           associations for all products are calculated. Defaults to None.
+            associations for all products are calculated. Defaults to None.
         min_occurrences (int, optional): The minimum number of occurrences required for each product in the
             association analysis. Defaults to 1. Must be at least 1.
         min_cooccurrences (int, optional): The minimum number of co-occurrences required for the product pairs in
@@ -207,7 +206,7 @@ def _calc_association(
     if isinstance(df, pd.DataFrame):
         df = ibis.memtable(df)
 
-    unique_transactions = df
+    unique_transactions = df.select(df[group_col], df[value_col]).distinct()
     total_transactions = unique_transactions.alias("t")[group_col].nunique().name("total_count")
 
     product_occurrences = (
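
The `.distinct()` is the substantive fix here: the old code passed the raw table straight through, so a product appearing twice in the same transaction would be double-counted in the occurrence and co-occurrence tallies. A minimal pandas sketch of the same deduplication, on hypothetical toy data:

    import pandas as pd

    # Toy rows; "bread" is scanned twice in transaction 1 (hypothetical data).
    df = pd.DataFrame(
        {
            "transaction_id": [1, 1, 1, 2],
            "product": ["bread", "bread", "milk", "milk"],
        },
    )

    # Equivalent of df.select(group_col, value_col).distinct() in the diff:
    unique_transactions = df[["transaction_id", "product"]].drop_duplicates()
    print(unique_transactions)  # one row per (transaction, product) pair
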
@@ -219,15 +218,22 @@ def _calc_association(
         .filter(lambda t: t.occurrences >= min_occurrences)
     )
 
-    left_table = unique_transactions.mutate(item_1=unique_transactions[value_col]).drop(value_col)
-    right_table = unique_transactions.mutate(item_2=unique_transactions[value_col]).drop(value_col)
-
+    left_table = unique_transactions.rename({"item_1": value_col})
+    right_table = unique_transactions.rename({"item_2": value_col})
+
+    join_logic = [left_table[group_col] == right_table[group_col]]
+    if target_item is None:
+        join_logic.append(left_table["item_1"] < right_table["item_2"])
+    else:
+        join_logic.extend(
+            [
+                left_table["item_1"] != right_table["item_2"],
+                left_table["item_1"] == target_item,
+            ],
+        )
     merged_df = left_table.join(
         right_table,
-        predicates=[
-            left_table[group_col] == right_table[group_col],
-            left_table["item_1"] < right_table["item_2"],
-        ],
+        predicates=join_logic,
         lname="",
         rname="{name}_right",
     )
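
The join predicates now branch on `target_item`: without a target, `item_1 < item_2` keeps each unordered pair exactly once (and drops self-pairs); with a target, every ordered pair anchored on the target item survives. A minimal pandas sketch of the two modes, on hypothetical toy data ("bread" stands in for `target_item`):

    import pandas as pd

    # One row per (transaction, product) pair, as after the distinct() above.
    df = pd.DataFrame(
        {
            "transaction_id": [1, 1, 1, 2, 2],
            "product": ["bread", "butter", "milk", "bread", "milk"],
        },
    )
    left = df.rename(columns={"product": "item_1"})
    right = df.rename(columns={"product": "item_2"})

    # Self-join on the transaction key pairs every item with every other item
    # bought in the same transaction.
    pairs = left.merge(right, on="transaction_id")

    # target_item is None: "<" keeps each unordered pair once, e.g. (bread, milk)
    # but never (milk, bread) or (bread, bread).
    all_pairs = pairs[pairs["item_1"] < pairs["item_2"]]

    # target_item given: keep all ordered pairs anchored on the target.
    target_pairs = pairs[(pairs["item_1"] != pairs["item_2"]) & (pairs["item_1"] == "bread")]
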
@@ -239,14 +245,12 @@ def _calc_association(
         {"item_2": value_col, "occurrences_2": "occurrences", "occurrence_probability_2": "occurrence_probability"},
     )
 
-    merged_df = ibis.join(
-        merged_df,
+    merged_df = merged_df.join(
         product_occurrences_1,
         predicates=[merged_df["item_1"] == product_occurrences_1["item_1"]],
     )
 
-    merged_df = ibis.join(
-        merged_df,
+    merged_df = merged_df.join(
         product_occurrences_2,
         predicates=[merged_df["item_2"] == product_occurrences_2["item_2"]],
     )
@@ -266,13 +270,11 @@ def _calc_association(
         {"item_2": value_col, "occurrences_2": "occurrences", "prob_2": "occurrence_probability"},
     )
 
-    product_pairs = ibis.join(
-        cooccurrences,
+    product_pairs = cooccurrences.join(
         product_occurrences_1_rename,
         predicates=[cooccurrences["item_1"] == product_occurrences_1_rename["item_1"]],
     )
-    product_pairs = ibis.join(
-        product_pairs,
+    product_pairs = product_pairs.join(
         product_occurrences_2_rename,
         predicates=[product_pairs["item_2"] == product_occurrences_2_rename["item_2"]],
     )
@@ -282,74 +284,34 @@
     product_pairs = product_pairs.mutate(
         uplift=product_pairs["support"] / (product_pairs["prob_1"] * product_pairs["prob_2"]),
     )
 
-    result = product_pairs.filter(
-        (product_pairs.confidence >= min_confidence) & (product_pairs.uplift >= min_uplift),
-    )
-
-    inverse_pairs = result.rename(
-        {
-            f"{value_col}_2": "item_1",
-            f"{value_col}_1": "item_2",
-            "occurrences_2": "occurrences_1",
-            "occurrences_1": "occurrences_2",
-        },
-    )
-
-    product_occurrences_1_rename2 = product_occurrences.rename({f"{value_col}_1": value_col})
-    product_occurrences_2_rename2 = product_occurrences.rename({f"{value_col}_2": value_col})
+    result = product_pairs.filter(product_pairs.uplift >= min_uplift)
 
-    inverse_pairs = ibis.join(
-        inverse_pairs,
-        product_occurrences_1_rename2,
-        predicates=[inverse_pairs[f"{value_col}_1"] == product_occurrences_1_rename2[f"{value_col}_1"]],
-    )
-    inverse_pairs = ibis.join(
-        inverse_pairs,
-        product_occurrences_2_rename2,
-        predicates=[inverse_pairs[f"{value_col}_2"] == product_occurrences_2_rename2[f"{value_col}_2"]],
-    )
-    inverse_pairs = inverse_pairs.mutate(
-        confidence=inverse_pairs["cooccurrences"] / inverse_pairs["occurrences_1"],
-        uplift=inverse_pairs["support"] / (inverse_pairs["prob_1"] * inverse_pairs["prob_2"]),
-    )
-
-    result = result.rename({f"{value_col}_1": "item_1", f"{value_col}_2": "item_2"})
-    result = result[
-        [
-            f"{value_col}_1",
-            f"{value_col}_2",
+    if target_item is None:
+        col_order = [
+            "item_1",
+            "item_2",
             "occurrences_1",
             "occurrences_2",
             "cooccurrences",
             "support",
             "confidence",
             "uplift",
         ]
-    ]
-    inverse_pairs = inverse_pairs[
-        [
-            f"{value_col}_1",
-            f"{value_col}_2",
-            "occurrences_1",
-            "occurrences_2",
-            "cooccurrences",
-            "support",
-            "confidence",
-            "uplift",
-        ]
-    ]
-
-    result = result.execute()
-    inverse_pairs = inverse_pairs.execute()
+        inverse_pairs = result.mutate(
+            item_1=result["item_2"],
+            item_2=result["item_1"],
+            occurrences_1=result["occurrences_2"],
+            occurrences_2=result["occurrences_1"],
+            prob_1=result["prob_2"],
+            prob_2=result["prob_1"],
+            confidence=result["cooccurrences"] / result["occurrences_2"],
+        )
+        result = result[col_order].union(inverse_pairs[col_order])
 
-    final_result = (
-        pd.concat([result, inverse_pairs], ignore_index=True)
-        .sort_values(by=[f"{value_col}_1", f"{value_col}_2"])
-        .reset_index(drop=True)
-    )
+    result = result.filter(result.confidence >= min_confidence)
 
-    if target_item is not None:
-        final_result = final_result[final_result[f"{value_col}_1"] == target_item].reset_index(drop=True)
+    final_result = result.execute().sort_values(by=["item_1", "item_2"]).reset_index(drop=True)
+    final_result = final_result.rename(columns={"item_1": f"{value_col}_1", "item_2": f"{value_col}_2"})
 
     return final_result[
         [