Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.2.2"
rev: "v0.11.0"
hooks:
- id: ruff
args: ["--fix"]
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand Down
4 changes: 3 additions & 1 deletion docs/examples/cross_shop.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,9 @@
"source": [
"shoes_idx = df[\"category_1_name\"] == \"Shoes\"\n",
"df.loc[shoes_idx, \"category_1_name\"] = np.random.RandomState(42).choice(\n",
" [\"Shoes\", \"Jeans\"], size=shoes_idx.sum(), p=[0.5, 0.5],\n",
" [\"Shoes\", \"Jeans\"],\n",
" size=shoes_idx.sum(),\n",
" p=[0.5, 0.5],\n",
")"
]
},
Expand Down
4 changes: 3 additions & 1 deletion docs/examples/gain_loss.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,9 @@
"# Reassign half the rows to Calvin Klein and leave the other half as Diesel\n",
"p2_diesel_idx = time_period_2 & (df[\"brand_name\"] == \"Diesel\")\n",
"df.loc[p2_diesel_idx, \"brand_name\"] = np.random.RandomState(42).choice(\n",
" [\"Calvin Klein\", \"Diesel\"], size=p2_diesel_idx.sum(), p=[0.75, 0.25],\n",
" [\"Calvin Klein\", \"Diesel\"],\n",
" size=p2_diesel_idx.sum(),\n",
" p=[0.75, 0.25],\n",
")\n",
"\n",
"# Apply a 20% discount to Calvin Klein products and increase the quantity by 50%\n",
Expand Down
8 changes: 4 additions & 4 deletions docs/examples/segmentation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -701,10 +701,10 @@
" },\n",
" color=\"black\",\n",
" bbox={\n",
" \"facecolor\":\"white\",\n",
" \"edgecolor\":\"white\",\n",
" \"boxstyle\":\"round,rounding_size=0.75\",\n",
" \"pad\":0.75,\n",
" \"facecolor\": \"white\",\n",
" \"edgecolor\": \"white\",\n",
" \"boxstyle\": \"round,rounding_size=0.75\",\n",
" \"pad\": 0.75,\n",
" },\n",
" linespacing=1.5,\n",
")\n",
Expand Down
1 change: 0 additions & 1 deletion pyretailscience/analysis/cross_shop.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""This module contains the CrossShop class that is used to create a cross-shop diagram."""


import ibis
import matplotlib.pyplot as plt
import pandas as pd
Expand Down
1 change: 1 addition & 0 deletions pyretailscience/analysis/haversine.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
- **Requires Ibis-Compatible Backend**: Ensure your Ibis backend supports trigonometric functions.
- **Assumes Spherical Earth**: Uses the Haversine formula, which introduces slight inaccuracies due to Earth's oblate shape.
"""

import ibis


Expand Down
25 changes: 17 additions & 8 deletions pyretailscience/analysis/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@
def __init__(
self,
data: pd.DataFrame | ibis.Table,
segment_col: str = "segment_name",
segment_col: str | list[str] = "segment_name",
extra_aggs: dict[str, tuple[str, str]] | None = None,
) -> None:
"""Calculates transaction statistics by segment.
Expand All @@ -202,7 +202,8 @@
customer_id, unit_spend and transaction_id. If the dataframe contains the column unit_quantity, then
the columns unit_spend and unit_quantity are used to calculate the price_per_unit and
units_per_transaction.
segment_col (str, optional): The column to use for the segmentation. Defaults to "segment_name".
segment_col (str | list[str], optional): The column or list of columns to use for the segmentation.
Defaults to "segment_name".
extra_aggs (dict[str, tuple[str, str]], optional): Additional aggregations to perform.
The keys in the dictionary will be the column names for the aggregation results.
The values are tuples with (column_name, aggregation_function), where:
Expand All @@ -211,11 +212,14 @@
Example: {"stores": ("store_id", "nunique")} would count unique store_ids.
"""
cols = ColumnHelper()

if isinstance(segment_col, str):
segment_col = [segment_col]
required_cols = [
cols.customer_id,
cols.unit_spend,
cols.transaction_id,
segment_col,
*segment_col,
]
if cols.unit_qty in data.columns:
required_cols.append(cols.unit_qty)
Expand Down Expand Up @@ -273,14 +277,14 @@
@staticmethod
def _calc_seg_stats(
data: pd.DataFrame | ibis.Table,
segment_col: str,
segment_col: list[str],
extra_aggs: dict[str, tuple[str, str]] | None = None,
) -> ibis.Table:
"""Calculates the transaction statistics by segment.

Args:
data (pd.DataFrame | ibis.Table): The transaction data.
segment_col (str): The column to use for the segmentation.
segment_col (list[str]): The columns to use for the segmentation.
extra_aggs (dict[str, tuple[str, str]], optional): Additional aggregations to perform.
The keys in the dictionary will be the column names for the aggregation results.
The values are tuples with (column_name, aggregation_function).
Expand Down Expand Up @@ -314,7 +318,7 @@

# Calculate metrics for segments and total
segment_metrics = data.group_by(segment_col).aggregate(**aggs)
total_metrics = data.aggregate(**aggs).mutate(segment_name=ibis.literal("Total"))
total_metrics = data.aggregate(**aggs).mutate({col: ibis.literal("Total") for col in segment_col})
total_customers = data[cols.customer_id].nunique()

# Cross join with total_customers to make it available for percentage calculation
Expand Down Expand Up @@ -343,7 +347,7 @@
if self._df is None:
cols = ColumnHelper()
col_order = [
self.segment_col,
*self.segment_col,
*SegTransactionStats._get_col_order(include_quantity=cols.agg_unit_qty in self.table.columns),
]

Expand Down Expand Up @@ -392,18 +396,23 @@
Raises:
ValueError: If the sort_order is not "ascending", "descending" or None.
ValueError: If the orientation is not "vertical" or "horizontal".
ValueError: If multiple segment columns are used, as plotting is only supported for a single segment column.
"""
if sort_order not in ["ascending", "descending", None]:
raise ValueError("sort_order must be either 'ascending' or 'descending' or None")
if orientation not in ["vertical", "horizontal"]:
raise ValueError("orientation must be either 'vertical' or 'horizontal'")
if len(self.segment_col) > 1:
raise ValueError("Plotting is only supported for a single segment column")

default_title = f"{value_col.title()} by Segment"
kind = "bar"
if orientation == "horizontal":
kind = "barh"

val_s = self.df.set_index(self.segment_col)[value_col]
# Use the first segment column for plotting
plot_segment_col = self.segment_col[0]
val_s = self.df.set_index(plot_segment_col)[value_col]

Check warning on line 415 in pyretailscience/analysis/segmentation.py

View check run for this annotation

Codecov / codecov/patch

pyretailscience/analysis/segmentation.py#L414-L415

Added lines #L414 - L415 were not covered by tests
if hide_total:
val_s = val_s[val_s.index != "Total"]

Expand Down
1 change: 0 additions & 1 deletion pyretailscience/plots/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
- **Helper functions**: Utilizes utility functions from the `pyretailscience` package to handle styling, formatting, and other plot adjustments.
"""


import numpy as np
import pandas as pd
from matplotlib.axes import Axes, SubplotBase
Expand Down
1 change: 1 addition & 0 deletions pyretailscience/plots/venn.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
- **Pre-Aggregated Data Required**: The module does not perform data aggregation; input data should already be structured correctly.

"""

from collections.abc import Callable

import pandas as pd
Expand Down
2 changes: 1 addition & 1 deletion tests/analysis/test_cross_shop.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
cols = ColumnHelper()


@pytest.fixture()
@pytest.fixture
def sample_data():
"""Sample data for testing."""
return pd.DataFrame(
Expand Down
3 changes: 2 additions & 1 deletion tests/analysis/test_haversine.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""Tests for the haversine distance module."""

import ibis
import pandas as pd
import pytest

from pyretailscience.analysis.haversine import haversine_distance


@pytest.fixture()
@pytest.fixture
def sample_ibis_table():
"""Fixture to provide a sample Ibis table for testing."""
data = {
Expand Down
6 changes: 3 additions & 3 deletions tests/analysis/test_product_association.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
class TestProductAssociations:
"""Tests for the ProductAssociations class."""

@pytest.fixture()
@pytest.fixture
def transactions_df(self) -> pd.DataFrame:
"""Return a sample DataFrame for testing."""
# fmt: off
Expand All @@ -23,7 +23,7 @@ def transactions_df(self) -> pd.DataFrame:
})
# fmt: on

@pytest.fixture()
@pytest.fixture
def expected_results_single_items_df(self) -> pd.DataFrame:
"""Return the expected results for the single items association analysis."""
# fmt: off
Expand Down Expand Up @@ -58,7 +58,7 @@ def expected_results_single_items_df(self) -> pd.DataFrame:
)
# fmt: on

@pytest.fixture()
@pytest.fixture
def expected_results_pair_items_df(self) -> pd.DataFrame:
"""Return the expected results for the pair items association analysis."""
# fmt: off
Expand Down
2 changes: 1 addition & 1 deletion tests/analysis/test_revenue_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
class TestRevenueTree:
"""Test the RevenueTree class."""

@pytest.fixture()
@pytest.fixture
def cols(self):
"""Return a ColumnHelper instance."""
return ColumnHelper()
Expand Down
76 changes: 71 additions & 5 deletions tests/analysis/test_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class TestCalcSegStats:
"""Tests for the _calc_seg_stats method."""

@pytest.fixture()
@pytest.fixture
def base_df(self):
"""Return a base DataFrame for testing."""
return pd.DataFrame(
Expand Down Expand Up @@ -314,6 +314,70 @@ def test_handles_empty_dataframe_with_errors(self):
with pytest.raises(ValueError):
SegTransactionStats(df, "segment_name")

def test_multiple_segment_columns(self):
    """Segmenting on two columns yields one row per observed combination plus a Total row."""
    segment_cols = ["segment_name", "region"]
    transactions = pd.DataFrame(
        {
            cols.customer_id: [1, 1, 2, 2, 3, 3],
            cols.unit_spend: [100.0, 150.0, 200.0, 250.0, 300.0, 350.0],
            cols.transaction_id: [101, 102, 103, 104, 105, 106],
            "segment_name": ["A", "A", "B", "B", "A", "A"],
            "region": ["North", "North", "South", "South", "East", "East"],
        },
    )

    # Pass a copy so the list handed to the class is never shared with the sort keys below.
    stats = SegTransactionStats(transactions, list(segment_cols))

    # Only combinations present in the data are expected (A/East, A/North, B/South),
    # followed by a row labelled "Total" in every segment column.
    expected = pd.DataFrame(
        {
            "segment_name": ["A", "A", "B", "Total"],
            "region": ["East", "North", "South", "Total"],
            cols.agg_unit_spend: [650.0, 250.0, 450.0, 1350.0],
            cols.agg_transaction_id: [2, 2, 2, 6],
            cols.agg_customer_id: [1, 1, 1, 3],
            cols.calc_spend_per_cust: [650.0, 250.0, 450.0, 450.0],
            cols.calc_spend_per_trans: [325.0, 125.0, 225.0, 225.0],
            cols.calc_trans_per_cust: [2.0, 2.0, 2.0, 2.0],
            cols.customers_pct: [1 / 3, 1 / 3, 1 / 3, 1.0],
        },
    )

    # Row order of the result is not guaranteed, so normalise both frames the same way.
    actual = stats.df.sort_values(segment_cols).reset_index(drop=True)
    expected = expected.sort_values(segment_cols).reset_index(drop=True)

    # Every segment column must survive into the output.
    for segment_col in segment_cols:
        assert segment_col in actual.columns

    # Same number of rows: the observed combinations plus Total, not the full cross product.
    assert len(actual) == len(expected)

    # Compare only the columns this test specifies, in the expected column order.
    pd.testing.assert_frame_equal(actual[expected.columns], expected)

def test_plot_with_multiple_segment_columns(self):
    """Plotting is rejected with a ValueError when more than one segment column is in use."""
    transactions = pd.DataFrame(
        {
            cols.customer_id: [1, 2, 3],
            cols.unit_spend: [100.0, 200.0, 300.0],
            cols.transaction_id: [101, 102, 103],
            "segment_name": ["A", "B", "A"],
            "region": ["North", "South", "East"],
        },
    )
    stats = SegTransactionStats(transactions, ["segment_name", "region"])

    # `match` searches the exception text, so this checks both the type and the message.
    with pytest.raises(ValueError, match="Plotting is only supported for a single segment column"):
        stats.plot("spend")

def test_extra_aggs_functionality(self):
"""Test that the extra_aggs parameter works correctly."""
# Constants for expected values
Expand Down Expand Up @@ -365,9 +429,11 @@ def test_extra_aggs_functionality(self):
# Sort by segment_name to ensure consistent order
result_df_multi = seg_stats_multi.df.sort_values("segment_name").reset_index(drop=True)

assert result_df_multi.loc[0, "distinct_products"] == segment_a_product_count # Segment A
assert result_df_multi.loc[1, "distinct_products"] == segment_b_product_count # Segment B
assert result_df_multi.loc[2, "distinct_products"] == total_product_count # Total
assert result_df_multi["distinct_products"].to_list() == [
segment_a_product_count,
segment_b_product_count,
total_product_count,
]

def test_extra_aggs_with_invalid_column(self):
"""Test that an error is raised when an invalid column is specified in extra_aggs."""
Expand Down Expand Up @@ -405,7 +471,7 @@ def test_extra_aggs_with_invalid_function(self):
class TestHMLSegmentation:
"""Tests for the HMLSegmentation class."""

@pytest.fixture()
@pytest.fixture
def base_df(self):
"""Return a base DataFrame for testing."""
return pd.DataFrame(
Expand Down
7 changes: 4 additions & 3 deletions tests/plots/test_area.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Tests for the plots.area module."""

from itertools import cycle

import numpy as np
Expand All @@ -13,7 +14,7 @@
RNG = np.random.default_rng(42)


@pytest.fixture()
@pytest.fixture
def sample_dataframe():
"""A sample dataframe for Jeans sales data."""
data = {
Expand All @@ -24,7 +25,7 @@ def sample_dataframe():
return pd.DataFrame(data)


@pytest.fixture()
@pytest.fixture
def _mock_color_generators(mocker):
"""Mock the color generators for single and multi color maps."""
single_color_gen = cycle(["#FF0000"]) # Mocked single-color generator (e.g., red)
Expand All @@ -34,7 +35,7 @@ def _mock_color_generators(mocker):
mocker.patch("pyretailscience.style.tailwind.get_multi_color_cmap", return_value=multi_color_gen)


@pytest.fixture()
@pytest.fixture
def _mock_gu_functions(mocker):
mocker.patch("pyretailscience.style.graph_utils.standard_graph_styles", side_effect=lambda ax, **kwargs: ax)
mocker.patch("pyretailscience.style.graph_utils.standard_tick_styles", side_effect=lambda ax: ax)
Expand Down
Loading