-
Notifications
You must be signed in to change notification settings - Fork 1
Setup GCP BigQuery Integration Tests for Analysis Modules #244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
f51ea78
00783ab
54db275
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
GCP_PROJECT_ID = |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
name: BigQuery Integration Tests | ||
|
||
on: | ||
workflow_dispatch: | ||
inputs: | ||
test_suite: | ||
type: choice | ||
description: Test Suite to Run | ||
default: "all" | ||
options: | ||
- all | ||
- cohort_analysis | ||
- composite_rank | ||
- cross_shop | ||
- customer_decision_hierarchy | ||
- haversine | ||
- hml_segmentation | ||
- product_association | ||
- revenue_tree | ||
- rfm_segmentation | ||
- segstats_segmentation | ||
- threshold_segmentation | ||
|
||
permissions: | ||
contents: read | ||
|
||
concurrency: | ||
group: "bigquery-tests" | ||
cancel-in-progress: true | ||
|
||
jobs: | ||
integration-tests: | ||
name: Run BigQuery Integration Tests | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v4 | ||
|
||
- name: Setup Python | ||
uses: actions/setup-python@v5 | ||
with: | ||
python-version: "3.11" | ||
|
||
- name: Install uv Package | ||
run: | | ||
pip install --upgrade pip | ||
pip install uv==0.5.30 | ||
|
||
- name: Install Dependencies | ||
run: | | ||
uv sync | ||
uv sync --group dev | ||
|
||
- name: Set up GCP Authentication | ||
uses: google-github-actions/auth@v2 | ||
with: | ||
credentials_json: ${{ secrets.GCP_SA_KEY }} | ||
|
||
- name: Run Integration Tests | ||
env: | ||
TEST_SUITE: ${{ inputs.test_suite }} | ||
run: | | ||
uv run pytest tests/integration/bigquery -v \ | ||
$(if [ "$TEST_SUITE" != "all" ]; then echo "-k $TEST_SUITE"; fi) |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -25,7 +25,7 @@ repos: | |||||
hooks: | ||||||
- id: pytest | ||||||
name: pytest | ||||||
entry: uv run pytest --cov=pyretailscience --cov-report=xml --cov-branch tests | ||||||
entry: uv run pytest --cov=pyretailscience --cov-report=xml --cov-branch tests --ignore=tests/integration/bigquery | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Eventually we'll also have other DB integration tests. Maybe we should just ignore the whole `tests/integration` directory.
Suggested change
|
||||||
language: system | ||||||
types: [python] | ||||||
pass_filenames: false | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -83,14 +83,11 @@ def __init__( | |||||
window = ibis.window(order_by=ibis.asc(df[value_col])) | ||||||
df = df.mutate(ptile=ibis.percent_rank().over(window)) | ||||||
|
||||||
case = ibis.case() | ||||||
|
||||||
case_args = [] | ||||||
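There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you can do this with a list comprehension, i.e. `case_args = [(df["ptile"] <= quantile, segment) for quantile, segment in zip(thresholds, segments, strict=True)]`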
Suggested change
|
||||||
for quantile, segment in zip(thresholds, segments, strict=True): | ||||||
case = case.when(df["ptile"] <= quantile, segment) | ||||||
|
||||||
case = case.end() | ||||||
case_args.append((df["ptile"] <= quantile, segment)) | ||||||
|
||||||
df = df.mutate(segment_name=case).drop(["ptile"]) | ||||||
df = df.mutate(segment_name=ibis.cases(*case_args)).drop(["ptile"]) | ||||||
|
||||||
if zero_value_customers == "separate_segment": | ||||||
df = ibis.union(df, zero_df) | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
"""BigQuery integration test fixtures.""" | ||
|
||
import os | ||
|
||
import ibis | ||
import pytest | ||
from dotenv import load_dotenv | ||
from google.cloud import bigquery | ||
from loguru import logger | ||
|
||
load_dotenv() | ||
client = bigquery.Client(project="pyretailscience-infra") | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def bigquery_connection(): | ||
"""Connect to BigQuery for integration tests.""" | ||
try: | ||
conn = ibis.bigquery.connect( | ||
project_id=os.environ.get("GCP_PROJECT_ID"), | ||
) | ||
logger.info("Connected to BigQuery") | ||
except Exception as e: | ||
logger.error(f"Failed to connect to BigQuery: {e}") | ||
raise | ||
else: | ||
return conn | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def transactions_table(bigquery_connection): | ||
"""Get the transactions table for testing.""" | ||
return bigquery_connection.table("test_data.transactions") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
"""Integration tests for Cohort Analysis with BigQuery.""" | ||
|
||
import pytest | ||
|
||
from pyretailscience.analysis.cohort import CohortAnalysis | ||
|
||
|
||
def test_cohort_analysis_with_bigquery(transactions_table): | ||
"""Integration test for CohortAnalysis using BigQuery backend and Ibis table. | ||
|
||
This test ensures that the CohortAnalysis class initializes and executes successfully | ||
using BigQuery data with various combinations of aggregation parameters. | ||
""" | ||
limited_table = transactions_table.limit(5000) | ||
|
||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you remove the try/except blocks from around the code? They're unnecessary. |
||
CohortAnalysis( | ||
df=limited_table, | ||
aggregation_column="unit_spend", | ||
agg_func="sum", | ||
period="week", | ||
percentage=True, | ||
) | ||
except Exception as e: # noqa: BLE001 | ||
pytest.fail( | ||
f"CohortAnalysis failed: {e}", | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
"""Integration tests for Composite Rank Analysis with BigQuery.""" | ||
|
||
import pytest | ||
|
||
from pyretailscience.analysis.composite_rank import CompositeRank | ||
|
||
|
||
@pytest.fixture | ||
def test_transactions_table(transactions_table): | ||
"""Fetch test transactions data from BigQuery and convert it to a pandas DataFrame. | ||
|
||
The expected table should include columns like `product_id`, `unit_spend`, and `customer_id`. | ||
Adds a calculated column `spend_per_customer`. | ||
""" | ||
try: | ||
ibis_table = transactions_table.mutate( | ||
spend_per_customer=transactions_table.unit_spend / transactions_table.customer_id, | ||
) | ||
mvanwyk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
except Exception as e: # noqa: BLE001 | ||
pytest.fail(f"Failed to fetch or preprocess test data: {e}") | ||
else: | ||
return ibis_table | ||
Comment on lines
+8
to
+23
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't need to add this to test the composite rank. I'd just remove this code and the column from the test. |
||
|
||
|
||
def test_composite_rank_basic(test_transactions_table): | ||
"""Test basic CompositeRank functionality with BigQuery data.""" | ||
rank_cols = [ | ||
("unit_spend", "desc"), | ||
("customer_id", "desc"), | ||
("spend_per_customer", "desc"), | ||
] | ||
try: | ||
result = CompositeRank( | ||
df=test_transactions_table, | ||
rank_cols=rank_cols, | ||
agg_func="mean", | ||
ignore_ties=False, | ||
) | ||
assert result is not None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You need to call `.execute()` on this; otherwise it's not actually testing on BQ. |
||
except Exception as e: # noqa: BLE001 | ||
pytest.fail(f"CompositeRank basic test failed: {e}") | ||
murray-ds marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
@pytest.mark.parametrize("ignore_ties", [False, True]) | ||
def test_tie_handling(test_transactions_table, ignore_ties): | ||
"""Test handling of ties during rank calculation.""" | ||
rank_cols = [("unit_spend", "desc")] | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you remove the try/except block here? Parametrize handles this. |
||
result = CompositeRank( | ||
df=test_transactions_table, | ||
rank_cols=rank_cols, | ||
agg_func="mean", | ||
ignore_ties=ignore_ties, | ||
) | ||
assert result is not None | ||
except Exception as e: # noqa: BLE001 | ||
pytest.fail(f"CompositeRank failed with ignore_ties={ignore_ties}: {e}") | ||
mvanwyk marked this conversation as resolved.
Show resolved
Hide resolved
|
mvanwyk marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
"""Integration tests for Cross Shop Analysis with BigQuery.""" | ||
|
||
import pytest | ||
|
||
from pyretailscience.analysis.cross_shop import CrossShop | ||
|
||
|
||
def test_cross_shop_with_bigquery(transactions_table): | ||
"""Test CrossShop with data fetched from BigQuery. | ||
|
||
This parameterized test verifies that CrossShop can be initialized | ||
and run with data from BigQuery using different combinations of group columns, | ||
value columns, and aggregation functions without throwing exceptions. | ||
""" | ||
transactions_df = transactions_table.limit(5000) | ||
group_1_col = "brand_name" | ||
group_2_col = "category_0_name" | ||
group_3_col = "category_1_name" | ||
group_1_vals = transactions_df[group_1_col].execute().dropna().unique() | ||
group_2_vals = transactions_df[group_2_col].execute().dropna().unique() | ||
|
||
group_1_val = group_1_vals[0] | ||
group_2_val = group_2_vals[0] | ||
|
||
group_3_val = None | ||
if group_3_col is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This needs to be parametrized; otherwise it doesn't make sense. |
||
group_3_vals = transactions_df[group_3_col].execute().dropna().unique() | ||
if len(group_3_vals) == 0: | ||
pytest.skip(f"Not enough unique values for {group_3_col}") | ||
group_3_val = group_3_vals[0] | ||
|
||
labels = ["Group 1", "Group 2"] if group_3_col is None else ["Group 1", "Group 2", "Group 3"] | ||
|
||
try: | ||
CrossShop( | ||
df=transactions_table, | ||
group_1_col=group_1_col, | ||
group_1_val=group_1_val, | ||
group_2_col=group_2_col, | ||
group_2_val=group_2_val, | ||
group_3_col=group_3_col, | ||
group_3_val=group_3_val, | ||
labels=labels, | ||
value_col="unit_quantity", | ||
agg_func="count", | ||
) | ||
|
||
except Exception as e: # noqa: BLE001 | ||
pytest.fail(f"CrossShop failed with parameters {group_1_col}, {group_2_col}, {group_3_col}: {e}") |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -0,0 +1,46 @@ | ||||
"""Integration tests for Customer Decision Hierarchy Analysis with BigQuery.""" | ||||
|
||||
import pytest | ||||
|
||||
from pyretailscience.analysis.customer_decision_hierarchy import CustomerDecisionHierarchy | ||||
from pyretailscience.options import ColumnHelper | ||||
|
||||
cols = ColumnHelper() | ||||
|
||||
|
||||
@pytest.mark.parametrize( | ||||
("method", "min_var_explained", "exclude_same_transaction"), | ||||
[ | ||||
("truncated_svd", 0.7, False), | ||||
("truncated_svd", 0.7, None), | ||||
("truncated_svd", None, False), | ||||
("yules_q", 0.7, False), | ||||
("yules_q", 0.7, None), | ||||
("yules_q", None, False), | ||||
("yules_q", None, None), | ||||
], | ||||
) | ||||
Comment on lines
+11
to
+22
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't need to test "min_var_explained". It won't be related to Ibis functionality. Can you please remove it from the test? |
||||
def test_customer_decision_hierarchy_with_bigquery( | ||||
transactions_table, | ||||
method, | ||||
min_var_explained, | ||||
exclude_same_transaction, | ||||
): | ||||
"""Test CustomerDecisionHierarchy with data fetched from BigQuery. | ||||
|
||||
This parameterized test verifies that CustomerDecisionHierarchy can be initialized | ||||
and run with data from BigQuery using different combinations of product columns | ||||
and methods without throwing exceptions. | ||||
""" | ||||
transactions_df = transactions_table.limit(5000).execute() | ||||
|
||||
try: | ||||
CustomerDecisionHierarchy( | ||||
df=transactions_df, | ||||
product_col="product_name", | ||||
exclude_same_transaction_products=exclude_same_transaction, | ||||
method=method, | ||||
min_var_explained=min_var_explained if min_var_explained is not None else 0.8, | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can just leave this as the default
Suggested change
|
||||
) | ||||
except Exception as e: # noqa: BLE001 | ||||
pytest.fail(f"CustomerDecisionHierarchy failed with, method={method}: {e}") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You don't need both here. `uv sync` will also install the dev group.