
Commit f51ea78

feat: Setup GCP BigQuery Integration Tests for Analysis Modules
1 parent 446b876 commit f51ea78

17 files changed: 2,665 additions, 2 deletions

.env_sample

Lines changed: 1 addition & 0 deletions
GCP_PROJECT_ID=
Lines changed: 66 additions & 0 deletions
name: BigQuery Integration Tests

on:
  workflow_dispatch:
    inputs:
      test_suite:
        type: choice
        description: Test Suite to Run
        default: "all"
        options:
          - all
          - cohort_analysis
          - composite_rank
          - cross_shop
          - customer_decision_hierarchy
          - hml_segmentation
          - product_association
          - revenue_tree
          - rfm_segmentation
          - segstats_segmentation
          - threshold_segmentation

permissions:
  contents: read

concurrency:
  group: "bigquery-tests"
  cancel-in-progress: true

jobs:
  integration-tests:
    name: Run BigQuery Integration Tests
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install uv Package
        run: |
          pip install --upgrade pip
          pip install uv==0.5.30

      - name: Install Dependencies
        run: |
          uv sync
          uv sync --group dev

      - name: Set up GCP Authentication
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      - name: Run Integration Tests
        env:
          TEST_SUITE: ${{ inputs.test_suite }}
        run: |
          uv run pytest tests/integration/bigquery -v \
            $(if [ "$TEST_SUITE" != "all" ]; then echo "-k $TEST_SUITE"; fi)
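The final step assembles the pytest command dynamically: when a suite other than `all` is selected, the shell substitution appends a `-k` keyword filter (selecting `cohort_analysis`, for example, yields `uv run pytest tests/integration/bigquery -v -k cohort_analysis`); with the default `all`, the filter is omitted and the whole directory runs.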

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ repos:
     hooks:
       - id: pytest
         name: pytest
-        entry: uv run pytest --cov=pyretailscience --cov-report=xml --cov-branch tests
+        entry: uv run pytest --cov=pyretailscience --cov-report=xml --cov-branch tests --ignore=tests/integration/bigquery
         language: system
         types: [python]
         pass_filenames: false

README.md

Lines changed: 99 additions & 0 deletions
@@ -1,3 +1,4 @@
+<!-- README.md -->
 ![PyRetailScience Logo](https://raw.githubusercontent.com/Data-Simply/pyretailscience/main/readme_assets/logo.png)

 # PyRetailScience
@@ -208,3 +209,101 @@

## License

This project is licensed under the Elastic License 2.0 - see the [LICENSE](LICENSE) file for details.

## BigQuery Integration Tests

### Overview

The `tests/integration/bigquery` directory contains integration tests that verify all PyRetailScience analysis modules work correctly with Google BigQuery as a backend. These tests confirm that the Ibis-based code paths function correctly when connected to BigQuery.
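The shape of these tests is simple: connect with Ibis, grab the table, and hand it to an analysis class. A minimal sketch, assuming `GCP_PROJECT_ID` is set and the `test_data.transactions` table exists:

```python
import os

import ibis

from pyretailscience.analysis.cohort import CohortAnalysis

# Connect to BigQuery through Ibis; credentials come from
# GOOGLE_APPLICATION_CREDENTIALS, the project from GCP_PROJECT_ID.
conn = ibis.bigquery.connect(project_id=os.environ["GCP_PROJECT_ID"])
transactions = conn.table("test_data.transactions")

# Analysis classes accept the Ibis table in place of a pandas DataFrame;
# the heavy lifting is compiled to SQL and executed in BigQuery.
cohort = CohortAnalysis(
    df=transactions,
    aggregation_column="unit_spend",
    agg_func="nunique",
    period="month",
)
print(cohort.table.head())  # cohort.table materializes as a pandas DataFrame
```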
### Test Coverage

The integration tests cover the following analysis modules:

- **Cohort Analysis** - Tests customer cohort retention metrics
- **Cross Shop Analysis** - Tests product/category cross-shopping patterns
- **Customer Analysis** - Tests customer lifetime value and purchase frequency metrics
- **Gain Loss Analysis** - Tests comparative performance analysis
- **Haversine Analysis** - Tests geographic distance calculations
- **Product Association Analysis** - Tests market basket analysis
- **Customer Decision Hierarchy** - Tests customer purchase decision patterns
- **Revenue Tree Analysis** - Tests hierarchical revenue breakdowns
- **Composite Rank Analysis** - Tests weighted ranking of entities
- **Segmentation Analysis** - Tests RFM and value-frequency customer segmentation
### Prerequisites

To run these tests, you need:

1. Access to a Google Cloud Platform account
2. A service account with BigQuery permissions
3. The service account key JSON file
4. The test dataset loaded in BigQuery (dataset: `test_data`, table: `transactions`)
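A quick way to check items 2-4 before running the suite is to query the table metadata directly; a minimal sketch, assuming the environment variables from the next section are already set:

```python
import os

from google.cloud import bigquery

# Uses GOOGLE_APPLICATION_CREDENTIALS for auth and GCP_PROJECT_ID for the project.
client = bigquery.Client(project=os.environ["GCP_PROJECT_ID"])

# Raises NotFound if the dataset/table is missing, Forbidden if permissions are wrong.
table = client.get_table("test_data.transactions")
print(f"{table.full_table_id}: {table.num_rows} rows")
```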
### Running the Tests

#### Manual Setup

- Set up authentication:

  ```bash
  export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/service-account-key.json
  export GCP_PROJECT_ID=your-project-id
  ```

- Install dependencies:

  ```bash
  uv pip install -e .
  uv pip install "ibis-framework[bigquery]>=10.0.0,<11"
  ```

- Run the tests:

  ```bash
  # Run all tests
  uv run pytest tests/integration/bigquery -v

  # Run a specific test module
  uv run pytest tests/integration/bigquery/test_cohort_analysis.py -v
  ```
### Using GitHub Actions

These tests can be run manually in GitHub Actions via the "BigQuery Integration Tests" workflow. To run:

1. Go to the "Actions" tab in the GitHub repository
2. Select the "BigQuery Integration Tests" workflow
3. Click "Run workflow"
4. Choose a test suite from the dropdown (e.g., `cohort_analysis`), or keep the default `all`
5. Click "Run workflow"

#### Required Secrets

To run the workflow in GitHub Actions, add these secrets to your repository:

- `GCP_SA_KEY`: The entire JSON content of your GCP service account key file
- `GCP_PROJECT_ID`: Your GCP project ID
### Test Data

The tests expect a BigQuery dataset named `test_data` with a table named `transactions` containing the following columns:

- `transaction_id`
- `transaction_date`
- `transaction_time`
- `customer_id`
- `product_id`
- `product_name`
- `category_0_name`
- `category_0_id`
- `category_1_name`
- `category_1_id`
- `brand_name`
- `brand_id`
- `unit_quantity`
- `unit_cost`
- `unit_spend`
- `store_id`
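If the table does not exist yet, it can be created with the `google-cloud-bigquery` client. A sketch follows; the project ID is a placeholder and the column types are assumptions inferred from the names, so adjust both to match your source data:

```python
from google.cloud import bigquery

client = bigquery.Client(project="your-project-id")  # hypothetical project ID
client.create_dataset("test_data", exists_ok=True)

# Assumed types; only the column names are specified by the tests.
schema = [
    bigquery.SchemaField("transaction_id", "STRING"),
    bigquery.SchemaField("transaction_date", "DATE"),
    bigquery.SchemaField("transaction_time", "TIME"),
    bigquery.SchemaField("customer_id", "STRING"),
    bigquery.SchemaField("product_id", "STRING"),
    bigquery.SchemaField("product_name", "STRING"),
    bigquery.SchemaField("category_0_name", "STRING"),
    bigquery.SchemaField("category_0_id", "STRING"),
    bigquery.SchemaField("category_1_name", "STRING"),
    bigquery.SchemaField("category_1_id", "STRING"),
    bigquery.SchemaField("brand_name", "STRING"),
    bigquery.SchemaField("brand_id", "STRING"),
    bigquery.SchemaField("unit_quantity", "INTEGER"),
    bigquery.SchemaField("unit_cost", "FLOAT"),
    bigquery.SchemaField("unit_spend", "FLOAT"),
    bigquery.SchemaField("store_id", "STRING"),
]
table = bigquery.Table("your-project-id.test_data.transactions", schema=schema)
client.create_table(table, exists_ok=True)
```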

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -27,11 +27,13 @@ name = "Murray Vanwyk"
 [dependency-groups]
 dev = [
     "freezegun>=1.5.1,<2",
+    "ibis-framework[bigquery]>=10.0.0,<11",
     "nbstripout>=0.7.1,<0.8",
     "pre-commit>=3.6.2,<4",
     "pytest-cov>=4.1.0,<5",
     "pytest-mock>=3.14.0,<4",
     "pytest>=8.0.0,<9",
+    "python-dotenv>=1.0.0,<2",
     "ruff>=0.9,<0.10",
     "tomlkit>=0.12,<1",
 ]
Lines changed: 33 additions & 0 deletions
"""BigQuery integration test fixtures."""

import os

import ibis
import pytest
from dotenv import load_dotenv
from loguru import logger

# Pull GCP_PROJECT_ID (and friends) from a local .env file when present.
load_dotenv()


@pytest.fixture(scope="session")
def bigquery_connection():
    """Connect to BigQuery for integration tests."""
    try:
        conn = ibis.bigquery.connect(
            project_id=os.environ.get("GCP_PROJECT_ID"),
        )
        logger.info("Connected to BigQuery")
    except Exception as e:
        logger.error(f"Failed to connect to BigQuery: {e}")
        raise
    else:
        return conn


@pytest.fixture(scope="session")
def transactions_table(bigquery_connection):
    """Get the transactions table for testing."""
    return bigquery_connection.table("test_data.transactions")
Lines changed: 49 additions & 0 deletions
"""Integration tests for Cohort Analysis with BigQuery."""

import pandas as pd
import pytest

from pyretailscience.analysis.cohort import CohortAnalysis


class TestCohortAnalysisBigQuery:
    """Integration tests for Cohort Analysis using real BigQuery data."""

    def test_cohort_computation_bigquery(self, transactions_table):
        """Tests cohort computation logic using BigQuery data."""
        cohort = CohortAnalysis(
            df=transactions_table,
            aggregation_column="unit_spend",
            agg_func="nunique",
            period="month",
            percentage=False,
        )
        result = cohort.table
        assert not result.empty, "Cohort table should not be empty for valid BigQuery data"
        assert isinstance(result, pd.DataFrame)

    def test_invalid_period(self, transactions_table):
        """Test that an invalid period raises an error."""
        invalid_period = "m"
        with pytest.raises(
            ValueError,
            match=f"Invalid period '{invalid_period}'. Allowed values: {CohortAnalysis.VALID_PERIODS}",
        ):
            CohortAnalysis(
                df=transactions_table,
                aggregation_column="unit_spend",
                period=invalid_period,
            )

    def test_cohort_percentage(self, transactions_table):
        """Tests cohort analysis with percentage=True."""
        cohort = CohortAnalysis(
            df=transactions_table,
            aggregation_column="unit_spend",
            agg_func="sum",
            period="month",
            percentage=True,
        )
        result = cohort.table
        assert not result.empty
        assert result.max().max() <= 1.0, "Values should be <= 1 when percentage=True"
Lines changed: 113 additions & 0 deletions
"""Integration tests for Composite Rank Analysis with BigQuery."""

import pytest

from pyretailscience.analysis.composite_rank import CompositeRank


class TestCompositeRank:
    """Tests for the CompositeRank class."""

    @pytest.fixture(scope="class")
    def test_transactions_df(self, transactions_table):
        """Fetch test transactions data from BigQuery and convert it to a DataFrame.

        This fixture assumes a table with columns like product_id, spend, customers, etc.
        Modify the query and column names as per your actual BigQuery table structure.
        """
        df = transactions_table.to_pandas()

        if "spend_per_customer" not in df.columns:
            # Synthetic numeric column added purely to exercise ranking; dividing
            # spend by the customer ID has no business meaning.
            df["spend_per_customer"] = df["unit_spend"] / df["customer_id"]

        return df

    def test_composite_rank_with_bigquery_data(self, test_transactions_df):
        """Test CompositeRank functionality with real BigQuery data.

        This test demonstrates using CompositeRank with BigQuery-sourced data.
        """
        rank_cols = [
            ("unit_spend", "desc"),
            ("customer_id", "desc"),
            ("spend_per_customer", "desc"),
        ]

        cr = CompositeRank(
            df=test_transactions_df,
            rank_cols=rank_cols,
            agg_func="mean",
            ignore_ties=False,
        )

        assert "composite_rank" in cr.df.columns
        assert len(cr.df) > 0

        expected_rank_columns = [
            "unit_spend_rank",
            "customer_id_rank",
            "spend_per_customer_rank",
            "composite_rank",
        ]
        for col in expected_rank_columns:
            assert col in cr.df.columns

    def test_different_agg_functions_with_bigquery(self, test_transactions_df):
        """Test different aggregation functions with BigQuery data."""
        agg_functions = ["mean", "sum", "min", "max"]

        rank_cols = [
            ("unit_spend", "desc"),
            ("customer_id", "desc"),
            ("spend_per_customer", "desc"),
        ]

        for agg_func in agg_functions:
            cr = CompositeRank(
                df=test_transactions_df,
                rank_cols=rank_cols,
                agg_func=agg_func,
                ignore_ties=False,
            )

            assert "composite_rank" in cr.df.columns
            assert len(cr.df) > 0

    def test_ignore_ties_with_bigquery(self, test_transactions_df):
        """Test tie-breaking behavior with BigQuery data."""
        rank_cols = [("unit_spend", "desc")]

        cr_with_ties = CompositeRank(
            df=test_transactions_df,
            rank_cols=rank_cols,
            agg_func="mean",
            ignore_ties=False,
        )

        cr_no_ties = CompositeRank(
            df=test_transactions_df,
            rank_cols=rank_cols,
            agg_func="mean",
            ignore_ties=True,
        )

        assert "unit_spend_rank" in cr_with_ties.df.columns
        assert "unit_spend_rank" in cr_no_ties.df.columns

    def test_ibis_table_input(self, transactions_table):
        """Explicitly test Ibis table input for CompositeRank."""
        cr = CompositeRank(
            df=transactions_table,
            rank_cols=[("unit_spend", "desc"), ("customer_id", "desc")],
            agg_func="mean",
            ignore_ties=False,
        )

        expected_columns = [
            "unit_spend_rank",
            "customer_id_rank",
            "composite_rank",
        ]

        for col in expected_columns:
            assert col in cr.df.columns