Merge branch 'zaristei/bidirectional-sampler' of https://github.com/zaristei/pytorch_geometric into zaristei/bidirectional-sampler

zaristei · zaristei · commit ba96098cb86e · 2025-04-14T20:02:34.000-07:00
diff --git a/.github/workflows/rag_testing.yml b/.github/workflows/rag_testing.yml
@@ -0,0 +1,51 @@
+name: Testing RAG on PyTorch 2.5
+
+on:  # yamllint disable-line rule:truthy
+  push:
+    branches:
+      - master
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ startsWith(github.ref, 'refs/pull/') || github.run_number }}  # yamllint disable-line
+  # Only cancel intermediate builds if on a PR:
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
+jobs:
+
+  rag_pytest:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 40
+
+      # Only run the remaining steps if certain files have been changed.
+      - name: Get changed files
+        id: changed-files-specific-rag
+        uses: tj-actions/changed-files@v41
+        with:
+          files: |
+            torch_geometric/datasets/web_qsp_dataset.py
+            torch_geometric/nn/nlp/**
+            torch_geometric/nn/models/g_retriever.py
+            torch_geometric/loader/rag_loader.py
+
+      - name: Setup packages
+        if: steps.changed-files-specific-rag.outputs.any_changed == 'true'
+        uses: ./.github/actions/setup
+        with:
+          full_install: false
+
+      - name: Install main package
+        if: steps.changed-files-specific-rag.outputs.any_changed == 'true'
+        run: |
+          pip install -e .[test,rag]
+
+      - name: Run tests
+        if: steps.changed-files-specific-rag.outputs.any_changed == 'true'
+        timeout-minutes: 10
+        run: |
+          RAG_TEST=1 pytest -m rag
diff --git a/.github/workflows/testing_full.yml b/.github/workflows/testing_full.yml
@@ -63,13 +63,18 @@ jobs:
         run: |
           sudo apt-get install graphviz
 
-      - name: Install mpmath
-        if: ${{ matrix.torch-version == 'nightly' }}
+      - name: Install main package (torch!=nightly)
+        if: ${{ matrix.torch-version != 'nightly' }}
         run: |
-          pip install mpmath==1.3.0
+          echo "torch==${{ matrix.torch-version }}" > requirements-constraint.txt
+          pip install -e ".[full,test]" --constraint requirements-constraint.txt
+          python -c "import torch; print('PyTorch:', torch.__version__)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+          python -c "import torch; print('CUDA:', torch.version.cuda)"
         shell: bash
 
-      - name: Install main package
+      - name: Install main package (torch==nightly)
+        if: ${{ matrix.torch-version == 'nightly' }}
         run: |
           pip install -e ".[full,test]"
           python -c "import torch; print('PyTorch:', torch.__version__)"
@@ -80,7 +85,7 @@ jobs:
       - name: Run tests
         timeout-minutes: 20
         run: |
-          FULL_TEST=1 pytest --cov --cov-report=xml
+          FULL_TEST=1 pytest --cov --cov-report=xml --durations 10
         shell: bash
 
       - name: Upload coverage
diff --git a/pyproject.toml b/pyproject.toml
@@ -58,6 +58,14 @@ benchmark=[
     "protobuf<4.21",
     "wandb",
 ]
+rag=[
+    "pcst_fast",
+    "datasets",
+    "transformers",
+    "pandas",
+    "sentencepiece",
+    "accelerate",
+]
 test=[
     "onnx",
     "onnxruntime",
@@ -192,6 +200,9 @@ filterwarnings = [
     # Filter `pytorch_lightning` warnings:
     "ignore:GPU available but not used:UserWarning",
 ]
+markers = [
+    "rag: mark test as RAG test",
+]
 
 [tool.coverage.run]
 source = ["torch_geometric"]
diff --git a/test/datasets/test_web_qsp_dataset.py b/test/datasets/test_web_qsp_dataset.py
@@ -0,0 +1,201 @@
+import os
+import random
+import string
+
+import pytest
+
+from torch_geometric.datasets import WebQSPDataset
+from torch_geometric.datasets.web_qsp_dataset import KGQABaseDataset
+from torch_geometric.testing import (
+    onlyFullTest,
+    onlyOnline,
+    onlyRAG,
+    withPackage,
+)
+
+
+@pytest.mark.skip(reason="Times out")
+@onlyOnline
+@onlyFullTest
+def test_web_qsp_dataset(tmp_path):
+    dataset = WebQSPDataset(root=tmp_path)
+    # Split for this dataset is 2826 train | 246 val | 1628 test
+    # default split is train
+    assert len(dataset) == 2826
+    assert str(dataset) == "WebQSPDataset(2826)"
+
+    dataset_train = WebQSPDataset(root=tmp_path, split="train")
+    assert len(dataset_train) == 2826
+    assert str(dataset_train) == "WebQSPDataset(2826)"
+
+    dataset_val = WebQSPDataset(root=tmp_path, split="val")
+    assert len(dataset_val) == 246
+    assert str(dataset_val) == "WebQSPDataset(246)"
+
+    dataset_test = WebQSPDataset(root=tmp_path, split="test")
+    assert len(dataset_test) == 1628
+    assert str(dataset_test) == "WebQSPDataset(1628)"
+
+
+class MockSentenceTransformer:
+    """Stand-in sentence encoder that returns constant embeddings.
+
+    `encode` yields a 1024-dimensional tensor of ones per sentence and
+    ignores `batch_size` and `output_device`; `to` and `eval` return `self`.
+    """
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def to(self, device):
+        return self
+
+    def eval(self):
+        return self
+
+    def encode(self, sentences, batch_size=None, output_device=None):
+        import torch
+
+        def string_to_tensor(s: str) -> torch.Tensor:
+            return torch.ones(1024).float()
+
+        # A single string maps to a 1-D tensor; any other input is iterated
+        # and the per-item tensors are stacked along a new first dimension.
+        if isinstance(sentences, str):
+            return string_to_tensor(sentences)
+        return torch.stack([string_to_tensor(s) for s in sentences])
+
+
+def create_mock_graphs(tmp_path: str, train_size: int, val_size: int,
+                       test_size: int, num_nodes: int, num_edge_types: int,
+                       num_trips: int, seed: int = 42):
+    random.seed(seed)
+    strkeys = string.ascii_letters + string.digits
+    qa_strkeys = string.ascii_letters + string.digits + " "
+
+    def create_mock_triplets(num_nodes: int, num_edges: int, num_trips: int):
+        """Return `num_trips` random (head, relation, tail) name triplets."""
+        nodes = list(
+            {"".join(random.sample(strkeys, 10))
+             for _ in range(num_nodes)})
+        edges = list(
+            {"".join(random.sample(strkeys, 10))
+             for _ in range(num_edges)})
+        triplets = []
+        for _ in range(num_trips):
+            h = random.randint(0, num_nodes - 1)
+            t = random.randint(0, num_nodes - 1)
+            r = random.randint(0, num_edges - 1)  # not the outer variable
+            triplets.append((nodes[h], edges[r], nodes[t]))
+        return triplets
+
+    train_triplets = [
+        create_mock_triplets(num_nodes, num_edge_types, num_trips)
+        for _ in range(train_size)
+    ]
+    val_triplets = [
+        create_mock_triplets(num_nodes, num_edge_types, num_trips)
+        for _ in range(val_size)
+    ]
+    test_triplets = [
+        create_mock_triplets(num_nodes, num_edge_types, num_trips)
+        for _ in range(test_size)
+    ]
+
+    train_questions = [
+        "".join(random.sample(qa_strkeys, 10)) for _ in range(train_size)
+    ]
+    val_questions = [
+        "".join(random.sample(qa_strkeys, 10)) for _ in range(val_size)
+    ]
+    test_questions = [
+        "".join(random.sample(qa_strkeys, 10)) for _ in range(test_size)
+    ]
+
+    train_answers = [
+        "".join(random.sample(qa_strkeys, 10)) for _ in range(train_size)
+    ]
+    val_answers = [
+        "".join(random.sample(qa_strkeys, 10)) for _ in range(val_size)
+    ]
+    test_answers = [
+        "".join(random.sample(qa_strkeys, 10)) for _ in range(test_size)
+    ]
+
+    train_graphs = {
+        "graph": train_triplets,
+        "question": train_questions,
+        "answer": train_answers
+    }
+    val_graphs = {
+        "graph": val_triplets,
+        "question": val_questions,
+        "answer": val_answers
+    }
+    test_graphs = {
+        "graph": test_triplets,
+        "question": test_questions,
+        "answer": test_answers
+    }
+
+    from datasets import Dataset, DatasetDict, load_from_disk
+
+    ds_train = Dataset.from_dict(train_graphs, split="train")
+    ds_val = Dataset.from_dict(val_graphs, split="validation")
+    ds_test = Dataset.from_dict(test_graphs, split="test")
+
+    ds = DatasetDict({
+        "train": ds_train,
+        "validation": ds_val,
+        "test": ds_test
+    })
+
+    def mock_load_dataset(name: str):
+        # Save the dataset and then load it to emulate downloading from HF
+        DATASET_CACHE_DIR = os.path.join(tmp_path,
+                                         ".cache/huggingface/datasets", name)
+        os.makedirs(DATASET_CACHE_DIR, exist_ok=True)
+
+        ds.save_to_disk(DATASET_CACHE_DIR)
+        dataset_remote = load_from_disk(DATASET_CACHE_DIR)
+        return dataset_remote
+
+    return mock_load_dataset, ds
+
+
+@onlyRAG
+@withPackage("datasets", "pandas")
+def test_kgqa_base_dataset(tmp_path, monkeypatch):
+    """Check `KGQABaseDataset` split sizes and per-graph tensor shapes.
+
+    `datasets.load_dataset` and the `SentenceTransformer` name in
+    `torch_geometric.datasets.web_qsp_dataset` are monkeypatched with local
+    mocks before the datasets are constructed.
+    """
+
+    num_nodes = 500
+    num_edge_types = 25
+    num_trips = 5000
+
+    # Mock the dataset graphs
+    mock_load_dataset_func, expected_result = create_mock_graphs(
+        tmp_path, train_size=10, val_size=5, test_size=5, num_nodes=num_nodes,
+        num_edge_types=num_edge_types, num_trips=num_trips)
+
+    import datasets
+
+    monkeypatch.setattr(datasets, "load_dataset", mock_load_dataset_func)
+
+    # Mock the SentenceTransformer
+    import torch_geometric.datasets.web_qsp_dataset
+    monkeypatch.setattr(torch_geometric.datasets.web_qsp_dataset,
+                        "SentenceTransformer", MockSentenceTransformer)
+
+    dataset_train = KGQABaseDataset(root=tmp_path, dataset_name="TestDataset",
+                                    split="train", use_pcst=False)
+    assert len(dataset_train) == 10
+    assert str(dataset_train) == "KGQABaseDataset(10)"
+    for graph in dataset_train:
+        assert graph.x.shape == (num_nodes, 1024)
+        assert graph.edge_index.shape == (2, num_trips)
+        assert graph.edge_attr.shape == (
+            num_trips, 1024)  # Reminder: edge_attr encodes the entire triplet
+
+    dataset_val = KGQABaseDataset(root=tmp_path, dataset_name="TestDataset",
+                                  split="val", use_pcst=False)
+    assert len(dataset_val) == 5
+    assert str(dataset_val) == "KGQABaseDataset(5)"
+
+    dataset_test = KGQABaseDataset(root=tmp_path, dataset_name="TestDataset",
+                                   split="test", use_pcst=False)
+    assert len(dataset_test) == 5
+    assert str(dataset_test) == "KGQABaseDataset(5)"
+
+    # TODO(zaristei): More rigorous tests to validate that values are correct
+    # TODO(zaristei): Proper tests for PCST and CWQ
diff --git a/test/nn/models/test_g_retriever.py b/test/nn/models/test_g_retriever.py
@@ -2,10 +2,10 @@
 
 from torch_geometric.nn import GAT, GRetriever
 from torch_geometric.nn.nlp import LLM
-from torch_geometric.testing import onlyFullTest, withPackage
+from torch_geometric.testing import onlyRAG, withPackage
 
 
-@onlyFullTest
+@onlyRAG
 @withPackage('transformers', 'sentencepiece', 'accelerate')
 def test_g_retriever() -> None:
     llm = LLM(
@@ -53,7 +53,7 @@ def test_g_retriever() -> None:
     assert len(pred) == 1
 
 
-@onlyFullTest
+@onlyRAG
 @withPackage('transformers', 'sentencepiece', 'accelerate')
 def test_g_retriever_many_tokens() -> None:
     llm = LLM(
diff --git a/test/nn/models/test_gpse.py b/test/nn/models/test_gpse.py
@@ -21,7 +21,7 @@ def test_gpse_training():
 
     data = Data(x=x, y=y, edge_index=edge_index)
     data = VirtualNode()(data)
-    data.y_graph = torch.tensor(torch.randn(11))
+    data.y_graph = torch.randn(11)
 
     batch = Batch.from_data_list([data])
     model = GPSE()
diff --git a/test/nn/nlp/test_llm.py b/test/nn/nlp/test_llm.py
@@ -2,10 +2,10 @@
 from torch import Tensor
 
 from torch_geometric.nn.nlp import LLM
-from torch_geometric.testing import onlyFullTest, withPackage
+from torch_geometric.testing import onlyRAG, withPackage
 
 
-@onlyFullTest
+@onlyRAG
 @withPackage('transformers', 'accelerate')
 def test_llm() -> None:
     question = ["Is PyG the best open-source GNN library?"]
diff --git a/test/nn/nlp/test_sentence_transformer.py b/test/nn/nlp/test_sentence_transformer.py
@@ -1,11 +1,11 @@
 import pytest
 
 from torch_geometric.nn.nlp import SentenceTransformer
-from torch_geometric.testing import onlyFullTest, withCUDA, withPackage
+from torch_geometric.testing import onlyRAG, withCUDA, withPackage
 
 
 @withCUDA
-@onlyFullTest
+@onlyRAG
 @withPackage('transformers')
 @pytest.mark.parametrize('batch_size', [None, 1])
 @pytest.mark.parametrize('pooling_strategy', ['mean', 'last', 'cls'])
diff --git a/torch_geometric/data/large_graph_indexer.py b/torch_geometric/data/large_graph_indexer.py
@@ -22,6 +22,7 @@
 from tqdm import tqdm
 
 from torch_geometric.data import Data
+from torch_geometric.io import fs
 from torch_geometric.typing import WITH_PT24
 
 # Could be any hashable type
@@ -505,13 +506,13 @@ def from_disk(cls, path: str) -> "LargeGraphIndexer":
         for fname in os.listdir(node_attr_path):
             full_fname = f"{node_attr_path}/{fname}"
             key = fname.split(".")[0]
-            indexer.node_attr[key] = torch.load(full_fname)
+            indexer.node_attr[key] = fs.torch_load(full_fname)
 
         edge_attr_path = path + "/edge_attr"
         for fname in os.listdir(edge_attr_path):
             full_fname = f"{edge_attr_path}/{fname}"
             key = fname.split(".")[0]
-            indexer.edge_attr[key] = torch.load(full_fname)
+            indexer.edge_attr[key] = fs.torch_load(full_fname)
 
         return indexer
 
diff --git a/torch_geometric/testing/__init__.py b/torch_geometric/testing/__init__.py
diff --git a/torch_geometric/testing/decorators.py b/torch_geometric/testing/decorators.py