From ef32b844a4731c838d670996bdf37604456fa210 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 22 Sep 2023 13:17:35 -0700 Subject: [PATCH 001/197] Multinode-multigpu Papers100m `GCN` example --- .../multinode-multigpu-papers100m-gcn.py | 219 ++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 examples/multi_gpu/multinode-multigpu-papers100m-gcn.py diff --git a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py new file mode 100644 index 000000000000..ca06bc1a48a5 --- /dev/null +++ b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py @@ -0,0 +1,219 @@ +''' +in terminal 1: +srun --overlap -A -p interactive -J -N 2 -t 02:00:00 --pty bash + +in terminal 2: +squeue -u +then +export jobid=<> + +then + +srun -l -N2 --ntasks-per-node=1 --overlap --jobid=$jobid \ +--container-image= --container-name=cont \ +--container-mounts=/ogb-papers100m/:/workspace/dataset true + +srun -l -N2 --ntasks-per-node=3 --overlap --jobid=$jobid \ +--container-name=cont-rp-9-22 \ +--container-mounts=/lustre/fsw/dlfw/dlfw-pyg/riship/ogb-papers100m/:/workspace/dataset/ \ +python3 multinode-papers100m-gcn.py --ngpu_per_node 3 + + + +Results: + +Data = Data(num_nodes=111059956, edge_index=[2, 1615685872], x=[111059956, 128], node_year=[111059956, 1], y=[111059956]) +Using 6 GPUs... +Beginning training... +Epoch: 0, Iteration: 1570, Loss: tensor(2.3715, device='cuda:0', grad_fn=) +Average Training Iteration Time: 0.00961135015664843 s/iter +Validation Accuracy: 40.0000% +Average Inference Iteration Time: 0.05930390887790256 s/iter +Epoch: 1, Iteration: 1570, Loss: tensor(2.3669, device='cuda:0', grad_fn=) +Average Training Iteration Time: 0.009306145814043982 s/iter +Validation Accuracy: 40.0781% +Average Inference Iteration Time: 0.05451994472079807 s/iter +Epoch: 2, Iteration: 1570, Loss: tensor(2.3352, device='cuda:0', grad_fn=) +Average Training Iteration Time: 0.009185380617981152 s/iter +Validation Accuracy: 40.0234% +Average Inference Iteration Time: 0.06007050673166911 s/iter +Test Accuracy: 24.8861% + +''' +import argparse +import os +import time + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn.functional as F +from ogb.nodeproppred import PygNodePropPredDataset +from torch.nn.parallel import DistributedDataParallel +from torchmetrics import Accuracy + +from torch_geometric.loader import NeighborLoader +from torch_geometric.nn import GCNConv +import warnings + +warnings.filterwarnings("ignore") + + +def pyg_num_work(): + num_work = None + if hasattr(os, "sched_getaffinity"): + try: + num_work = len(os.sched_getaffinity(0)) / 2 + except Exception: + pass + if num_work is None: + num_work = os.cpu_count() / 2 + return int(num_work) + +_LOCAL_PROCESS_GROUP = None + +def create_local_process_group(num_workers_per_node): + global _LOCAL_PROCESS_GROUP + assert _LOCAL_PROCESS_GROUP is None + world_size = dist.get_world_size() if dist.is_initialized() else 1 + rank = dist.get_rank() if dist.is_initialized() else 0 + assert world_size % num_workers_per_node == 0 + + num_nodes = world_size // num_workers_per_node + node_rank = rank // num_workers_per_node + for i in range(num_nodes): + ranks_on_i = list(range(i * num_workers_per_node, (i + 1) * num_workers_per_node)) + pg = dist.new_group(ranks_on_i) + if i == node_rank: + _LOCAL_PROCESS_GROUP = pg + +def get_local_process_group(): + assert _LOCAL_PROCESS_GROUP is not None + return _LOCAL_PROCESS_GROUP + + +class 
GCN(torch.nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels): + super().__init__() + self.conv1 = GCNConv(in_channels, hidden_channels) + self.conv2 = GCNConv(hidden_channels, out_channels) + + def forward(self, x, edge_index, edge_weight=None): + x = F.dropout(x, p=0.5, training=self.training) + x = self.conv1(x, edge_index, edge_weight).relu() + x = F.dropout(x, p=0.5, training=self.training) + x = self.conv2(x, edge_index, edge_weight) + return x + + +def run_train(device, data, world_size, model, epochs, batch_size, fan_out, + split_idx, num_classes): + local_group = get_local_process_group() + loc_id = dist.get_rank(group=local_group) + rank = torch.distributed.get_rank() + os.environ['NVSHMEM_SYMMETRIC_SIZE'] = "107374182400" + if rank == 0: + print("Data =", data) + print('Using', nprocs, 'GPUs...') + split_idx['train'] = split_idx['train'].split( + split_idx['train'].size(0) // world_size, dim=0)[rank].clone() + model = model.to(device) + model = DistributedDataParallel(model, device_ids=[loc_id]) + optimizer = torch.optim.Adam(model.parameters(), lr=0.01, + weight_decay=0.0005) + train_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], + input_nodes=split_idx['train'], + batch_size=batch_size, + num_workers=pyg_num_work()) + if rank == 0: + eval_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], + input_nodes=split_idx['valid'], + batch_size=batch_size, + num_workers=pyg_num_work()) + test_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], + input_nodes=split_idx['test'], + batch_size=batch_size, + num_workers=pyg_num_work()) + eval_steps = 100 + acc = Accuracy(task="multiclass", num_classes=num_classes).to(device) + if rank == 0: + print("Beginning training...") + for epoch in range(epochs): + for i, batch in enumerate(train_loader): + if i >= 10: + start = time.time() + batch = batch.to(device) + batch.y = batch.y.to(torch.long) + optimizer.zero_grad() + out = model(batch.x, batch.edge_index) + loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) + loss.backward() + optimizer.step() + if rank == 0 and i % 10 == 0: + print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + + ", Loss: " + str(loss)) + if rank == 0: + print("Average Training Iteration Time:", + (time.time() - start) / (i - 10), "s/iter") + acc_sum = 0.0 + with torch.no_grad(): + for i, batch in enumerate(eval_loader): + if i >= eval_steps: + break + if i >= 10: + start = time.time() + batch = batch.to(device) + batch.y = batch.y.to(torch.long) + out = model(batch.x, batch.edge_index) + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + print("Average Inference Iteration Time:", + (time.time() - start) / (i - 10), "s/iter") + if rank == 0: + acc_sum = 0.0 + with torch.no_grad(): + for i, batch in enumerate(test_loader): + batch = batch.to(device) + batch.y = batch.y.to(torch.long) + out = model(batch.x, batch.edge_index) + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--hidden_channels', type=int, default=64) + parser.add_argument('--lr', type=float, default=0.01) + parser.add_argument('--epochs', type=int, default=3) + parser.add_argument('--batch_size', type=int, default=128) + parser.add_argument('--fan_out', type=int, default=50) + parser.add_argument( + "--ngpu_per_node", + 
type=int, + default="1", + help="number of GPU(s) for each node for multi-gpu training,", + ) + args = parser.parse_args() + # setup multi node + torch.distributed.init_process_group("nccl") + nprocs = dist.get_world_size() + create_local_process_group(args.ngpu_per_node) + local_group = get_local_process_group() + device_id = dist.get_rank(group=local_group) if dist.is_initialized() else 0 + torch.cuda.set_device(device_id) + device = torch.device(device_id) + all_pids = torch.zeros(dist.get_world_size(), dtype=torch.int64).to(device) + all_pids[dist.get_rank()] = os.getpid() + dist.all_reduce(all_pids) + + dataset = PygNodePropPredDataset(name='ogbn-papers100M') + split_idx = dataset.get_idx_split() + + data = dataset[0] + data.y = data.y.reshape(-1) + model = GCN(dataset.num_features, args.hidden_channels, + dataset.num_classes) + run_train(device, data, nprocs, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes) From 8540ec08967b69d80c715cf8b7e292061f72706b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 22 Sep 2023 20:19:24 +0000 Subject: [PATCH 002/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multinode-multigpu-papers100m-gcn.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py index ca06bc1a48a5..21848667f8a7 100644 --- a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py +++ b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py @@ -43,6 +43,7 @@ import argparse import os import time +import warnings import torch import torch.distributed as dist @@ -54,7 +55,6 @@ from torch_geometric.loader import NeighborLoader from torch_geometric.nn import GCNConv -import warnings warnings.filterwarnings("ignore") @@ -70,8 +70,10 @@ def pyg_num_work(): num_work = os.cpu_count() / 2 return int(num_work) + _LOCAL_PROCESS_GROUP = None + def create_local_process_group(num_workers_per_node): global _LOCAL_PROCESS_GROUP assert _LOCAL_PROCESS_GROUP is None @@ -82,11 +84,13 @@ def create_local_process_group(num_workers_per_node): num_nodes = world_size // num_workers_per_node node_rank = rank // num_workers_per_node for i in range(num_nodes): - ranks_on_i = list(range(i * num_workers_per_node, (i + 1) * num_workers_per_node)) + ranks_on_i = list( + range(i * num_workers_per_node, (i + 1) * num_workers_per_node)) pg = dist.new_group(ranks_on_i) if i == node_rank: _LOCAL_PROCESS_GROUP = pg + def get_local_process_group(): assert _LOCAL_PROCESS_GROUP is not None return _LOCAL_PROCESS_GROUP @@ -198,10 +202,11 @@ def run_train(device, data, world_size, model, epochs, batch_size, fan_out, args = parser.parse_args() # setup multi node torch.distributed.init_process_group("nccl") - nprocs = dist.get_world_size() + nprocs = dist.get_world_size() create_local_process_group(args.ngpu_per_node) local_group = get_local_process_group() - device_id = dist.get_rank(group=local_group) if dist.is_initialized() else 0 + device_id = dist.get_rank( + group=local_group) if dist.is_initialized() else 0 torch.cuda.set_device(device_id) device = torch.device(device_id) all_pids = torch.zeros(dist.get_world_size(), dtype=torch.int64).to(device) @@ -216,4 +221,4 @@ def run_train(device, data, world_size, model, epochs, batch_size, fan_out, model = GCN(dataset.num_features, args.hidden_channels, 
dataset.num_classes) run_train(device, data, nprocs, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes) + args.fan_out, split_idx, dataset.num_classes) From 7f03b57cb8333af910dd7124e10964ca004bdb36 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 22 Sep 2023 14:22:25 -0700 Subject: [PATCH 003/197] Update multinode-multigpu-papers100m-gcn.py --- examples/multi_gpu/multinode-multigpu-papers100m-gcn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py index 21848667f8a7..d5fcef602608 100644 --- a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py +++ b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py @@ -40,6 +40,8 @@ Test Accuracy: 24.8861% ''' + + import argparse import os import time From 8693427891fa9817f11dda27bbcbf9f0aac6ac9e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 22 Sep 2023 21:23:16 +0000 Subject: [PATCH 004/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multinode-multigpu-papers100m-gcn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py index d5fcef602608..383481df4443 100644 --- a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py +++ b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py @@ -41,7 +41,6 @@ ''' - import argparse import os import time From cc4ba562d73cc1fd6a7fa03845d84bb4b48ea531 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 22 Sep 2023 15:01:39 -0700 Subject: [PATCH 005/197] Update papers100m_multigpu.py --- examples/multi_gpu/papers100m_multigpu.py | 27 ++++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/examples/multi_gpu/papers100m_multigpu.py b/examples/multi_gpu/papers100m_multigpu.py index 7a368357d4aa..52d69512f9ea 100644 --- a/examples/multi_gpu/papers100m_multigpu.py +++ b/examples/multi_gpu/papers100m_multigpu.py @@ -14,15 +14,15 @@ from torch_geometric.nn import GCNConv -def pyg_num_work(): +def pyg_num_work(world_size): num_work = None if hasattr(os, "sched_getaffinity"): try: - num_work = len(os.sched_getaffinity(0)) / 2 + num_work = len(os.sched_getaffinity(0)) / (2 * world_size) except Exception: pass if num_work is None: - num_work = os.cpu_count() / 2 + num_work = os.cpu_count() / (2 * world_size) return int(num_work) @@ -51,26 +51,31 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, model = DistributedDataParallel(model, device_ids=[rank]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) + num_work = pyg_num_work(world_size) train_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['train'], batch_size=batch_size, - num_workers=pyg_num_work()) + shuffle=True, + num_workers=num_work) if rank == 0: eval_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['valid'], batch_size=batch_size, - num_workers=pyg_num_work()) + shuffle=True + num_workers=num_work) test_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['test'], batch_size=batch_size, - num_workers=pyg_num_work()) - eval_steps = 100 + shuffle=False, + num_workers=num_work) + eval_steps = 1000 + warmup_steps = 100 acc = Accuracy(task="multiclass", 
num_classes=num_classes).to(rank) if rank == 0: print("Beginning training...") for epoch in range(epochs): for i, batch in enumerate(train_loader): - if i >= 10: + if i >= warmup_steps: start = time.time() batch = batch.to(rank) batch.y = batch.y.to(torch.long) @@ -84,13 +89,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, ", Loss: " + str(loss)) if rank == 0: print("Average Training Iteration Time:", - (time.time() - start) / (i - 10), "s/iter") + (time.time() - start) / (i - warmup_steps), "s/iter") acc_sum = 0.0 with torch.no_grad(): for i, batch in enumerate(eval_loader): if i >= eval_steps: break - if i >= 10: + if i >= warmup_steps: start = time.time() batch = batch.to(rank) batch.y = batch.y.to(torch.long) @@ -99,7 +104,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y[:batch_size]) print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) print("Average Inference Iteration Time:", - (time.time() - start) / (i - 10), "s/iter") + (time.time() - start) / (i - warmup_steps), "s/iter") if rank == 0: acc_sum = 0.0 with torch.no_grad(): From 8341d125e2f68e3ad3e2f9b8a05e047cc9a5c437 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 22 Sep 2023 15:05:05 -0700 Subject: [PATCH 006/197] Rename papers100m_multigpu.py to multigpu_papers100m_gcn.py --- .../{papers100m_multigpu.py => multigpu_papers100m_gcn.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/multi_gpu/{papers100m_multigpu.py => multigpu_papers100m_gcn.py} (100%) diff --git a/examples/multi_gpu/papers100m_multigpu.py b/examples/multi_gpu/multigpu_papers100m_gcn.py similarity index 100% rename from examples/multi_gpu/papers100m_multigpu.py rename to examples/multi_gpu/multigpu_papers100m_gcn.py From 4f6949fe404a93b10756de92fa830e6b3c42b711 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 22 Sep 2023 15:11:33 -0700 Subject: [PATCH 007/197] optimization updates --- .../multinode-multigpu-papers100m-gcn.py | 75 ++++++++++--------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py index 383481df4443..9bde2d61eef1 100644 --- a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py +++ b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py @@ -1,4 +1,5 @@ ''' + in terminal 1: srun --overlap -A -p interactive -J -N 2 -t 02:00:00 --pty bash @@ -25,26 +26,29 @@ Data = Data(num_nodes=111059956, edge_index=[2, 1615685872], x=[111059956, 128], node_year=[111059956, 1], y=[111059956]) Using 6 GPUs... Beginning training... 
-Epoch: 0, Iteration: 1570, Loss: tensor(2.3715, device='cuda:0', grad_fn=) -Average Training Iteration Time: 0.00961135015664843 s/iter -Validation Accuracy: 40.0000% -Average Inference Iteration Time: 0.05930390887790256 s/iter -Epoch: 1, Iteration: 1570, Loss: tensor(2.3669, device='cuda:0', grad_fn=) -Average Training Iteration Time: 0.009306145814043982 s/iter -Validation Accuracy: 40.0781% -Average Inference Iteration Time: 0.05451994472079807 s/iter -Epoch: 2, Iteration: 1570, Loss: tensor(2.3352, device='cuda:0', grad_fn=) -Average Training Iteration Time: 0.009185380617981152 s/iter -Validation Accuracy: 40.0234% -Average Inference Iteration Time: 0.06007050673166911 s/iter -Test Accuracy: 24.8861% + +Epoch: 0, Iteration: 1570, Loss: tensor(2.7372, device='cuda:0', grad_fn=) +Average Training Iteration Time: 0.0022558025027757116 s/iter +Validation Accuracy: 33.1712% +Average Inference Iteration Time: 0.002441989262174637 s/iter + +Epoch: 1, Iteration: 1570, Loss: tensor(2.6074, device='cuda:0', grad_fn=) +Average Training Iteration Time: 0.002187901319104231 s/iter +Validation Accuracy: 32.2733% +Average Inference Iteration Time: 0.002225210835015855 s/iter + +Epoch: 2, Iteration: 1570, Loss: tensor(2.5593, device='cuda:0', grad_fn=) +Average Training Iteration Time: 0.002199090496994302 s/iter +Validation Accuracy: 33.9588% +Average Inference Iteration Time: 0.003229572181006499 s/iter +Test Accuracy: 24.5902% ''' + import argparse import os import time -import warnings import torch import torch.distributed as dist @@ -56,25 +60,24 @@ from torch_geometric.loader import NeighborLoader from torch_geometric.nn import GCNConv +import warnings warnings.filterwarnings("ignore") -def pyg_num_work(): +def pyg_num_work(ngpu_per_node): num_work = None if hasattr(os, "sched_getaffinity"): try: - num_work = len(os.sched_getaffinity(0)) / 2 + num_work = len(os.sched_getaffinity(0)) / (2 * ngpu_per_node) except Exception: pass if num_work is None: - num_work = os.cpu_count() / 2 + num_work = os.cpu_count() / (2 * ngpu_per_node) return int(num_work) - _LOCAL_PROCESS_GROUP = None - def create_local_process_group(num_workers_per_node): global _LOCAL_PROCESS_GROUP assert _LOCAL_PROCESS_GROUP is None @@ -85,13 +88,11 @@ def create_local_process_group(num_workers_per_node): num_nodes = world_size // num_workers_per_node node_rank = rank // num_workers_per_node for i in range(num_nodes): - ranks_on_i = list( - range(i * num_workers_per_node, (i + 1) * num_workers_per_node)) + ranks_on_i = list(range(i * num_workers_per_node, (i + 1) * num_workers_per_node)) pg = dist.new_group(ranks_on_i) if i == node_rank: _LOCAL_PROCESS_GROUP = pg - def get_local_process_group(): assert _LOCAL_PROCESS_GROUP is not None return _LOCAL_PROCESS_GROUP @@ -111,7 +112,7 @@ def forward(self, x, edge_index, edge_weight=None): return x -def run_train(device, data, world_size, model, epochs, batch_size, fan_out, +def run_train(device, data, world_size, ngpu_per_node, model, epochs, batch_size, fan_out, split_idx, num_classes): local_group = get_local_process_group() loc_id = dist.get_rank(group=local_group) @@ -126,26 +127,31 @@ def run_train(device, data, world_size, model, epochs, batch_size, fan_out, model = DistributedDataParallel(model, device_ids=[loc_id]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) + num_work = pyg_num_work(ngpu_per_node) train_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['train'], batch_size=batch_size, - 
num_workers=pyg_num_work()) + shuffle=True, + num_workers=num_work) if rank == 0: eval_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['valid'], batch_size=batch_size, - num_workers=pyg_num_work()) + shuffle=True, + num_workers=num_work) test_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['test'], batch_size=batch_size, - num_workers=pyg_num_work()) - eval_steps = 100 + shuffle=False, + num_workers=num_work) + eval_steps = 1000 + warmup_steps = 100 acc = Accuracy(task="multiclass", num_classes=num_classes).to(device) if rank == 0: print("Beginning training...") for epoch in range(epochs): for i, batch in enumerate(train_loader): - if i >= 10: + if i >= warmup_steps: start = time.time() batch = batch.to(device) batch.y = batch.y.to(torch.long) @@ -165,7 +171,7 @@ def run_train(device, data, world_size, model, epochs, batch_size, fan_out, for i, batch in enumerate(eval_loader): if i >= eval_steps: break - if i >= 10: + if i >= warmup_steps: start = time.time() batch = batch.to(device) batch.y = batch.y.to(torch.long) @@ -174,7 +180,7 @@ def run_train(device, data, world_size, model, epochs, batch_size, fan_out, batch.y[:batch_size]) print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) print("Average Inference Iteration Time:", - (time.time() - start) / (i - 10), "s/iter") + (time.time() - start) / (i - warmup_steps), "s/iter") if rank == 0: acc_sum = 0.0 with torch.no_grad(): @@ -203,11 +209,10 @@ def run_train(device, data, world_size, model, epochs, batch_size, fan_out, args = parser.parse_args() # setup multi node torch.distributed.init_process_group("nccl") - nprocs = dist.get_world_size() + nprocs = dist.get_world_size() create_local_process_group(args.ngpu_per_node) local_group = get_local_process_group() - device_id = dist.get_rank( - group=local_group) if dist.is_initialized() else 0 + device_id = dist.get_rank(group=local_group) if dist.is_initialized() else 0 torch.cuda.set_device(device_id) device = torch.device(device_id) all_pids = torch.zeros(dist.get_world_size(), dtype=torch.int64).to(device) @@ -221,5 +226,5 @@ def run_train(device, data, world_size, model, epochs, batch_size, fan_out, data.y = data.y.reshape(-1) model = GCN(dataset.num_features, args.hidden_channels, dataset.num_classes) - run_train(device, data, nprocs, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes) + run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes) From d4605a3cece3aaaf6a8f08cc88c45a35aace54c0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 22 Sep 2023 22:12:23 +0000 Subject: [PATCH 008/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multinode-multigpu-papers100m-gcn.py | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py index 9bde2d61eef1..9795037bdec5 100644 --- a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py +++ b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py @@ -45,10 +45,10 @@ ''' - import argparse import os import time +import warnings import torch import torch.distributed as dist @@ -60,7 +60,6 @@ from torch_geometric.loader import NeighborLoader from torch_geometric.nn import GCNConv -import 
warnings warnings.filterwarnings("ignore") @@ -76,8 +75,10 @@ def pyg_num_work(ngpu_per_node): num_work = os.cpu_count() / (2 * ngpu_per_node) return int(num_work) + _LOCAL_PROCESS_GROUP = None + def create_local_process_group(num_workers_per_node): global _LOCAL_PROCESS_GROUP assert _LOCAL_PROCESS_GROUP is None @@ -88,11 +89,13 @@ def create_local_process_group(num_workers_per_node): num_nodes = world_size // num_workers_per_node node_rank = rank // num_workers_per_node for i in range(num_nodes): - ranks_on_i = list(range(i * num_workers_per_node, (i + 1) * num_workers_per_node)) + ranks_on_i = list( + range(i * num_workers_per_node, (i + 1) * num_workers_per_node)) pg = dist.new_group(ranks_on_i) if i == node_rank: _LOCAL_PROCESS_GROUP = pg + def get_local_process_group(): assert _LOCAL_PROCESS_GROUP is not None return _LOCAL_PROCESS_GROUP @@ -112,8 +115,8 @@ def forward(self, x, edge_index, edge_weight=None): return x -def run_train(device, data, world_size, ngpu_per_node, model, epochs, batch_size, fan_out, - split_idx, num_classes): +def run_train(device, data, world_size, ngpu_per_node, model, epochs, + batch_size, fan_out, split_idx, num_classes): local_group = get_local_process_group() loc_id = dist.get_rank(group=local_group) rank = torch.distributed.get_rank() @@ -130,19 +133,16 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, batch_size num_work = pyg_num_work(ngpu_per_node) train_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['train'], - batch_size=batch_size, - shuffle=True, + batch_size=batch_size, shuffle=True, num_workers=num_work) if rank == 0: eval_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['valid'], - batch_size=batch_size, - shuffle=True, + batch_size=batch_size, shuffle=True, num_workers=num_work) test_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['test'], - batch_size=batch_size, - shuffle=False, + batch_size=batch_size, shuffle=False, num_workers=num_work) eval_steps = 1000 warmup_steps = 100 @@ -209,10 +209,11 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, batch_size args = parser.parse_args() # setup multi node torch.distributed.init_process_group("nccl") - nprocs = dist.get_world_size() + nprocs = dist.get_world_size() create_local_process_group(args.ngpu_per_node) local_group = get_local_process_group() - device_id = dist.get_rank(group=local_group) if dist.is_initialized() else 0 + device_id = dist.get_rank( + group=local_group) if dist.is_initialized() else 0 torch.cuda.set_device(device_id) device = torch.device(device_id) all_pids = torch.zeros(dist.get_world_size(), dtype=torch.int64).to(device) @@ -226,5 +227,5 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, batch_size data.y = data.y.reshape(-1) model = GCN(dataset.num_features, args.hidden_channels, dataset.num_classes) - run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes) + run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, + args.batch_size, args.fan_out, split_idx, dataset.num_classes) From 94e47570d6b7010f6bf57ad52e15f0f317542cdf Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 22 Sep 2023 15:27:32 -0700 Subject: [PATCH 009/197] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76e0eafc2a69..abd149ca6cd5 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [2.4.0] - 2023-MM-DD ### Added - +- Added multinode-multigpu Papers100m GCN example ([#8070](https://github.com/pyg-team/pytorch_geometric/pull/8070)) - Added `OnDiskDataset` interface ([#8066](https://github.com/pyg-team/pytorch_geometric/pull/8066)) - Added a tutorial for `Node2Vec` and `MetaPath2Vec` usage ([#7938](https://github.com/pyg-team/pytorch_geometric/pull/7938) - Added a tutorial for multi-GPU training with pure PyTorch ([#7894](https://github.com/pyg-team/pytorch_geometric/pull/7894) From b45f939ca9427c706b30353d6046eb711e058434 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 25 Sep 2023 09:46:17 -0700 Subject: [PATCH 010/197] removing unused code --- examples/multi_gpu/multinode-multigpu-papers100m-gcn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py index 9795037bdec5..be2e84bb47ac 100644 --- a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py +++ b/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py @@ -216,9 +216,6 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, group=local_group) if dist.is_initialized() else 0 torch.cuda.set_device(device_id) device = torch.device(device_id) - all_pids = torch.zeros(dist.get_world_size(), dtype=torch.int64).to(device) - all_pids[dist.get_rank()] = os.getpid() - dist.all_reduce(all_pids) dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() From 25bce033e64a309666860de9b1116ebb469ac6f4 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 25 Sep 2023 09:59:37 -0700 Subject: [PATCH 011/197] cleaning up --- ...pu-papers100m-gcn.py => multinode_multigpu_papers100m_gcn.py} | 1 - 1 file changed, 1 deletion(-) rename examples/multi_gpu/{multinode-multigpu-papers100m-gcn.py => multinode_multigpu_papers100m_gcn.py} (99%) diff --git a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py similarity index 99% rename from examples/multi_gpu/multinode-multigpu-papers100m-gcn.py rename to examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index be2e84bb47ac..5e7733b3dc97 100644 --- a/examples/multi_gpu/multinode-multigpu-papers100m-gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -52,7 +52,6 @@ import torch import torch.distributed as dist -import torch.multiprocessing as mp import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel From 96719ef8e00c0a0cc0eda3a24a28311c594748be Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 25 Sep 2023 10:25:36 -0700 Subject: [PATCH 012/197] cleaning flake --- .../multinode_multigpu_papers100m_gcn.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 5e7733b3dc97..bb565a118d14 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -1,7 +1,8 @@ ''' in terminal 1: -srun --overlap -A -p interactive -J -N 2 -t 02:00:00 --pty bash +srun --overlap -A -p interactive \ + -J -N 2 -t 02:00:00 --pty bash in terminal 2: squeue 
-u @@ -10,34 +11,39 @@ then -srun -l -N2 --ntasks-per-node=1 --overlap --jobid=$jobid \ ---container-image= --container-name=cont \ ---container-mounts=/ogb-papers100m/:/workspace/dataset true +srun -l -N2 --ntasks-per-node=1 --overlap --jobid=$jobid + --container-image= --container-name=cont + --container-mounts=/ogb-papers100m/:/workspace/dataset true -srun -l -N2 --ntasks-per-node=3 --overlap --jobid=$jobid \ ---container-name=cont-rp-9-22 \ ---container-mounts=/lustre/fsw/dlfw/dlfw-pyg/riship/ogb-papers100m/:/workspace/dataset/ \ +srun -l -N2 --ntasks-per-node=3 --overlap --jobid=$jobid + --container-name=cont-rp-9-22 + --container-mounts= + /lustre/fsw/dlfw/dlfw-pyg/riship/ogb-papers100m/:/workspace/dataset/ python3 multinode-papers100m-gcn.py --ngpu_per_node 3 Results: -Data = Data(num_nodes=111059956, edge_index=[2, 1615685872], x=[111059956, 128], node_year=[111059956, 1], y=[111059956]) +Data = Data(num_nodes=111059956, edge_index=[2, 1615685872], + x=[111059956, 128], node_year=[111059956, 1], y=[111059956]) Using 6 GPUs... Beginning training... -Epoch: 0, Iteration: 1570, Loss: tensor(2.7372, device='cuda:0', grad_fn=) +Epoch: 0, Iteration: 1570, Loss: + tensor(2.7372, device='cuda:0', grad_fn=) Average Training Iteration Time: 0.0022558025027757116 s/iter Validation Accuracy: 33.1712% Average Inference Iteration Time: 0.002441989262174637 s/iter -Epoch: 1, Iteration: 1570, Loss: tensor(2.6074, device='cuda:0', grad_fn=) +Epoch: 1, Iteration: 1570, Loss: + tensor(2.6074, device='cuda:0', grad_fn=) Average Training Iteration Time: 0.002187901319104231 s/iter Validation Accuracy: 32.2733% Average Inference Iteration Time: 0.002225210835015855 s/iter -Epoch: 2, Iteration: 1570, Loss: tensor(2.5593, device='cuda:0', grad_fn=) +Epoch: 2, Iteration: 1570, Loss: + tensor(2.5593, device='cuda:0', grad_fn=) Average Training Iteration Time: 0.002199090496994302 s/iter Validation Accuracy: 33.9588% Average Inference Iteration Time: 0.003229572181006499 s/iter From 7f4b0a1331821d383b416db997ce7e475ad9a82e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 25 Sep 2023 10:40:02 -0700 Subject: [PATCH 013/197] fix --- examples/multi_gpu/multigpu_papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/multigpu_papers100m_gcn.py b/examples/multi_gpu/multigpu_papers100m_gcn.py index 52d69512f9ea..f029a1c797ac 100644 --- a/examples/multi_gpu/multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multigpu_papers100m_gcn.py @@ -61,7 +61,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, eval_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['valid'], batch_size=batch_size, - shuffle=True + shuffle=True, num_workers=num_work) test_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['test'], From c78de794314f0124f26a3391189d5be02b07e626 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 17:40:56 +0000 Subject: [PATCH 014/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multigpu_papers100m_gcn.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/multigpu_papers100m_gcn.py b/examples/multi_gpu/multigpu_papers100m_gcn.py index f029a1c797ac..b4a9d0ff0751 100644 --- a/examples/multi_gpu/multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multigpu_papers100m_gcn.py @@ 
-54,19 +54,16 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, num_work = pyg_num_work(world_size) train_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['train'], - batch_size=batch_size, - shuffle=True, + batch_size=batch_size, shuffle=True, num_workers=num_work) if rank == 0: eval_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['valid'], - batch_size=batch_size, - shuffle=True, + batch_size=batch_size, shuffle=True, num_workers=num_work) test_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], input_nodes=split_idx['test'], - batch_size=batch_size, - shuffle=False, + batch_size=batch_size, shuffle=False, num_workers=num_work) eval_steps = 1000 warmup_steps = 100 From 381cfe52b68d0614e9f029a545f5354869759432 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 25 Sep 2023 17:42:33 -0700 Subject: [PATCH 015/197] cleaning --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index bb565a118d14..12832c74d7b3 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -5,9 +5,10 @@ -J -N 2 -t 02:00:00 --pty bash in terminal 2: + squeue -u then -export jobid=<> +export jobid= then @@ -16,7 +17,7 @@ --container-mounts=/ogb-papers100m/:/workspace/dataset true srun -l -N2 --ntasks-per-node=3 --overlap --jobid=$jobid - --container-name=cont-rp-9-22 + --container-name=cont --container-mounts= /lustre/fsw/dlfw/dlfw-pyg/riship/ogb-papers100m/:/workspace/dataset/ python3 multinode-papers100m-gcn.py --ngpu_per_node 3 From 4da89c146bbed634e75a1f5d7a8f21e157fac5f6 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 6 Oct 2023 09:15:49 -0700 Subject: [PATCH 016/197] Rename multigpu_papers100m_gcn.py to singlenode_multigpu_papers100m_gcn.py --- ...pu_papers100m_gcn.py => singlenode_multigpu_papers100m_gcn.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/multi_gpu/{multigpu_papers100m_gcn.py => singlenode_multigpu_papers100m_gcn.py} (100%) diff --git a/examples/multi_gpu/multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py similarity index 100% rename from examples/multi_gpu/multigpu_papers100m_gcn.py rename to examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py From 60132dd1f0bb48fb9532f65bb75d6477712f4e3e Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 10:45:21 -0700 Subject: [PATCH 017/197] upgrading papers100m examples --- .../multinode_multigpu_papers100m_gcn.py | 30 ++++++++++++++----- examples/ogbn_papers_100m.py | 10 ++++++- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 12832c74d7b3..3aa22a1f30e8 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -65,7 +65,7 @@ from torchmetrics import Accuracy from torch_geometric.loader import NeighborLoader -from torch_geometric.nn import GCNConv +from torch_geometric.nn import GCNConv, GATConv warnings.filterwarnings("ignore") @@ -107,11 +107,15 @@ def get_local_process_group(): return _LOCAL_PROCESS_GROUP -class GCN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, 
out_channels): +class GNN(torch.nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=1): super().__init__() - self.conv1 = GCNConv(in_channels, hidden_channels) - self.conv2 = GCNConv(hidden_channels, out_channels) + if use_gat_conv: + self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) + self.conv2 = GATConv(hidden_channels, out_channels, heads=n_gat_conv_heads) + else: + self.conv1 = GCNConv(in_channels, hidden_channels) + self.conv2 = GCNConv(hidden_channels, out_channels) def forward(self, x, edge_index, edge_weight=None): x = F.dropout(x, p=0.5, training=self.training) @@ -212,6 +216,18 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, default="1", help="number of GPU(s) for each node for multi-gpu training,", ) + parser.add_argument( + "--use_gat_conv", + type=bool, + default=False, + help="Wether or not to use GATConv. (Defaults to using GCNConv)", + ) + parser.add_argument( + "--n_gat_conv_heads", + type=int, + default=1, + help="If using GATConv, number of attention heads to use", + ) args = parser.parse_args() # setup multi node torch.distributed.init_process_group("nccl") @@ -228,7 +244,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, data = dataset[0] data.y = data.y.reshape(-1) - model = GCN(dataset.num_features, args.hidden_channels, - dataset.num_classes) + model = GNN(dataset.num_features, args.hidden_channels, + dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads) run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 5bd7b590e6a4..605e23ec42f1 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -51,11 +51,14 @@ def forward(self, x, edge_index): model = GCN(dataset.num_features, 64, dataset.num_classes).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) +warmup_steps = 50 def train(): model.train() for i, batch in enumerate(train_loader): + if i >= warmup_steps: + start_avg_time = time.perf_counter() start = time.perf_counter() batch = batch.to(device) optimizer.zero_grad() @@ -69,6 +72,8 @@ def train(): print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}, ' f's/iter: {time.perf_counter() - start:.6f}') + print(f'Average Training Iteration Time (s/iter): {time.perf_counter() - start_avg_time:.6f}') + @torch.no_grad() def test(loader: NeighborLoader, eval_steps: Optional[int] = None): @@ -78,7 +83,8 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): for i, batch in enumerate(loader): if eval_steps is not None and i >= eval_steps: break - + if i >= warmup_steps: + start_avg_time = time.perf_counter() batch = batch.to(device) out = model(batch.x, batch.edge_index)[:batch.batch_size] pred = out.argmax(dim=-1) @@ -87,6 +93,8 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): total_correct += int((pred == y).sum()) total_examples += y.size(0) + print(f'Average Inference Iteration Time (s/iter): {time.perf_counter() - start_avg_time:.6f}') + return total_correct / total_examples From 0c36a60f139585ed9186790ae867c2de670653cc Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 11:00:34 -0700 Subject: [PATCH 018/197] upgrading papers100m examples --- .../multinode_multigpu_papers100m_gcn.py | 3 +- .../singlenode_multigpu_papers100m_gcn.py | 28 
+++++++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 3aa22a1f30e8..c56858a630f6 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -19,7 +19,7 @@ srun -l -N2 --ntasks-per-node=3 --overlap --jobid=$jobid --container-name=cont --container-mounts= - /lustre/fsw/dlfw/dlfw-pyg/riship/ogb-papers100m/:/workspace/dataset/ + /ogb-papers100m/:/workspace/dataset/ python3 multinode-papers100m-gcn.py --ngpu_per_node 3 @@ -228,6 +228,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, default=1, help="If using GATConv, number of attention heads to use", ) + args = parser.parse_args() # setup multi node torch.distributed.init_process_group("nccl") diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index b4a9d0ff0751..0b61d181dd85 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -26,11 +26,15 @@ def pyg_num_work(world_size): return int(num_work) -class GCN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels): +class GNN(torch.nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=1): super().__init__() - self.conv1 = GCNConv(in_channels, hidden_channels) - self.conv2 = GCNConv(hidden_channels, out_channels) + if use_gat_conv: + self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) + self.conv2 = GATConv(hidden_channels, out_channels, heads=n_gat_conv_heads) + else: + self.conv1 = GCNConv(in_channels, hidden_channels) + self.conv2 = GCNConv(hidden_channels, out_channels) def forward(self, x, edge_index, edge_weight=None): x = F.dropout(x, p=0.5, training=self.training) @@ -121,6 +125,18 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, parser.add_argument('--epochs', type=int, default=3) parser.add_argument('--batch_size', type=int, default=128) parser.add_argument('--fan_out', type=int, default=50) + parser.add_argument( + "--use_gat_conv", + type=bool, + default=False, + help="Wether or not to use GATConv. 
(Defaults to using GCNConv)", + ) + parser.add_argument( + "--n_gat_conv_heads", + type=int, + default=1, + help="If using GATConv, number of attention heads to use", + ) args = parser.parse_args() @@ -128,8 +144,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx = dataset.get_idx_split() data = dataset[0] data.y = data.y.reshape(-1) - model = GCN(dataset.num_features, args.hidden_channels, - dataset.num_classes) + model = GNN(dataset.num_features, args.hidden_channels, + dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads) print("Data =", data) world_size = torch.cuda.device_count() print('Let\'s use', world_size, 'GPUs!') From 32132c9d428a89194445d6c1a96d02b29aa29bd5 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 11:17:06 -0700 Subject: [PATCH 019/197] upgrading papers100m examples --- .../multinode_multigpu_papers100m_gcn.py | 14 +++--- .../singlenode_multigpu_papers100m_gcn.py | 12 ++--- examples/ogbn_papers_100m.py | 46 ++++++++++++++----- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index c56858a630f6..db4f09d1a0b0 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -108,7 +108,7 @@ def get_local_process_group(): class GNN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=1): + def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=4): super().__init__() if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) @@ -205,11 +205,11 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--hidden_channels', type=int, default=64) - parser.add_argument('--lr', type=float, default=0.01) + parser.add_argument('--hidden_channels', type=int, default=128) + parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=3) - parser.add_argument('--batch_size', type=int, default=128) - parser.add_argument('--fan_out', type=int, default=50) + parser.add_argument('--batch_size', type=int, default=2048) + parser.add_argument('--fan_out', type=int, default=16) parser.add_argument( "--ngpu_per_node", type=int, @@ -225,10 +225,10 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, parser.add_argument( "--n_gat_conv_heads", type=int, - default=1, + default=4, help="If using GATConv, number of attention heads to use", ) - + args = parser.parse_args() # setup multi node torch.distributed.init_process_group("nccl") diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 0b61d181dd85..b110672509fe 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -27,7 +27,7 @@ def pyg_num_work(world_size): class GNN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=1): + def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=4): super().__init__() if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) @@ -120,11 +120,11 @@ def 
run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--hidden_channels', type=int, default=64) - parser.add_argument('--lr', type=float, default=0.01) + parser.add_argument('--hidden_channels', type=int, default=128) + parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=3) - parser.add_argument('--batch_size', type=int, default=128) - parser.add_argument('--fan_out', type=int, default=50) + parser.add_argument('--batch_size', type=int, default=2048) + parser.add_argument('--fan_out', type=int, default=16) parser.add_argument( "--use_gat_conv", type=bool, @@ -134,7 +134,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, parser.add_argument( "--n_gat_conv_heads", type=int, - default=1, + default=4, help="If using GATConv, number of attention heads to use", ) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 605e23ec42f1..9245094f701c 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -1,14 +1,32 @@ import os import time from typing import Optional - +import argparse import torch import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset from torch_geometric.loader import NeighborLoader -from torch_geometric.nn import GCNConv - +from torch_geometric.nn import GCNConv, GATConv + +parser = argparse.ArgumentParser() +parser.add_argument('--hidden_channels', type=int, default=128) +parser.add_argument('--lr', type=float, default=0.001) +parser.add_argument('--epochs', type=int, default=3) +parser.add_argument('--batch_size', type=int, default=2048) +parser.add_argument('--fan_out', type=int, default=16) +parser.add_argument( + "--use_gat_conv", + type=bool, + default=False, + help="Wether or not to use GATConv. 
(Defaults to using GCNConv)", +) +parser.add_argument( + "--n_gat_conv_heads", + type=int, + default=4, + help="If using GATConv, number of attention heads to use", +) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') dataset = PygNodePropPredDataset(name='ogbn-papers100M') @@ -34,21 +52,25 @@ def get_num_workers() -> int: test_loader = NeighborLoader(input_nodes=split_idx['test'], **kwargs) -class GCN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels): +class GNN(torch.nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=4): super().__init__() - self.conv1 = GCNConv(in_channels, hidden_channels) - self.conv2 = GCNConv(hidden_channels, out_channels) - - def forward(self, x, edge_index): + if use_gat_conv: + self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) + self.conv2 = GATConv(hidden_channels, out_channels, heads=n_gat_conv_heads) + else: + self.conv1 = GCNConv(in_channels, hidden_channels) + self.conv2 = GCNConv(hidden_channels, out_channels) + + def forward(self, x, edge_index, edge_weight=None): x = F.dropout(x, p=0.5, training=self.training) - x = self.conv1(x, edge_index).relu() + x = self.conv1(x, edge_index, edge_weight).relu() x = F.dropout(x, p=0.5, training=self.training) - x = self.conv2(x, edge_index) + x = self.conv2(x, edge_index, edge_weight) return x -model = GCN(dataset.num_features, 64, dataset.num_classes).to(device) +model = GNN(dataset.num_features, 64, dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) warmup_steps = 50 From 05700326889a737e0653861ca21828c00e331266 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 11:36:34 -0700 Subject: [PATCH 020/197] upgrading papers100m examples --- .../multi_gpu/multinode_multigpu_papers100m_gcn.py | 6 ++++++ .../multi_gpu/singlenode_multigpu_papers100m_gcn.py | 6 ++++++ examples/ogbn_papers_100m.py | 13 ++++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index db4f09d1a0b0..5a8ead7de9a0 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -228,6 +228,12 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, default=4, help="If using GATConv, number of attention heads to use", ) + parser.add_argument( + "--cugraph_data_loader", + type=bool, + default=False, + help="Wether or not to use CuGraph for Neighbor Loading", + ) args = parser.parse_args() # setup multi node diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index b110672509fe..d2903dd44dab 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -137,6 +137,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, default=4, help="If using GATConv, number of attention heads to use", ) + parser.add_argument( + "--cugraph_data_loader", + type=bool, + default=False, + help="Wether or not to use CuGraph for Neighbor Loading", + ) args = parser.parse_args() diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 9245094f701c..0bd60c1c10f9 100644 --- a/examples/ogbn_papers_100m.py +++ 
b/examples/ogbn_papers_100m.py @@ -27,6 +27,13 @@ default=4, help="If using GATConv, number of attention heads to use", ) +parser.add_argument( + "--cugraph_data_loader", + type=bool, + default=False, + help="Wether or not to use CuGraph for Neighbor Loading", +) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') dataset = PygNodePropPredDataset(name='ogbn-papers100M') @@ -42,8 +49,8 @@ def get_num_workers() -> int: kwargs = dict( data=dataset[0], - num_neighbors=[50, 50], - batch_size=128, + num_neighbors=[args.fan_out, args.fan_out], + batch_size=args.batch_size, num_workers=get_num_workers(), ) train_loader = NeighborLoader(input_nodes=split_idx['train'], shuffle=True, @@ -70,7 +77,7 @@ def forward(self, x, edge_index, edge_weight=None): return x -model = GNN(dataset.num_features, 64, dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads).to(device) +model = GNN(dataset.num_features, args.hidden_channels, dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) warmup_steps = 50 From 90c7baffc27248026effb11e3651ec4af33a1a71 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 11:40:31 -0700 Subject: [PATCH 021/197] upgrading papers100m examples --- examples/ogbn_papers_100m.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 0bd60c1c10f9..4a29e2fafe17 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -78,7 +78,7 @@ def forward(self, x, edge_index, edge_weight=None): model = GNN(dataset.num_features, args.hidden_channels, dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads).to(device) -optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) +optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) warmup_steps = 50 @@ -127,7 +127,7 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): return total_correct / total_examples -for epoch in range(1, 4): +for epoch in range(1, 1+args.epochs): train() val_acc = test(val_loader, eval_steps=100) print(f'Val Acc: ~{val_acc:.4f}') From c517c7d0a4de6cc6efa7eb663dd763d9557662aa Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 11:58:22 -0700 Subject: [PATCH 022/197] upgrading papers100m examples --- .../multinode_multigpu_papers100m_gcn.py | 44 ++++++++++++------- .../singlenode_multigpu_papers100m_gcn.py | 44 ++++++++++++------- examples/ogbn_papers_100m.py | 27 +++++++++--- 3 files changed, 78 insertions(+), 37 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 5a8ead7de9a0..d3b5dbd87558 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -126,7 +126,7 @@ def forward(self, x, edge_index, edge_weight=None): def run_train(device, data, world_size, ngpu_per_node, model, epochs, - batch_size, fan_out, split_idx, num_classes): + batch_size, fan_out, split_idx, num_classes, cugraph_data_loader): local_group = get_local_process_group() loc_id = dist.get_rank(group=local_group) rank = torch.distributed.get_rank() @@ -141,19 +141,33 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) num_work = pyg_num_work(ngpu_per_node) - train_loader = 
NeighborLoader(data, num_neighbors=[fan_out, fan_out], - input_nodes=split_idx['train'], - batch_size=batch_size, shuffle=True, - num_workers=num_work) - if rank == 0: - eval_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], - input_nodes=split_idx['valid'], - batch_size=batch_size, shuffle=True, - num_workers=num_work) - test_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], - input_nodes=split_idx['test'], - batch_size=batch_size, shuffle=False, - num_workers=num_work) + kwargs = dict( + num_neighbors=[fan_out, fan_out], + batch_size=batch_size, + num_workers=num_work(), + ) + # Set Up Dataloaders + if cugraph_data_loader: + import cugraph + from cugraph_pyg.data import CuGraphStore + from cugraph_pyg.loader import CuGraphNeighborLoader + G = {("N", "E", "N"): graph.edge_index} + N = {"N": graph.num_nodes} + fs = cugraph.gnn.FeatureStore(backend="torch") + fs.add_data(data.x, "N", "x") + fs.add_data(data.y, "N", "y") + cugraph_store = CuGraphStore(fs, G, N) + train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], shuffle=True, **kwargs) + if rank == 0: + eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) + else: + from torch_geometric.loader import NeighborLoader + train_loader = NeighborLoader(data, input_nodes=split_idx['train'], shuffle=True, **kwargs) + if rank == 0: + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], **kwargs) + eval_steps = 1000 warmup_steps = 100 acc = Accuracy(task="multiclass", num_classes=num_classes).to(device) @@ -254,4 +268,4 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, model = GNN(dataset.num_features, args.hidden_channels, dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads) run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, - args.batch_size, args.fan_out, split_idx, dataset.num_classes) + args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index d2903dd44dab..a0f789a9941c 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -10,7 +10,6 @@ from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy -from torch_geometric.loader import NeighborLoader from torch_geometric.nn import GCNConv @@ -45,7 +44,7 @@ def forward(self, x, edge_index, edge_weight=None): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, - split_idx, num_classes): + split_idx, num_classes, cugraph_data_loader): os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12355' dist.init_process_group('nccl', rank=rank, world_size=world_size) @@ -56,19 +55,32 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) num_work = pyg_num_work(world_size) - train_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], - input_nodes=split_idx['train'], - batch_size=batch_size, shuffle=True, - num_workers=num_work) - if rank == 0: - eval_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], - input_nodes=split_idx['valid'], - 
batch_size=batch_size, shuffle=True, - num_workers=num_work) - test_loader = NeighborLoader(data, num_neighbors=[fan_out, fan_out], - input_nodes=split_idx['test'], - batch_size=batch_size, shuffle=False, - num_workers=num_work) + kwargs = dict( + num_neighbors=[fan_out, fan_out], + batch_size=batch_size, + num_workers=num_work(), + ) + # Set Up Dataloaders + if cugraph_data_loader: + import cugraph + from cugraph_pyg.data import CuGraphStore + from cugraph_pyg.loader import CuGraphNeighborLoader + G = {("N", "E", "N"): graph.edge_index} + N = {"N": graph.num_nodes} + fs = cugraph.gnn.FeatureStore(backend="torch") + fs.add_data(data.x, "N", "x") + fs.add_data(data.y, "N", "y") + cugraph_store = CuGraphStore(fs, G, N) + train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], shuffle=True, **kwargs) + if rank == 0: + eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) + else: + from torch_geometric.loader import NeighborLoader + train_loader = NeighborLoader(data, input_nodes=split_idx['train'], shuffle=True, **kwargs) + if rank == 0: + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], **kwargs) eval_steps = 1000 warmup_steps = 100 acc = Accuracy(task="multiclass", num_classes=num_classes).to(rank) @@ -157,5 +169,5 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, print('Let\'s use', world_size, 'GPUs!') mp.spawn( run_train, args=(data, world_size, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes), + args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader), nprocs=world_size, join=True) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 4a29e2fafe17..2f0df83c7a22 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -6,7 +6,6 @@ import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset -from torch_geometric.loader import NeighborLoader from torch_geometric.nn import GCNConv, GATConv parser = argparse.ArgumentParser() @@ -48,15 +47,31 @@ def get_num_workers() -> int: kwargs = dict( - data=dataset[0], num_neighbors=[args.fan_out, args.fan_out], batch_size=args.batch_size, num_workers=get_num_workers(), ) -train_loader = NeighborLoader(input_nodes=split_idx['train'], shuffle=True, - **kwargs) -val_loader = NeighborLoader(input_nodes=split_idx['valid'], **kwargs) -test_loader = NeighborLoader(input_nodes=split_idx['test'], **kwargs) +# Set Up Dataloaders +data = dataset[0] +if args.cugraph_data_loader: + import cugraph + from cugraph_pyg.data import CuGraphStore + from cugraph_pyg.loader import CuGraphNeighborLoader + G = {("N", "E", "N"): graph.edge_index} + N = {"N": graph.num_nodes} + fs = cugraph.gnn.FeatureStore(backend="torch") + fs.add_data(data.x, "N", "x") + fs.add_data(data.y, "N", "y") + cugraph_store = CuGraphStore(fs, G, N) + train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], shuffle=True, **kwargs) + val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) +else: + from torch_geometric.loader import NeighborLoader + train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], shuffle=True, + **kwargs) + 
val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], **kwargs) + test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], **kwargs) class GNN(torch.nn.Module): From 57605ebfdf3c9004c69835c00281e5c72fc27abd Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 12:02:00 -0700 Subject: [PATCH 023/197] upgrading papers100m examples --- .../multi_gpu/multinode_multigpu_papers100m_gcn.py | 8 ++++---- .../multi_gpu/singlenode_multigpu_papers100m_gcn.py | 10 +++++----- examples/ogbn_papers_100m.py | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index d3b5dbd87558..16fd4b9a4f8b 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -140,7 +140,6 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, model = DistributedDataParallel(model, device_ids=[loc_id]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) - num_work = pyg_num_work(ngpu_per_node) kwargs = dict( num_neighbors=[fan_out, fan_out], batch_size=batch_size, @@ -163,10 +162,11 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) else: from torch_geometric.loader import NeighborLoader - train_loader = NeighborLoader(data, input_nodes=split_idx['train'], shuffle=True, **kwargs) + num_work = pyg_num_work(world_size) + train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, **kwargs) if rank == 0: - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], **kwargs) + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) eval_steps = 1000 warmup_steps = 100 diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index a0f789a9941c..7bb0c1f8ac71 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -54,11 +54,10 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, model = DistributedDataParallel(model, device_ids=[rank]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) - num_work = pyg_num_work(world_size) + kwargs = dict( num_neighbors=[fan_out, fan_out], batch_size=batch_size, - num_workers=num_work(), ) # Set Up Dataloaders if cugraph_data_loader: @@ -77,10 +76,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) else: from torch_geometric.loader import NeighborLoader - train_loader = NeighborLoader(data, input_nodes=split_idx['train'], shuffle=True, **kwargs) + num_work = pyg_num_work(world_size) + train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, **kwargs) if rank == 0: - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], **kwargs) + eval_loader = NeighborLoader(data, 
input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) eval_steps = 1000 warmup_steps = 100 acc = Accuracy(task="multiclass", num_classes=num_classes).to(rank) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 2f0df83c7a22..3738671db1f3 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -49,7 +49,6 @@ def get_num_workers() -> int: kwargs = dict( num_neighbors=[args.fan_out, args.fan_out], batch_size=args.batch_size, - num_workers=get_num_workers(), ) # Set Up Dataloaders data = dataset[0] @@ -67,11 +66,12 @@ def get_num_workers() -> int: val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) else: + num_work = get_num_workers() from torch_geometric.loader import NeighborLoader - train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], shuffle=True, + train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, **kwargs) - val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], **kwargs) - test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], **kwargs) + val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) class GNN(torch.nn.Module): From b1a216268aa6f593f7e148064e71b4a71816d159 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 12:02:47 -0700 Subject: [PATCH 024/197] upgrading papers100m examples --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 3 +-- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- examples/ogbn_papers_100m.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 16fd4b9a4f8b..7bbefa8a2252 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -143,9 +143,8 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, kwargs = dict( num_neighbors=[fan_out, fan_out], batch_size=batch_size, - num_workers=num_work(), ) - # Set Up Dataloaders + # Set Up Neighbor Loading if cugraph_data_loader: import cugraph from cugraph_pyg.data import CuGraphStore diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 7bb0c1f8ac71..4213d65eaeea 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -59,7 +59,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, num_neighbors=[fan_out, fan_out], batch_size=batch_size, ) - # Set Up Dataloaders + # Set Up Neighbor Loading if cugraph_data_loader: import cugraph from cugraph_pyg.data import CuGraphStore diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 3738671db1f3..014e17fe5669 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -50,7 +50,7 @@ def get_num_workers() -> int: num_neighbors=[args.fan_out, args.fan_out], batch_size=args.batch_size, ) -# Set Up Dataloaders +# Set Up Neighbor Loading data = 
dataset[0] if args.cugraph_data_loader: import cugraph From bac5de5f3609f88d4262f359256c632248ab6d10 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Oct 2023 19:37:22 +0000 Subject: [PATCH 025/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multinode_multigpu_papers100m_gcn.py | 39 +++++++++----- .../singlenode_multigpu_papers100m_gcn.py | 38 ++++++++----- examples/ogbn_papers_100m.py | 53 +++++++++++++------ 3 files changed, 89 insertions(+), 41 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 7bbefa8a2252..1bc94ce310c5 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -65,7 +65,7 @@ from torchmetrics import Accuracy from torch_geometric.loader import NeighborLoader -from torch_geometric.nn import GCNConv, GATConv +from torch_geometric.nn import GATConv, GCNConv warnings.filterwarnings("ignore") @@ -108,11 +108,14 @@ def get_local_process_group(): class GNN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=4): + def __init__(self, in_channels, hidden_channels, out_channels, + use_gat_conv=False, n_gat_conv_heads=4): super().__init__() if use_gat_conv: - self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(hidden_channels, out_channels, heads=n_gat_conv_heads) + self.conv1 = GATConv(in_channels, hidden_channels, + heads=n_gat_conv_heads) + self.conv2 = GATConv(hidden_channels, out_channels, + heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) self.conv2 = GCNConv(hidden_channels, out_channels) @@ -126,7 +129,8 @@ def forward(self, x, edge_index, edge_weight=None): def run_train(device, data, world_size, ngpu_per_node, model, epochs, - batch_size, fan_out, split_idx, num_classes, cugraph_data_loader): + batch_size, fan_out, split_idx, num_classes, + cugraph_data_loader): local_group = get_local_process_group() loc_id = dist.get_rank(group=local_group) rank = torch.distributed.get_rank() @@ -155,17 +159,27 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, G, N) - train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], shuffle=True, **kwargs) + train_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['train'], + shuffle=True, **kwargs) if rank == 0: - eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) - test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) + eval_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['valid'], + **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['test'], + **kwargs) else: from torch_geometric.loader import NeighborLoader num_work = pyg_num_work(world_size) - train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, **kwargs) + train_loader = NeighborLoader(data, input_nodes=split_idx['train'], + num_workers=num_work, shuffle=True, + **kwargs) if rank == 0: - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) - test_loader 
= NeighborLoader(data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], + num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) eval_steps = 1000 warmup_steps = 100 @@ -267,4 +281,5 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, model = GNN(dataset.num_features, args.hidden_channels, dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads) run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, - args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader) + args.batch_size, args.fan_out, split_idx, dataset.num_classes, + args.cugraph_data_loader) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 4213d65eaeea..61bc380c8995 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -26,11 +26,14 @@ def pyg_num_work(world_size): class GNN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=4): + def __init__(self, in_channels, hidden_channels, out_channels, + use_gat_conv=False, n_gat_conv_heads=4): super().__init__() if use_gat_conv: - self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(hidden_channels, out_channels, heads=n_gat_conv_heads) + self.conv1 = GATConv(in_channels, hidden_channels, + heads=n_gat_conv_heads) + self.conv2 = GATConv(hidden_channels, out_channels, + heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) self.conv2 = GCNConv(hidden_channels, out_channels) @@ -54,7 +57,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, model = DistributedDataParallel(model, device_ids=[rank]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) - + kwargs = dict( num_neighbors=[fan_out, fan_out], batch_size=batch_size, @@ -70,17 +73,27 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, G, N) - train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], shuffle=True, **kwargs) + train_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['train'], + shuffle=True, **kwargs) if rank == 0: - eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) - test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) + eval_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['valid'], + **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['test'], + **kwargs) else: from torch_geometric.loader import NeighborLoader num_work = pyg_num_work(world_size) - train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, **kwargs) + train_loader = NeighborLoader(data, input_nodes=split_idx['train'], + num_workers=num_work, shuffle=True, + **kwargs) if rank == 0: - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) + eval_loader = NeighborLoader(data, 
input_nodes=split_idx['valid'], + num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) eval_steps = 1000 warmup_steps = 100 acc = Accuracy(task="multiclass", num_classes=num_classes).to(rank) @@ -169,5 +182,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, print('Let\'s use', world_size, 'GPUs!') mp.spawn( run_train, args=(data, world_size, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader), - nprocs=world_size, join=True) + args.fan_out, split_idx, dataset.num_classes, + args.cugraph_data_loader), nprocs=world_size, + join=True) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 014e17fe5669..1f80d10b2458 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -1,12 +1,13 @@ +import argparse import os import time from typing import Optional -import argparse + import torch import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset -from torch_geometric.nn import GCNConv, GATConv +from torch_geometric.nn import GATConv, GCNConv parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=128) @@ -62,24 +63,35 @@ def get_num_workers() -> int: fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, G, N) - train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], shuffle=True, **kwargs) - val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) - test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) + train_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['train'], + shuffle=True, **kwargs) + val_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['valid'], + **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['test'], + **kwargs) else: num_work = get_num_workers() from torch_geometric.loader import NeighborLoader - train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, - **kwargs) - val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) + train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], + num_workers=num_work, shuffle=True, **kwargs) + val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], + num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) class GNN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, use_gat_conv=False, n_gat_conv_heads=4): + def __init__(self, in_channels, hidden_channels, out_channels, + use_gat_conv=False, n_gat_conv_heads=4): super().__init__() if use_gat_conv: - self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(hidden_channels, out_channels, heads=n_gat_conv_heads) + self.conv1 = GATConv(in_channels, hidden_channels, + heads=n_gat_conv_heads) + self.conv2 = GATConv(hidden_channels, out_channels, + heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) self.conv2 = GCNConv(hidden_channels, out_channels) @@ -92,11 +104,14 @@ def forward(self, x, edge_index, 
edge_weight=None): return x -model = GNN(dataset.num_features, args.hidden_channels, dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads).to(device) -optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) +model = GNN(dataset.num_features, args.hidden_channels, dataset.num_classes, + args.use_gat_conv, args.n_gat_conv_heads).to(device) +optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, + weight_decay=0.0005) warmup_steps = 50 + def train(): model.train() @@ -116,7 +131,9 @@ def train(): print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}, ' f's/iter: {time.perf_counter() - start:.6f}') - print(f'Average Training Iteration Time (s/iter): {time.perf_counter() - start_avg_time:.6f}') + print( + f'Average Training Iteration Time (s/iter): {time.perf_counter() - start_avg_time:.6f}' + ) @torch.no_grad() @@ -137,12 +154,14 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): total_correct += int((pred == y).sum()) total_examples += y.size(0) - print(f'Average Inference Iteration Time (s/iter): {time.perf_counter() - start_avg_time:.6f}') + print( + f'Average Inference Iteration Time (s/iter): {time.perf_counter() - start_avg_time:.6f}' + ) return total_correct / total_examples -for epoch in range(1, 1+args.epochs): +for epoch in range(1, 1 + args.epochs): train() val_acc = test(val_loader, eval_steps=100) print(f'Val Acc: ~{val_acc:.4f}') From e9253c8fd50beeb4f90168447f9013214e84c6da Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Tue, 10 Oct 2023 13:00:33 -0700 Subject: [PATCH 026/197] upgrading papers100m examples --- .../multi_gpu/multinode_multigpu_papers100m_gcn.py | 5 ++--- .../multi_gpu/singlenode_multigpu_papers100m_gcn.py | 6 +++--- examples/ogbn_papers_100m.py | 12 +++++++----- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 1bc94ce310c5..53048d5bec95 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -64,7 +64,6 @@ from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy -from torch_geometric.loader import NeighborLoader from torch_geometric.nn import GATConv, GCNConv warnings.filterwarnings("ignore") @@ -153,8 +152,8 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, import cugraph from cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import CuGraphNeighborLoader - G = {("N", "E", "N"): graph.edge_index} - N = {"N": graph.num_nodes} + G = {("N", "E", "N"): data.edge_index} + N = {"N": data.num_nodes} fs = cugraph.gnn.FeatureStore(backend="torch") fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 61bc380c8995..9c5378113a29 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -10,7 +10,7 @@ from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy -from torch_geometric.nn import GCNConv +from torch_geometric.nn import GCNConv, GATConv def pyg_num_work(world_size): @@ -67,8 +67,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, import cugraph from cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import CuGraphNeighborLoader - G = {("N", "E", 
"N"): graph.edge_index} - N = {"N": graph.num_nodes} + G = {("N", "E", "N"): data.edge_index} + N = {"N": data.num_nodes} fs = cugraph.gnn.FeatureStore(backend="torch") fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 1f80d10b2458..e25997c7e4c6 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -33,7 +33,7 @@ default=False, help="Wether or not to use CuGraph for Neighbor Loading", ) - +args = parser.parse_args() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') dataset = PygNodePropPredDataset(name='ogbn-papers100M') @@ -57,8 +57,8 @@ def get_num_workers() -> int: import cugraph from cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import CuGraphNeighborLoader - G = {("N", "E", "N"): graph.edge_index} - N = {"N": graph.num_nodes} + G = {("N", "E", "N"): data.edge_index} + N = {"N": data.num_nodes} fs = cugraph.gnn.FeatureStore(backend="torch") fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") @@ -132,7 +132,8 @@ def train(): f's/iter: {time.perf_counter() - start:.6f}') print( - f'Average Training Iteration Time (s/iter): {time.perf_counter() - start_avg_time:.6f}' + f'Average Training Iteration Time (s/iter): \ + {time.perf_counter() - start_avg_time:.6f}' ) @@ -155,7 +156,8 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): total_examples += y.size(0) print( - f'Average Inference Iteration Time (s/iter): {time.perf_counter() - start_avg_time:.6f}' + f'Average Inference Iteration Time (s/iter): \ + {time.perf_counter() - start_avg_time:.6f}' ) return total_correct / total_examples From 65873877cb267c81c89958905cd33c5fdbce1709 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Oct 2023 20:01:39 +0000 Subject: [PATCH 027/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- examples/ogbn_papers_100m.py | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 9c5378113a29..efd95e8c4b92 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -10,7 +10,7 @@ from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy -from torch_geometric.nn import GCNConv, GATConv +from torch_geometric.nn import GATConv, GCNConv def pyg_num_work(world_size): diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index e25997c7e4c6..4b2fa01e8291 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -131,10 +131,8 @@ def train(): print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}, ' f's/iter: {time.perf_counter() - start:.6f}') - print( - f'Average Training Iteration Time (s/iter): \ - {time.perf_counter() - start_avg_time:.6f}' - ) + print(f'Average Training Iteration Time (s/iter): \ + {time.perf_counter() - start_avg_time:.6f}') @torch.no_grad() @@ -155,10 +153,8 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): total_correct += int((pred == y).sum()) total_examples += y.size(0) - print( - f'Average Inference Iteration Time (s/iter): \ - {time.perf_counter() - start_avg_time:.6f}' - ) + print(f'Average Inference Iteration Time 
(s/iter): \ + {time.perf_counter() - start_avg_time:.6f}') return total_correct / total_examples From 3630c3c9e1263ee655b4c65f404635f8b599ff9e Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 13:56:07 -0700 Subject: [PATCH 028/197] clean up --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 5 ++--- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 5 ++--- examples/ogbn_papers_100m.py | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 53048d5bec95..2d3623585e89 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -244,8 +244,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, ) parser.add_argument( "--use_gat_conv", - type=bool, - default=False, + action='store_true', help="Wether or not to use GATConv. (Defaults to using GCNConv)", ) parser.add_argument( @@ -257,7 +256,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, parser.add_argument( "--cugraph_data_loader", type=bool, - default=False, + action='store_true', help="Wether or not to use CuGraph for Neighbor Loading", ) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 9c5378113a29..659d4463dcb1 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -152,8 +152,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, parser.add_argument('--fan_out', type=int, default=16) parser.add_argument( "--use_gat_conv", - type=bool, - default=False, + action='store_true', help="Wether or not to use GATConv. (Defaults to using GCNConv)", ) parser.add_argument( @@ -165,7 +164,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, parser.add_argument( "--cugraph_data_loader", type=bool, - default=False, + action='store_true', help="Wether or not to use CuGraph for Neighbor Loading", ) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index e25997c7e4c6..eeaed3c133fb 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -17,8 +17,7 @@ parser.add_argument('--fan_out', type=int, default=16) parser.add_argument( "--use_gat_conv", - type=bool, - default=False, + action='store_true', help="Wether or not to use GATConv. 
(Defaults to using GCNConv)", ) parser.add_argument( @@ -30,7 +29,7 @@ parser.add_argument( "--cugraph_data_loader", type=bool, - default=False, + action='store_true', help="Wether or not to use CuGraph for Neighbor Loading", ) args = parser.parse_args() From 6465e68ccf0e94aef4f5f58410acf966bcf4256a Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 13:59:07 -0700 Subject: [PATCH 029/197] clean up --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 1 - examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 1 - examples/ogbn_papers_100m.py | 1 - 3 files changed, 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 2d3623585e89..acc251452ca1 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -255,7 +255,6 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, ) parser.add_argument( "--cugraph_data_loader", - type=bool, action='store_true', help="Wether or not to use CuGraph for Neighbor Loading", ) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index f86e2eaf8e37..1c4d9e8b6a7a 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -163,7 +163,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, ) parser.add_argument( "--cugraph_data_loader", - type=bool, action='store_true', help="Wether or not to use CuGraph for Neighbor Loading", ) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index f5f22a21b4a4..aaa7a5ca86e5 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -28,7 +28,6 @@ ) parser.add_argument( "--cugraph_data_loader", - type=bool, action='store_true', help="Wether or not to use CuGraph for Neighbor Loading", ) From 51e50a1138d3e71e1644659aa47deb4fe9fc2567 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 15:14:23 -0700 Subject: [PATCH 030/197] clean up --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 2 +- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- examples/ogbn_papers_100m.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index acc251452ca1..146392a033ed 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -113,7 +113,7 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(hidden_channels, out_channels, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 1c4d9e8b6a7a..603ed6bbf69d 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -32,7 +32,7 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - 
self.conv2 = GATConv(hidden_channels, out_channels, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index aaa7a5ca86e5..de4410e7abf7 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -88,7 +88,7 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(hidden_channels, out_channels, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) From d62dd72ebcd5e923d105c27922d30727928048a0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 22:15:20 +0000 Subject: [PATCH 031/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 3 ++- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 3 ++- examples/ogbn_papers_100m.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 146392a033ed..0f907be91fe0 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -113,7 +113,8 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, + out_channels / n_gat_conv_heads, heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 603ed6bbf69d..6989389cdc1e 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -32,7 +32,8 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, + out_channels / n_gat_conv_heads, heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index de4410e7abf7..9a1d6d7f2cc5 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -88,7 +88,8 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, + out_channels / n_gat_conv_heads, heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) From 5104b0eea90847c57446bd156d03f4da22d2a6dc Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 
15:30:41 -0700 Subject: [PATCH 032/197] clean up --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 2 +- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- examples/ogbn_papers_100m.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 146392a033ed..5bd0eecb42bd 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -113,7 +113,7 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, int(out_channels / n_gat_conv_heads), heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 603ed6bbf69d..45b1ce46f4bc 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -32,7 +32,7 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, int(out_channels / n_gat_conv_heads), heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index de4410e7abf7..280b68b07220 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -88,7 +88,7 @@ def __init__(self, in_channels, hidden_channels, out_channels, if use_gat_conv: self.conv1 = GATConv(in_channels, hidden_channels, heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, out_channels / n_gat_conv_heads, + self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, int(out_channels / n_gat_conv_heads), heads=n_gat_conv_heads) else: self.conv1 = GCNConv(in_channels, hidden_channels) From 49318fb63eca4c141df3c4eb4a7f56d6c3d6c62c Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 15:48:16 -0700 Subject: [PATCH 033/197] clean up --- .../multinode_multigpu_papers100m_gcn.py | 29 ++++------------ .../singlenode_multigpu_papers100m_gcn.py | 29 ++++------------ examples/ogbn_papers_100m.py | 33 +++++-------------- 3 files changed, 20 insertions(+), 71 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 5bd0eecb42bd..a7390a596233 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -106,27 +106,6 @@ def get_local_process_group(): return _LOCAL_PROCESS_GROUP -class GNN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, - use_gat_conv=False, n_gat_conv_heads=4): - super().__init__() - if use_gat_conv: - self.conv1 = GATConv(in_channels, hidden_channels, - heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, int(out_channels / n_gat_conv_heads), - heads=n_gat_conv_heads) - else: - self.conv1 = 
GCNConv(in_channels, hidden_channels) - self.conv2 = GCNConv(hidden_channels, out_channels) - - def forward(self, x, edge_index, edge_weight=None): - x = F.dropout(x, p=0.5, training=self.training) - x = self.conv1(x, edge_index, edge_weight).relu() - x = F.dropout(x, p=0.5, training=self.training) - x = self.conv2(x, edge_index, edge_weight) - return x - - def run_train(device, data, world_size, ngpu_per_node, model, epochs, batch_size, fan_out, split_idx, num_classes, cugraph_data_loader): @@ -232,6 +211,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=128) + parser.add_argument('--num_layers', type=int, default=2) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=3) parser.add_argument('--batch_size', type=int, default=2048) @@ -275,8 +255,11 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, data = dataset[0] data.y = data.y.reshape(-1) - model = GNN(dataset.num_features, args.hidden_channels, - dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads) + if args.use_gat_conv: + model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, + heads=args.n_gat_conv_heads).to(device) + else: + model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 45b1ce46f4bc..a7b13b5ac794 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -25,27 +25,6 @@ def pyg_num_work(world_size): return int(num_work) -class GNN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, - use_gat_conv=False, n_gat_conv_heads=4): - super().__init__() - if use_gat_conv: - self.conv1 = GATConv(in_channels, hidden_channels, - heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, int(out_channels / n_gat_conv_heads), - heads=n_gat_conv_heads) - else: - self.conv1 = GCNConv(in_channels, hidden_channels) - self.conv2 = GCNConv(hidden_channels, out_channels) - - def forward(self, x, edge_index, edge_weight=None): - x = F.dropout(x, p=0.5, training=self.training) - x = self.conv1(x, edge_index, edge_weight).relu() - x = F.dropout(x, p=0.5, training=self.training) - x = self.conv2(x, edge_index, edge_weight) - return x - - def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx, num_classes, cugraph_data_loader): os.environ['MASTER_ADDR'] = 'localhost' @@ -146,6 +125,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=128) + parser.add_argument('--num_layers', type=int, default=2) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=3) parser.add_argument('--batch_size', type=int, default=2048) @@ -173,8 +153,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx = 
dataset.get_idx_split() data = dataset[0] data.y = data.y.reshape(-1) - model = GNN(dataset.num_features, args.hidden_channels, - dataset.num_classes, args.use_gat_conv, args.n_gat_conv_heads) + if args.use_gat_conv: + model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, + heads=args.n_gat_conv_heads).to(device) + else: + model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) print("Data =", data) world_size = torch.cuda.device_count() print('Let\'s use', world_size, 'GPUs!') diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 280b68b07220..d86661591a37 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -7,10 +7,11 @@ import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset -from torch_geometric.nn import GATConv, GCNConv +import torch_geometric parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=128) +parser.add_argument('--num_layers', type=int, default=2) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=3) parser.add_argument('--batch_size', type=int, default=2048) @@ -72,7 +73,7 @@ def get_num_workers() -> int: **kwargs) else: num_work = get_num_workers() - from torch_geometric.loader import NeighborLoader + NeighborLoader = torch_geometric.loader.NeighborLoader train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, **kwargs) val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], @@ -80,30 +81,12 @@ def get_num_workers() -> int: test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) +if args.use_gat_conv: + model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, + heads=args.n_gat_conv_heads).to(device) +else: + model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) -class GNN(torch.nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, - use_gat_conv=False, n_gat_conv_heads=4): - super().__init__() - if use_gat_conv: - self.conv1 = GATConv(in_channels, hidden_channels, - heads=n_gat_conv_heads) - self.conv2 = GATConv(n_gat_conv_heads * hidden_channels, int(out_channels / n_gat_conv_heads), - heads=n_gat_conv_heads) - else: - self.conv1 = GCNConv(in_channels, hidden_channels) - self.conv2 = GCNConv(hidden_channels, out_channels) - - def forward(self, x, edge_index, edge_weight=None): - x = F.dropout(x, p=0.5, training=self.training) - x = self.conv1(x, edge_index, edge_weight).relu() - x = F.dropout(x, p=0.5, training=self.training) - x = self.conv2(x, edge_index, edge_weight) - return x - - -model = GNN(dataset.num_features, args.hidden_channels, dataset.num_classes, - args.use_gat_conv, args.n_gat_conv_heads).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) From 8d8963b9f7f49c34922053c76fb79b92dad4d415 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 22:49:41 +0000 Subject: [PATCH 034/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multi_gpu/multinode_multigpu_papers100m_gcn.py | 10 +++++++--- 
.../multi_gpu/singlenode_multigpu_papers100m_gcn.py | 10 +++++++--- examples/ogbn_papers_100m.py | 10 +++++++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index a7390a596233..c0a682d785a5 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -256,10 +256,14 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, data = dataset[0] data.y = data.y.reshape(-1) if args.use_gat_conv: - model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, - heads=args.n_gat_conv_heads).to(device) + model = torch_geometric.nn.models.GAT( + dataset.num_features, args.hidden_channels, args.num_layers, + dataset.num_classes, heads=args.n_gat_conv_heads).to(device) else: - model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) + model = torch_geometric.nn.models.GCN(ddataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes).to(device) run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index a7b13b5ac794..89376aeb61cb 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -154,10 +154,14 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, data = dataset[0] data.y = data.y.reshape(-1) if args.use_gat_conv: - model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, - heads=args.n_gat_conv_heads).to(device) + model = torch_geometric.nn.models.GAT( + dataset.num_features, args.hidden_channels, args.num_layers, + dataset.num_classes, heads=args.n_gat_conv_heads).to(device) else: - model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) + model = torch_geometric.nn.models.GCN(ddataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes).to(device) print("Data =", data) world_size = torch.cuda.device_count() print('Let\'s use', world_size, 'GPUs!') diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index d86661591a37..4418d5280cdc 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -82,10 +82,14 @@ def get_num_workers() -> int: num_workers=num_work, **kwargs) if args.use_gat_conv: - model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, - heads=args.n_gat_conv_heads).to(device) + model = torch_geometric.nn.models.GAT( + dataset.num_features, args.hidden_channels, args.num_layers, + dataset.num_classes, heads=args.n_gat_conv_heads).to(device) else: - model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) + model = torch_geometric.nn.models.GCN(ddataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) From 
f3e5a2eb8170603a527ee62e4e8e5e67b979a148 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 16:00:22 -0700 Subject: [PATCH 035/197] clean up --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 2 +- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- examples/ogbn_papers_100m.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index a7390a596233..08037207f2b9 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -259,7 +259,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, heads=args.n_gat_conv_heads).to(device) else: - model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) + model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index a7b13b5ac794..591470028ea5 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -157,7 +157,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, heads=args.n_gat_conv_heads).to(device) else: - model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) + model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) print("Data =", data) world_size = torch.cuda.device_count() print('Let\'s use', world_size, 'GPUs!') diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index d86661591a37..e232b15f70a7 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -85,7 +85,7 @@ def get_num_workers() -> int: model = torch_geometric.nn.models.GAT(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, heads=args.n_gat_conv_heads).to(device) else: - model = torch_geometric.nn.models.GCN(ddataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) + model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) From 5c8c72a33afd28e02a9d1bad84a2d85f64aac579 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 16:08:49 -0700 Subject: [PATCH 036/197] clean up --- examples/ogbn_papers_100m.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index cc05b218c5f6..13780cbd4c09 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -117,7 +117,7 @@ def train(): f's/iter: {time.perf_counter() - start:.6f}') print(f'Average Training 
Iteration Time (s/iter): \ - {time.perf_counter() - start_avg_time:.6f}') + {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}') @torch.no_grad() @@ -139,7 +139,7 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): total_examples += y.size(0) print(f'Average Inference Iteration Time (s/iter): \ - {time.perf_counter() - start_avg_time:.6f}') + {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}') return total_correct / total_examples From 163460dd46974bdf38637b10f0cd72e1ff045f6a Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 16:24:27 -0700 Subject: [PATCH 037/197] clean up --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 4 ---- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 4 ---- examples/ogbn_papers_100m.py | 3 --- 3 files changed, 11 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index bc64f9dedc25..70c8ddf529f1 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -186,16 +186,12 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, for i, batch in enumerate(eval_loader): if i >= eval_steps: break - if i >= warmup_steps: - start = time.time() batch = batch.to(device) batch.y = batch.y.to(torch.long) out = model(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - print("Average Inference Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") if rank == 0: acc_sum = 0.0 with torch.no_grad(): diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 920bad4ce410..4f55fc955ea7 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -100,16 +100,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, for i, batch in enumerate(eval_loader): if i >= eval_steps: break - if i >= warmup_steps: - start = time.time() batch = batch.to(rank) batch.y = batch.y.to(torch.long) out = model(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - print("Average Inference Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") if rank == 0: acc_sum = 0.0 with torch.no_grad(): diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 13780cbd4c09..da05444360e5 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -138,9 +138,6 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): total_correct += int((pred == y).sum()) total_examples += y.size(0) - print(f'Average Inference Iteration Time (s/iter): \ - {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}') - return total_correct / total_examples From 5a5df2a511964a7f651103e9b7ffb9e157f0542a Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 11 Oct 2023 16:43:17 -0700 Subject: [PATCH 038/197] clean up --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 7 +++---- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py 
b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 70c8ddf529f1..7cf9cc33cf34 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -63,8 +63,7 @@ from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy - -from torch_geometric.nn import GATConv, GCNConv +import torch_geometric warnings.filterwarnings("ignore") @@ -254,12 +253,12 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, if args.use_gat_conv: model = torch_geometric.nn.models.GAT( dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes, heads=args.n_gat_conv_heads).to(device) + dataset.num_classes, heads=args.n_gat_conv_heads) else: model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes).to(device) + dataset.num_classes) run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 4f55fc955ea7..af624c159bee 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -10,7 +10,7 @@ from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy -from torch_geometric.nn import GATConv, GCNConv +import torch_geometric def pyg_num_work(world_size): @@ -152,12 +152,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if args.use_gat_conv: model = torch_geometric.nn.models.GAT( dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes, heads=args.n_gat_conv_heads).to(device) + dataset.num_classes, heads=args.n_gat_conv_heads) else: model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes).to(device) + dataset.num_classes) print("Data =", data) world_size = torch.cuda.device_count() print('Let\'s use', world_size, 'GPUs!') From 6592065ae08abf6f77035bc21adc2bedd7e44729 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 23:44:14 +0000 Subject: [PATCH 039/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 9 ++++++--- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 8 +++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 7cf9cc33cf34..09428eef49db 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -63,6 +63,7 @@ from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy + import torch_geometric warnings.filterwarnings("ignore") @@ -251,9 +252,11 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, data = dataset[0] data.y = data.y.reshape(-1) if args.use_gat_conv: - model = torch_geometric.nn.models.GAT( - dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes, 
heads=args.n_gat_conv_heads) + model = torch_geometric.nn.models.GAT(dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, + heads=args.n_gat_conv_heads) else: model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index af624c159bee..dc023e7fde25 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -150,9 +150,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, data = dataset[0] data.y = data.y.reshape(-1) if args.use_gat_conv: - model = torch_geometric.nn.models.GAT( - dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes, heads=args.n_gat_conv_heads) + model = torch_geometric.nn.models.GAT(dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, + heads=args.n_gat_conv_heads) else: model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, From 7e6ad6744a8d098e8874df7d059766d30160be29 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 12 Oct 2023 08:44:11 -0700 Subject: [PATCH 040/197] clean up --- examples/ogbn_papers_100m.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index da05444360e5..a9b13e963def 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -128,8 +128,6 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): for i, batch in enumerate(loader): if eval_steps is not None and i >= eval_steps: break - if i >= warmup_steps: - start_avg_time = time.perf_counter() batch = batch.to(device) out = model(batch.x, batch.edge_index)[:batch.batch_size] pred = out.argmax(dim=-1) From c91a215b62565a11e50c571ae1a6363ee81ef3ef Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 12 Oct 2023 09:12:21 -0700 Subject: [PATCH 041/197] clean up --- .../multi_gpu/multinode_multigpu_papers100m_gcn.py | 5 ++++- .../multi_gpu/singlenode_multigpu_papers100m_gcn.py | 12 +++++++----- examples/ogbn_papers_100m.py | 4 +++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 09428eef49db..7709e7a143ae 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -232,9 +232,12 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, parser.add_argument( "--cugraph_data_loader", action='store_true', - help="Wether or not to use CuGraph for Neighbor Loading", + help="Wether or not to use CuGraph for Neighbor Loading. 
\ + \nNote that this requires more GPU memory or \ + a reduction in batch_size/fan_out/hidden_channels/num_layers", ) + args = parser.parse_args() # setup multi node torch.distributed.init_process_group("nccl") diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index dc023e7fde25..b8aeca1905a6 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -137,11 +137,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, default=4, help="If using GATConv, number of attention heads to use", ) - parser.add_argument( - "--cugraph_data_loader", - action='store_true', - help="Wether or not to use CuGraph for Neighbor Loading", - ) +parser.add_argument( + "--cugraph_data_loader", + action='store_true', + help="Wether or not to use CuGraph for Neighbor Loading. \ + \nNote that this requires more GPU memory or \ + a reduction in batch_size/fan_out/hidden_channels/num_layers", +) args = parser.parse_args() diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index a9b13e963def..b86984922da7 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -30,7 +30,9 @@ parser.add_argument( "--cugraph_data_loader", action='store_true', - help="Wether or not to use CuGraph for Neighbor Loading", + help="Wether or not to use CuGraph for Neighbor Loading. \ + \nNote that this requires more GPU memory or \ + a reduction in batch_size/fan_out/hidden_channels/num_layers", ) args = parser.parse_args() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') From 82298fec53850d01b889492d8a88c9ad32acc87f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Oct 2023 16:14:43 +0000 Subject: [PATCH 042/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 7709e7a143ae..0bf52ab24a83 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -237,7 +237,6 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, a reduction in batch_size/fan_out/hidden_channels/num_layers", ) - args = parser.parse_args() # setup multi node torch.distributed.init_process_group("nccl") From 17b0ff555b7aaaef71221a15b9cfedd540ee8e2d Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 12 Oct 2023 09:44:27 -0700 Subject: [PATCH 043/197] clean up --- .../singlenode_multigpu_papers100m_gcn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index b8aeca1905a6..8cedbef398a7 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -137,13 +137,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, default=4, help="If using GATConv, number of attention heads to use", ) -parser.add_argument( - "--cugraph_data_loader", - action='store_true', - help="Wether or not to use CuGraph for Neighbor Loading. 
\ - \nNote that this requires more GPU memory or \ - a reduction in batch_size/fan_out/hidden_channels/num_layers", -) + parser.add_argument( + "--cugraph_data_loader", + action='store_true', + help="Wether or not to use CuGraph for Neighbor Loading. \ + \nNote that this requires more GPU memory or \ + a reduction in batch_size/fan_out/hidden_channels/num_layers", + ) args = parser.parse_args() From 0b500ffa229c436ba924cf5342fa44a704004654 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 12 Oct 2023 12:04:54 -0700 Subject: [PATCH 044/197] cleanup --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 0bf52ab24a83..63a02ab42c7f 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -89,7 +89,9 @@ def create_local_process_group(num_workers_per_node): assert _LOCAL_PROCESS_GROUP is None world_size = dist.get_world_size() if dist.is_initialized() else 1 rank = dist.get_rank() if dist.is_initialized() else 0 - assert world_size % num_workers_per_node == 0 + assert world_size % num_workers_per_node == 0, \ + "world_size = " + str(world_size) + \ + "\nnum_workers_per_node = " + str(num_workers_per_node) num_nodes = world_size // num_workers_per_node node_rank = rank // num_workers_per_node From 4814f161f95ea3f2e9e7da3668847f0c8e7cafbc Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 12 Oct 2023 12:09:06 -0700 Subject: [PATCH 045/197] cleanup --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 63a02ab42c7f..7e587f422d03 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -89,9 +89,9 @@ def create_local_process_group(num_workers_per_node): assert _LOCAL_PROCESS_GROUP is None world_size = dist.get_world_size() if dist.is_initialized() else 1 rank = dist.get_rank() if dist.is_initialized() else 0 - assert world_size % num_workers_per_node == 0, \ - "world_size = " + str(world_size) + \ - "\nnum_workers_per_node = " + str(num_workers_per_node) + # assert world_size % num_workers_per_node == 0, \ + # "world_size = " + str(world_size) + \ + # "\nnum_workers_per_node = " + str(num_workers_per_node) num_nodes = world_size // num_workers_per_node node_rank = rank // num_workers_per_node From 774364c324c58b7092ab570781394ad7196d17ee Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 12 Oct 2023 12:11:43 -0700 Subject: [PATCH 046/197] cleanup --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 7e587f422d03..63a02ab42c7f 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -89,9 +89,9 @@ def create_local_process_group(num_workers_per_node): assert _LOCAL_PROCESS_GROUP is None world_size = dist.get_world_size() if dist.is_initialized() else 1 rank = dist.get_rank() if dist.is_initialized() else 0 - # assert world_size % num_workers_per_node == 0, \ - # "world_size = " + 
str(world_size) + \ - # "\nnum_workers_per_node = " + str(num_workers_per_node) + assert world_size % num_workers_per_node == 0, \ + "world_size = " + str(world_size) + \ + "\nnum_workers_per_node = " + str(num_workers_per_node) num_nodes = world_size // num_workers_per_node node_rank = rank // num_workers_per_node From bca59509960ec0bfcbf1c8021b63b33eec828c2a Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 12 Oct 2023 14:17:57 -0700 Subject: [PATCH 047/197] cleanup --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 2 +- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- examples/ogbn_papers_100m.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 63a02ab42c7f..ac926d437c62 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -162,7 +162,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, num_workers=num_work, **kwargs) eval_steps = 1000 - warmup_steps = 100 + warmup_steps = 20 acc = Accuracy(task="multiclass", num_classes=num_classes).to(device) if rank == 0: print("Beginning training...") diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 8cedbef398a7..66c4c2b7b1b6 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -74,7 +74,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, test_loader = NeighborLoader(data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) eval_steps = 1000 - warmup_steps = 100 + warmup_steps = 20 acc = Accuracy(task="multiclass", num_classes=num_classes).to(rank) if rank == 0: print("Beginning training...") diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index b86984922da7..b7cc0853d955 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -96,7 +96,7 @@ def get_num_workers() -> int: optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) -warmup_steps = 50 +warmup_steps = 20 def train(): From 59c95f704c7a7bec5c40ea4809b9af6031ba9aa0 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Fri, 13 Oct 2023 13:50:32 -0700 Subject: [PATCH 048/197] cleanup --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 4 ++-- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 4 ++-- examples/ogbn_papers_100m.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index ac926d437c62..26557f5b3124 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -141,7 +141,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, **kwargs) + shuffle=True, , drop_last=True, **kwargs) if rank == 0: eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], @@ -153,7 +153,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, from torch_geometric.loader import NeighborLoader num_work = pyg_num_work(world_size) train_loader = 
NeighborLoader(data, input_nodes=split_idx['train'], - num_workers=num_work, shuffle=True, + num_workers=num_work, shuffle=True, drop_last=True, **kwargs) if rank == 0: eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 66c4c2b7b1b6..52c56e1d6790 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -54,7 +54,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, **kwargs) + shuffle=True, , drop_last=True, **kwargs) if rank == 0: eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], @@ -66,7 +66,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, from torch_geometric.loader import NeighborLoader num_work = pyg_num_work(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], - num_workers=num_work, shuffle=True, + num_workers=num_work, shuffle=True, drop_last=True, **kwargs) if rank == 0: eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index b7cc0853d955..4db5ae445f72 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -66,7 +66,7 @@ def get_num_workers() -> int: cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, **kwargs) + shuffle=True, drop_last=True, **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) @@ -77,7 +77,7 @@ def get_num_workers() -> int: num_work = get_num_workers() NeighborLoader = torch_geometric.loader.NeighborLoader train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], - num_workers=num_work, shuffle=True, **kwargs) + num_workers=num_work, drop_last=True, shuffle=True, **kwargs) val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], From 1185241e6cbac8691eff347d2424e8bee7336d6b Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Fri, 13 Oct 2023 13:51:57 -0700 Subject: [PATCH 049/197] cleanup --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 2 +- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 26557f5b3124..056afa01b64c 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -141,7 +141,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, , drop_last=True, **kwargs) + shuffle=True, drop_last=True, **kwargs) if rank == 0: eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 52c56e1d6790..6d2d79c348fe 100644 --- 
a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -54,7 +54,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, , drop_last=True, **kwargs) + shuffle=True, drop_last=True, **kwargs) if rank == 0: eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], From 6087bf22bce10902d34f8557117935b99dbe54e4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Oct 2023 20:52:59 +0000 Subject: [PATCH 050/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 7 ++++--- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 7 ++++--- examples/ogbn_papers_100m.py | 6 ++++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 056afa01b64c..63ec93baa5fe 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -141,7 +141,8 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, drop_last=True, **kwargs) + shuffle=True, drop_last=True, + **kwargs) if rank == 0: eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], @@ -153,8 +154,8 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, from torch_geometric.loader import NeighborLoader num_work = pyg_num_work(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], - num_workers=num_work, shuffle=True, drop_last=True, - **kwargs) + num_workers=num_work, shuffle=True, + drop_last=True, **kwargs) if rank == 0: eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 6d2d79c348fe..84723a03dc91 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -54,7 +54,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, drop_last=True, **kwargs) + shuffle=True, drop_last=True, + **kwargs) if rank == 0: eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], @@ -66,8 +67,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, from torch_geometric.loader import NeighborLoader num_work = pyg_num_work(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], - num_workers=num_work, shuffle=True, drop_last=True, - **kwargs) + num_workers=num_work, shuffle=True, + drop_last=True, **kwargs) if rank == 0: eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 4db5ae445f72..5f2da272d52a 100644 --- 
a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -66,7 +66,8 @@ def get_num_workers() -> int: cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, drop_last=True, **kwargs) + shuffle=True, drop_last=True, + **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) @@ -77,7 +78,8 @@ def get_num_workers() -> int: num_work = get_num_workers() NeighborLoader = torch_geometric.loader.NeighborLoader train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], - num_workers=num_work, drop_last=True, shuffle=True, **kwargs) + num_workers=num_work, drop_last=True, + shuffle=True, **kwargs) val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], From 17bf9e7d173be1d54feaf6604db81587c256ed5d Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 19 Oct 2023 15:46:49 -0700 Subject: [PATCH 051/197] cleanup --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 3 +++ examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 4 +++- examples/ogbn_papers_100m.py | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 63ec93baa5fe..857daab705ba 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -241,6 +241,9 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, ) args = parser.parse_args() + if args.cugraph_data_loader: + from cugraph.testing.mg_utils import enable_spilling + enable_spilling() # setup multi node torch.distributed.init_process_group("nccl") nprocs = dist.get_world_size() diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 84723a03dc91..a7ba40cc3e97 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -147,7 +147,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, ) args = parser.parse_args() - + if args.cugraph_data_loader: + from cugraph.testing.mg_utils import enable_spilling + enable_spilling() dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() data = dataset[0] diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 5f2da272d52a..bd44dedf90ad 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -39,7 +39,9 @@ dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() - +if args.cugraph_data_loader: + from cugraph.testing.mg_utils import enable_spilling + enable_spilling() def get_num_workers() -> int: try: From 705de9afabbdf770841f5fb8397aaf6d42491d7e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Oct 2023 22:47:53 +0000 Subject: [PATCH 052/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index bd44dedf90ad..6bc7a93d783b 100644 --- a/examples/ogbn_papers_100m.py +++ 
b/examples/ogbn_papers_100m.py @@ -43,6 +43,7 @@ from cugraph.testing.mg_utils import enable_spilling enable_spilling() + def get_num_workers() -> int: try: return len(os.sched_getaffinity(0)) // 2 From d2bc07aef20e63e8e17df0535d8d720850f1c30e Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Fri, 20 Oct 2023 14:20:38 -0700 Subject: [PATCH 053/197] fixes from Cugraph-PyG lead Alexandria Barghi --- examples/ogbn_papers_100m.py | 37 +++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index bd44dedf90ad..db8546bcde48 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -43,6 +43,7 @@ from cugraph.testing.mg_utils import enable_spilling enable_spilling() + def get_num_workers() -> int: try: return len(os.sched_getaffinity(0)) // 2 @@ -57,6 +58,17 @@ def get_num_workers() -> int: # Set Up Neighbor Loading data = dataset[0] if args.cugraph_data_loader: + import rmm + import cupy + + rmm.reinitialize(devices=[0], pool_allocator=True, initial_pool_size=78e9, managed_memory=True) + + from rmm.allocators.torch import rmm_torch_allocator + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + from rmm.allocators.cupy import rmm_cupy_allocator + cupy.cuda.set_allocator(rmm_cupy_allocator) + import cugraph from cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import CuGraphNeighborLoader @@ -68,7 +80,7 @@ def get_num_workers() -> int: cugraph_store = CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, drop_last=True, + #shuffle=True, drop_last=True, **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], @@ -80,8 +92,8 @@ def get_num_workers() -> int: num_work = get_num_workers() NeighborLoader = torch_geometric.loader.NeighborLoader train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], - num_workers=num_work, drop_last=True, - shuffle=True, **kwargs) + num_workers=num_work, drop_last=False, + shuffle=False, **kwargs) val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], @@ -107,13 +119,17 @@ def train(): model.train() for i, batch in enumerate(train_loader): + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + if i >= warmup_steps: start_avg_time = time.perf_counter() start = time.perf_counter() batch = batch.to(device) optimizer.zero_grad() - out = model(batch.x, batch.edge_index)[:batch.batch_size] - y = batch.y[:batch.batch_size].view(-1).to(torch.long) + batch_size=batch.num_sampled_nodes[0] + out = model(batch.x, batch.edge_index)[:batch_size] + y = batch.y[:batch_size].view(-1).to(torch.long) loss = F.cross_entropy(out, y) loss.backward() optimizer.step() @@ -127,17 +143,20 @@ def train(): @torch.no_grad() -def test(loader: NeighborLoader, eval_steps: Optional[int] = None): +def test(loader, eval_steps: Optional[int] = None): model.eval() total_correct = total_examples = 0 for i, batch in enumerate(loader): if eval_steps is not None and i >= eval_steps: break + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() batch = batch.to(device) - out = model(batch.x, batch.edge_index)[:batch.batch_size] + batch_size=batch.num_sampled_nodes[0] + out = model(batch.x, batch.edge_index)[:batch_size] pred = out.argmax(dim=-1) - y = 
batch.y[:batch.batch_size].view(-1).to(torch.long) + y = batch.y[:batch_size].view(-1).to(torch.long) total_correct += int((pred == y).sum()) total_examples += y.size(0) @@ -151,4 +170,4 @@ def test(loader: NeighborLoader, eval_steps: Optional[int] = None): print(f'Val Acc: ~{val_acc:.4f}') test_acc = test(test_loader) -print(f'Test Acc: {test_acc:.4f}') +print(f'Test Acc: {test_acc:.4f}') \ No newline at end of file From ae8b489953b5647b2c9d16ee2d259c86e51771e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Oct 2023 21:23:18 +0000 Subject: [PATCH 054/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index db8546bcde48..a12b2d99a6cc 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -58,11 +58,12 @@ def get_num_workers() -> int: # Set Up Neighbor Loading data = dataset[0] if args.cugraph_data_loader: - import rmm import cupy - - rmm.reinitialize(devices=[0], pool_allocator=True, initial_pool_size=78e9, managed_memory=True) - + import rmm + + rmm.reinitialize(devices=[0], pool_allocator=True, initial_pool_size=78e9, + managed_memory=True) + from rmm.allocators.torch import rmm_torch_allocator torch.cuda.memory.change_current_allocator(rmm_torch_allocator) @@ -78,10 +79,11 @@ def get_num_workers() -> int: fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, G, N) - train_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['train'], - #shuffle=True, drop_last=True, - **kwargs) + train_loader = CuGraphNeighborLoader( + cugraph_store, + input_nodes=split_idx['train'], + #shuffle=True, drop_last=True, + **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) @@ -127,7 +129,7 @@ def train(): start = time.perf_counter() batch = batch.to(device) optimizer.zero_grad() - batch_size=batch.num_sampled_nodes[0] + batch_size = batch.num_sampled_nodes[0] out = model(batch.x, batch.edge_index)[:batch_size] y = batch.y[:batch_size].view(-1).to(torch.long) loss = F.cross_entropy(out, y) @@ -153,7 +155,7 @@ def test(loader, eval_steps: Optional[int] = None): if isinstance(batch, torch_geometric.data.HeteroData): batch = batch.to_homogeneous() batch = batch.to(device) - batch_size=batch.num_sampled_nodes[0] + batch_size = batch.num_sampled_nodes[0] out = model(batch.x, batch.edge_index)[:batch_size] pred = out.argmax(dim=-1) y = batch.y[:batch_size].view(-1).to(torch.long) @@ -170,4 +172,4 @@ def test(loader, eval_steps: Optional[int] = None): print(f'Val Acc: ~{val_acc:.4f}') test_acc = test(test_loader) -print(f'Test Acc: {test_acc:.4f}') \ No newline at end of file +print(f'Test Acc: {test_acc:.4f}') From dae116483e73a52b23c9ed982fe628e5cadfdb1d Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Mon, 23 Oct 2023 09:16:24 -0700 Subject: [PATCH 055/197] notes --- examples/ogbn_papers_100m.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index db8546bcde48..743311bee873 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -78,9 +78,14 @@ def get_num_workers() -> int: fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, 
G, N) + # Note that train dataloader SHOULD have shuffle and drop_last as True + # However this feature is not yet available in CuGraphNeighborLoader + # Coming early 2024 + # CuGraphNeighborLoader can produce huge speed ups but not shuffling + # can have negative impacts on val/test accuracy. train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - #shuffle=True, drop_last=True, + # shuffle=True, drop_last=True, **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], @@ -92,7 +97,7 @@ def get_num_workers() -> int: num_work = get_num_workers() NeighborLoader = torch_geometric.loader.NeighborLoader train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], - num_workers=num_work, drop_last=False, + num_workers=num_work, drop_last=True, shuffle=False, **kwargs) val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) From 61028fca32e3c87ad1d299f06530032f00cc1875 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 23 Oct 2023 15:24:24 -0700 Subject: [PATCH 056/197] cleaning --- examples/ogbn_papers_100m.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 208a026f1625..635203272350 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -87,7 +87,7 @@ def get_num_workers() -> int: train_loader = CuGraphNeighborLoader( cugraph_store, input_nodes=split_idx['train'], - #shuffle=True, drop_last=True, + # shuffle=True, drop_last=True, **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], From de7069f67dd016b8f2b7a756753af9c472128361 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Wed, 25 Oct 2023 09:24:45 -0700 Subject: [PATCH 057/197] fixing timer --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 2 +- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- examples/ogbn_papers_100m.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 857daab705ba..d4483cb30315 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -169,7 +169,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, print("Beginning training...") for epoch in range(epochs): for i, batch in enumerate(train_loader): - if i >= warmup_steps: + if i == warmup_steps: start = time.time() batch = batch.to(device) batch.y = batch.y.to(torch.long) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index a7ba40cc3e97..67e12d40a74a 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -81,7 +81,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, print("Beginning training...") for epoch in range(epochs): for i, batch in enumerate(train_loader): - if i >= warmup_steps: + if i == warmup_steps: start = time.time() batch = batch.to(rank) batch.y = batch.y.to(torch.long) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 635203272350..cdfc4861be63 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -124,12 +124,12 @@ def get_num_workers() -> int: def train(): model.train() - + start_timer = 
False for i, batch in enumerate(train_loader): if isinstance(batch, torch_geometric.data.HeteroData): batch = batch.to_homogeneous() - if i >= warmup_steps: + if i == warmup_steps: start_avg_time = time.perf_counter() start = time.perf_counter() batch = batch.to(device) From d7b0fd71f56dc3fdc0b89fd9c3ef7aa7e1c827a4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Oct 2023 23:14:14 +0000 Subject: [PATCH 058/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 90484c3479c4..a27ccf9c9c1e 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -175,6 +175,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, batch.y[:batch_size]) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=128) From 64f3afa83bfc6c6ef7b10430e4a454e3086130de Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 25 Oct 2023 16:17:04 -0700 Subject: [PATCH 059/197] cleaning --- examples/ogbn_papers_100m.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index afafa5012aef..caeb61cc8e5e 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -124,7 +124,6 @@ def get_num_workers() -> int: def train(): model.train() - start_timer = False for i, batch in enumerate(train_loader): if isinstance(batch, torch_geometric.data.HeteroData): batch = batch.to_homogeneous() From 795d8ecb2771fc1b256967934bd41427ad669dc0 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 25 Oct 2023 16:21:33 -0700 Subject: [PATCH 060/197] cleaning --- .../multi_gpu/multinode_multigpu_papers100m_gcn.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index a27ccf9c9c1e..2cf70f6d264c 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -34,9 +34,8 @@ import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel - -from torch_geometric.loader import NeighborLoader -from torch_geometric.nn import GCNConv +import torch_geometric +from torchmetrics import Accuracy def get_num_workers(world_size: int) -> int: @@ -108,9 +107,14 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, G, N) + # Note that train dataloader SHOULD have shuffle and drop_last as True. + # However, this feature is not yet available in CuGraphNeighborLoader. + # Coming early 2024. + # CuGraphNeighborLoader can produce huge speed ups but not shuffling + # can have negative impacts on val/test accuracy. 
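# The note above flags that CuGraphNeighborLoader cannot yet shuffle or drop
# the last batch. A minimal sketch of one possible workaround -- assuming
# split_idx['train'] is a 1-D tensor of node IDs, as in these examples, and
# not something CuGraphNeighborLoader itself provides -- is to permute the
# training indices manually each epoch and pass the result as input_nodes in
# the loader call below. This only randomizes batch composition across
# epochs; it does not emulate drop_last, so a short final batch may remain.
perm = torch.randperm(split_idx['train'].numel())
shuffled_train_idx = split_idx['train'][perm]  # illustrative only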
train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, drop_last=True, + # shuffle=True, drop_last=True, **kwargs) if rank == 0: eval_loader = CuGraphNeighborLoader(cugraph_store, From 4bcc31ea7ebfb770e2789dfd2456cc343e0a7387 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Oct 2023 23:22:27 +0000 Subject: [PATCH 061/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multi_gpu/multinode_multigpu_papers100m_gcn.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 2cf70f6d264c..8743e8a78720 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -34,9 +34,10 @@ import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel -import torch_geometric from torchmetrics import Accuracy +import torch_geometric + def get_num_workers(world_size: int) -> int: num_workers = None @@ -112,10 +113,11 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, # Coming early 2024. # CuGraphNeighborLoader can produce huge speed ups but not shuffling # can have negative impacts on val/test accuracy. - train_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['train'], - # shuffle=True, drop_last=True, - **kwargs) + train_loader = CuGraphNeighborLoader( + cugraph_store, + input_nodes=split_idx['train'], + # shuffle=True, drop_last=True, + **kwargs) if rank == 0: eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], From c5b89e74aea6372f8a7f1b53e4c343994e5b50c3 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 25 Oct 2023 17:15:50 -0700 Subject: [PATCH 062/197] eval on all ranks --- .../multinode_multigpu_papers100m_gcn.py | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 8743e8a78720..da9a56c35dc7 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -118,24 +118,22 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, input_nodes=split_idx['train'], # shuffle=True, drop_last=True, **kwargs) - if rank == 0: - eval_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['valid'], - **kwargs) - test_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['test'], - **kwargs) + eval_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['valid'], + **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['test'], + **kwargs) else: from torch_geometric.loader import NeighborLoader num_work = get_num_workers(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, drop_last=True, **kwargs) - if rank == 0: - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], + num_workers=num_work, 
**kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) eval_steps = 1000 warmup_steps = 20 @@ -156,30 +154,31 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, if rank == 0 and i % 10 == 0: print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + ", Loss: " + str(loss)) - if rank == 0: print("Average Training Iteration Time:", (time.time() - start) / (i - 10), "s/iter") - acc_sum = 0.0 - with torch.no_grad(): - for i, batch in enumerate(eval_loader): - if i >= eval_steps: - break - batch = batch.to(device) - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - if rank == 0: acc_sum = 0.0 with torch.no_grad(): - for i, batch in enumerate(test_loader): + for i, batch in enumerate(eval_loader): + if i >= eval_steps: + break batch = batch.to(device) batch.y = batch.y.to(torch.long) out = model(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) + print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + acc_sum = 0.0 + with torch.no_grad(): + for i, batch in enumerate(test_loader): + batch = batch.to(device) + batch.y = batch.y.to(torch.long) + out = model(batch.x, batch.edge_index) + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) + print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + if __name__ == '__main__': From bc1d74c5e990f784ff266bcc609794798c25a8b9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 26 Oct 2023 00:17:42 +0000 Subject: [PATCH 063/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index da9a56c35dc7..002f858bbc98 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -166,7 +166,8 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, out = model(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) + torch.distributed.all_reduce(acc_sum, + op=torch.distributed.ReduceOp.MEAN) print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) acc_sum = 0.0 with torch.no_grad(): @@ -176,9 +177,9 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, out = model(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) + torch.distributed.all_reduce(acc_sum, + op=torch.distributed.ReduceOp.MEAN) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - if __name__ == '__main__': From c86f852644b3455bdaea6ee8cf8264eed6c5465a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 25 Oct 2023 17:17:46 -0700 Subject: [PATCH 064/197] eval on all ranks --- 
.../singlenode_multigpu_papers100m_gcn.py | 56 +++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index f648a2884ac6..3c9980dd969f 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -56,24 +56,22 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, input_nodes=split_idx['train'], shuffle=True, drop_last=True, **kwargs) - if rank == 0: - eval_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['valid'], - **kwargs) - test_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['test'], - **kwargs) + eval_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['valid'], + **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['test'], + **kwargs) else: from torch_geometric.loader import NeighborLoader num_work = get_num_workers(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, drop_last=True, **kwargs) - if rank == 0: - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], + num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) eval_steps = 1000 warmup_steps = 20 acc = Accuracy(task="multiclass", num_classes=num_classes).to(rank) @@ -93,30 +91,30 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if rank == 0 and i % 10 == 0: print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + ", Loss: " + str(loss)) - if rank == 0: - print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") - acc_sum = 0.0 - with torch.no_grad(): - for i, batch in enumerate(eval_loader): - if i >= eval_steps: - break - batch = batch.to(rank) - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - if rank == 0: + print("Average Training Iteration Time:", + (time.time() - start) / (i - warmup_steps), "s/iter") acc_sum = 0.0 with torch.no_grad(): - for i, batch in enumerate(test_loader): + for i, batch in enumerate(eval_loader): + if i >= eval_steps: + break batch = batch.to(rank) batch.y = batch.y.to(torch.long) out = model(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) + print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%") + acc_sum = 0.0 + with torch.no_grad(): + for i, batch in enumerate(test_loader): + batch = batch.to(rank) + batch.y = batch.y.to(torch.long) + out = model(batch.x, batch.edge_index) + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) + print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%") if __name__ == '__main__': From 09a04a30b2a15d58b7abb21115d5b9617a95c209 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 26 Oct 2023 00:18:40 +0000 Subject: [PATCH 065/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 3c9980dd969f..e3085d307999 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -103,7 +103,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, out = model(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) + torch.distributed.all_reduce(acc_sum, + op=torch.distributed.ReduceOp.MEAN) print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%") acc_sum = 0.0 with torch.no_grad(): @@ -113,7 +114,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, out = model(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) + torch.distributed.all_reduce(acc_sum, + op=torch.distributed.ReduceOp.MEAN) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%") From 171e893df9e098a11c4bd158ab9baf5e84704162 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 26 Oct 2023 13:34:28 -0700 Subject: [PATCH 066/197] cleanup --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 002f858bbc98..41572598c08c 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -125,7 +125,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, input_nodes=split_idx['test'], **kwargs) else: - from torch_geometric.loader import NeighborLoader + NeighborLoader = torch_geometric.loader.NeighborLoader num_work = get_num_workers(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, From c911f3932e81dfe3cb178e422e9dc426e81d6020 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 26 Oct 2023 13:34:57 -0700 Subject: [PATCH 067/197] cleaning --- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index e3085d307999..21104b1520d5 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -63,7 +63,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, input_nodes=split_idx['test'], **kwargs) else: - from torch_geometric.loader import NeighborLoader + NeighborLoader = torch_geometric.loader.NeighborLoader num_work = get_num_workers(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, From 8849c7df6b35858d7b36f1257edfe5357b5c01b5 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 26 Oct 2023 13:36:17 -0700 Subject: [PATCH 
068/197] cleanup --- examples/ogbn_papers_100m.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index caeb61cc8e5e..c86e9fc17126 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -8,6 +8,7 @@ from ogb.nodeproppred import PygNodePropPredDataset import torch_geometric +from torch_geometric.loader import NeighborLoader parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=128) @@ -97,7 +98,6 @@ def get_num_workers() -> int: **kwargs) else: num_work = get_num_workers() - NeighborLoader = torch_geometric.loader.NeighborLoader train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], num_workers=num_work, drop_last=True, shuffle=False, **kwargs) From d54222198fbf53fa302f53492749b50be21ce509 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 26 Oct 2023 13:56:11 -0700 Subject: [PATCH 069/197] cleaning --- examples/ogbn_papers_100m.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index c86e9fc17126..d80d4ae25184 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -141,8 +141,7 @@ def train(): optimizer.step() if i % 10 == 0: - print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}, ' - f's/iter: {time.perf_counter() - start:.6f}') + print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') print(f'Average Training Iteration Time (s/iter): \ {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}') From 86958fb976fb2cfc971c2fe8fbeea0e497cc1d38 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 26 Oct 2023 14:41:45 -0700 Subject: [PATCH 070/197] cleaning --- examples/ogbn_papers_100m.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index d80d4ae25184..ab03d94963e1 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -130,7 +130,6 @@ def train(): if i == warmup_steps: start_avg_time = time.perf_counter() - start = time.perf_counter() batch = batch.to(device) optimizer.zero_grad() batch_size = batch.num_sampled_nodes[0] From 99b75113218b760736c233f52dc6c752b1dfe2e9 Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Thu, 26 Oct 2023 15:38:17 -0700 Subject: [PATCH 071/197] cleaning multinode example --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 41572598c08c..ac5cfe6e41f7 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -83,7 +83,6 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, local_group = get_local_process_group() loc_id = dist.get_rank(group=local_group) rank = torch.distributed.get_rank() - os.environ['NVSHMEM_SYMMETRIC_SIZE'] = "107374182400" if rank == 0: print("Data =", data) print('Using', nprocs, 'GPUs...') From 3e353f61f954dbdc5cdbcfa68b83e3a41130382f Mon Sep 17 00:00:00 2001 From: puririshi98 Date: Fri, 27 Oct 2023 16:53:43 -0700 Subject: [PATCH 072/197] cleaning multinode example --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 7 +++++-- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 6 +++++- examples/ogbn_papers_100m.py | 3 +++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git 
a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index ac5cfe6e41f7..2e8caee81853 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -179,7 +179,8 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - + if rank == 0: + print("Total Program Runtime =", round(time.perf_counter() - wall_clock_start, 2), "seconds") if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -215,6 +216,7 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, ) args = parser.parse_args() + wall_clock_start = time.perf_counter() if args.cugraph_data_loader: from cugraph.testing.mg_utils import enable_spilling enable_spilling() @@ -248,4 +250,5 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, dataset.num_classes) run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader) + args.cugraph_data_loader, wall_clock_start) + diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 21104b1520d5..9b9f6d184530 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -117,6 +117,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, torch.distributed.all_reduce(acc_sum, op=torch.distributed.ReduceOp.MEAN) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%") + if rank == 0: + print("Total Program Runtime =", round(time.perf_counter() - wall_clock_start, 2), "seconds") if __name__ == '__main__': @@ -147,6 +149,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, ) args = parser.parse_args() + wall_clock_start = time.perf_counter() if args.cugraph_data_loader: from cugraph.testing.mg_utils import enable_spilling enable_spilling() @@ -171,5 +174,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, mp.spawn( run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader), nprocs=world_size, + args.cugraph_data_loader, wall_clock_start), nprocs=world_size, join=True) + diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index ab03d94963e1..e7b3910524c6 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -36,6 +36,7 @@ a reduction in batch_size/fan_out/hidden_channels/num_layers", ) args = parser.parse_args() +wall_clock_start = time.perf_counter() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') dataset = PygNodePropPredDataset(name='ogbn-papers100M') @@ -175,3 +176,5 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): test_acc = test(test_loader) print(f'Test Acc: {test_acc:.4f}') +print("Total Program Runtime =", round(time.perf_counter() - wall_clock_start, 2), "seconds") + From 8b49b757a2009506df85d6467bd3f14ee0f3dc18 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 Oct 2023 23:54:44 +0000 Subject: [PATCH 073/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- 
examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 5 +++-- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 8 ++++---- examples/ogbn_papers_100m.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 2e8caee81853..1cb1c940392a 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -180,7 +180,9 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, op=torch.distributed.ReduceOp.MEAN) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) if rank == 0: - print("Total Program Runtime =", round(time.perf_counter() - wall_clock_start, 2), "seconds") + print("Total Program Runtime =", + round(time.perf_counter() - wall_clock_start, 2), "seconds") + if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -251,4 +253,3 @@ def run_train(device, data, world_size, ngpu_per_node, model, epochs, run_train(device, data, nprocs, args.ngpu_per_node, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader, wall_clock_start) - diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 9b9f6d184530..ebeb37c34750 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -118,7 +118,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, op=torch.distributed.ReduceOp.MEAN) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%") if rank == 0: - print("Total Program Runtime =", round(time.perf_counter() - wall_clock_start, 2), "seconds") + print("Total Program Runtime =", + round(time.perf_counter() - wall_clock_start, 2), "seconds") if __name__ == '__main__': @@ -174,6 +175,5 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, mp.spawn( run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start), nprocs=world_size, - join=True) - + args.cugraph_data_loader, wall_clock_start), + nprocs=world_size, join=True) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index e7b3910524c6..ac14fc07df11 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -176,5 +176,5 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): test_acc = test(test_loader) print(f'Test Acc: {test_acc:.4f}') -print("Total Program Runtime =", round(time.perf_counter() - wall_clock_start, 2), "seconds") - +print("Total Program Runtime =", + round(time.perf_counter() - wall_clock_start, 2), "seconds") From bfa98f9e7f3c8849864611d2c13b7904cbdd5f65 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 30 Oct 2023 14:33:22 -0700 Subject: [PATCH 074/197] fixes for single node multigpu cugraph --- .../singlenode_multigpu_papers100m_gcn.py | 347 +++++++++++++----- 1 file changed, 261 insertions(+), 86 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index ebeb37c34750..5ab0b7cb6a0c 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -1,37 +1,122 @@ import argparse import os -import time +import numpy as np -import 
torch -import torch.distributed as dist -import torch.multiprocessing as mp -import torch.nn.functional as F -from ogb.nodeproppred import PygNodePropPredDataset -from torch.nn.parallel import DistributedDataParallel -from torchmetrics import Accuracy +os.environ['CUDF_SPILL'] = '1' +os.environ['RAPIDS_NO_INITIALIZE'] = '1' -import torch_geometric +def start_dask_cluster(): + from cugraph.testing.mg_utils import enable_spilling + from dask_cuda import LocalCUDACluster -def get_num_workers(world_size: int) -> int: - num_workers = None + cluster = LocalCUDACluster( + protocol="tcp", + rmm_pool_size=None, + memory_limit=None, + ) + + from dask.distributed import Client + client = Client(cluster) + client.wait_for_workers(n_workers=len(cluster.workers)) + client.run(enable_spilling) + + print("Dask Cluster Setup Complete") + del client + return cluster + +def create_dask_client(scheduler_address): + from dask.distributed import Client, Lock + from cugraph.dask.comms import comms as Comms + + client = Client(scheduler_address) + lock = Lock('comms_init') + if lock.acquire(timeout=100): + try: + Comms.initialize(p2p=True) + finally: + lock.release() + else: + raise RuntimeError("Failed to acquire lock to initialize comms") + + return client + + +def shutdown_dask_client(client): + from cugraph.dask.comms import comms as Comms + Comms.destroy() + client.close() + + +def pyg_num_work(world_size): + num_work = None if hasattr(os, "sched_getaffinity"): try: - num_workers = len(os.sched_getaffinity(0)) // (2 * world_size) + num_work = len(os.sched_getaffinity(0)) / (2 * world_size) except Exception: pass - if num_workers is None: - num_workers = os.cpu_count() // (2 * world_size) - return num_workers + if num_work is None: + num_work = os.cpu_count() / (2 * world_size) + return int(num_work) -def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, - split_idx, num_classes, cugraph_data_loader): +def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): + if cugraph_data_loader: + import rmm + import cupy + import torch + + """ + rmm.reinitialize( + devices=[rank], + pool_allocator=False, + managed_memory=False + ) + """ + + #if rank == 0: + # from rmm.allocators.torch import rmm_torch_allocator + # torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + cupy.cuda.Device(rank).use() + from rmm.allocators.cupy import rmm_cupy_allocator + cupy.cuda.set_allocator(rmm_cupy_allocator) + + from cugraph.testing.mg_utils import enable_spilling + enable_spilling() + + torch.cuda.set_device(rank) + os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12355' - dist.init_process_group('nccl', rank=rank, world_size=world_size) - split_idx['train'] = split_idx['train'].split( - split_idx['train'].size(0) // world_size, dim=0)[rank].clone() + + import torch.distributed as dist + dist.init_process_group('nccl', rank=rank, world_size=world_size) + + +def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, + split_idx, num_classes, cugraph_data_loader, scheduler_address=None, tempdir=None): + import torch + from torch.nn.parallel import DistributedDataParallel + import torch.nn.functional as F + + import time + + from torchmetrics import Accuracy + + init_pytorch_worker( + rank, + world_size, + cugraph_data_loader=cugraph_data_loader, + ) + + if cugraph_data_loader: + print(f'creating dask client on rank {rank}') + client = create_dask_client(scheduler_address) + print(f'created dask client on rank {rank}') + else: + split_idx['train'] = 
split_idx['train'].split( + split_idx['train'].size(0) // world_size, dim=0)[rank].clone() model = model.to(rank) model = DistributedDataParallel(model, device_ids=[rank]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, @@ -45,84 +130,157 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if cugraph_data_loader: import cugraph from cugraph_pyg.data import CuGraphStore - from cugraph_pyg.loader import CuGraphNeighborLoader + from cugraph_pyg.loader import CuGraphNeighborLoader, BulkSampleLoader G = {("N", "E", "N"): data.edge_index} N = {"N": data.num_nodes} fs = cugraph.gnn.FeatureStore(backend="torch") fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") - cugraph_store = CuGraphStore(fs, G, N) - train_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['train'], - shuffle=True, drop_last=True, - **kwargs) - eval_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['valid'], - **kwargs) - test_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['test'], - **kwargs) + + from distributed import Event as Dask_Event + event = Dask_Event("cugraph_store_creation_event") + + import torch_geometric + import torch.distributed as dist + from torch.distributed.algorithms.join import Join + dist.barrier() + + if rank == 0: + print("Rank 0 creating its cugraph store and initializing distributed graph") + cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) + event.set() + print("Distributed graph initialization complete.") + else: + print(f"Rank {rank} waiting for distributed graph initialization") + if event.wait(timeout=1000): + print(f"Rank {rank} proceeding with store creation") + cugraph_store = CuGraphStore(fs, {k:len(v) for k,v in G.items()}, N, multi_gpu=False) + print(f"Rank {rank} created store") + + dist.barrier() + if rank == 0: + for epoch in range(epochs): + train_path=os.path.join(tempdir, f'samples_{epoch}') + os.mkdir(train_path) + # runs sampling for the training epoch + BulkSampleLoader( + cugraph_store, + cugraph_store, + input_nodes=split_idx['train'], + directory=train_path, + #shuffle=True, drop_last=True, + **kwargs + ) + + print('validation', len(split_idx['valid'])) + eval_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['valid'], + **kwargs) + test_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['test'], + **kwargs) + + dist.barrier() else: - NeighborLoader = torch_geometric.loader.NeighborLoader - num_work = get_num_workers(world_size) + from torch_geometric.loader import NeighborLoader + num_work = pyg_num_work(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, drop_last=True, **kwargs) - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) + + if rank == 0: + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], + num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) + + dist.barrier() eval_steps = 1000 warmup_steps = 20 acc = Accuracy(task="multiclass", num_classes=num_classes).to(rank) if rank == 0: print("Beginning training...") for epoch in range(epochs): - for i, batch in enumerate(train_loader): - if i == warmup_steps: - start = time.time() - batch = batch.to(rank) - batch.y = batch.y.to(torch.long) - optimizer.zero_grad() - 
out = model(batch.x, batch.edge_index) - loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) - loss.backward() - optimizer.step() - if rank == 0 and i % 10 == 0: - print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + - ", Loss: " + str(loss)) - print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") - acc_sum = 0.0 - with torch.no_grad(): - for i, batch in enumerate(eval_loader): - if i >= eval_steps: - break + if cugraph_data_loader: + train_path = os.path.join(tempdir, f'samples_{epoch}') + + input_files=np.array_split( + np.array(os.listdir(train_path)), + world_size + )[rank] + + train_loader = BulkSampleLoader( + cugraph_store, + cugraph_store, + directory=train_path, + input_files=input_files + ) + with Join([model]): + for i, batch in enumerate(train_loader): + if i >= warmup_steps: + start = time.time() batch = batch.to(rank) + + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch_size = batch.num_sampled_nodes[0] + batch.y = batch.y.to(torch.long) + optimizer.zero_grad() out = model(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - torch.distributed.all_reduce(acc_sum, - op=torch.distributed.ReduceOp.MEAN) - print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%") - acc_sum = 0.0 - with torch.no_grad(): - for i, batch in enumerate(test_loader): - batch = batch.to(rank) - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - torch.distributed.all_reduce(acc_sum, - op=torch.distributed.ReduceOp.MEAN) - print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%") - if rank == 0: - print("Total Program Runtime =", - round(time.perf_counter() - wall_clock_start, 2), "seconds") + loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) + loss.backward() + optimizer.step() + if rank == 0 and i % 10 == 0: + print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + + ", Loss: " + str(loss)) + dist.barrier() + with Join([model]): + if rank == 0: + print("Average Training Iteration Time:", + (time.time() - start) / (i - warmup_steps), "s/iter") + acc_sum = 0.0 + with torch.no_grad(): + for i, batch in enumerate(eval_loader): + if i >= eval_steps: + break + + batch = batch.to(rank) + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch_size = batch.num_sampled_nodes[0] + + batch.y = batch.y.to(torch.long) + out = model.module(batch.x, batch.edge_index) + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + dist.barrier() + + with Join([model]): + if rank == 0: + acc_sum = 0.0 + with torch.no_grad(): + for i, batch in enumerate(test_loader): + batch = batch.to(rank) + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch_size = batch.num_sampled_nodes[0] + + batch.y = batch.y.to(torch.long) + out = model.module(batch.x, batch.edge_index) + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + dist.barrier() + + if cugraph_data_loader: + shutdown_dask_client(client) + dist.barrier() if __name__ == '__main__': + parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=128) parser.add_argument('--num_layers', type=int, default=2) @@ -150,10 +308,15 @@ def run_train(rank, 
data, world_size, model, epochs, batch_size, fan_out, ) args = parser.parse_args() - wall_clock_start = time.perf_counter() - if args.cugraph_data_loader: - from cugraph.testing.mg_utils import enable_spilling - enable_spilling() + + cluster = start_dask_cluster() if args.cugraph_data_loader else None + + import torch + import torch.multiprocessing as mp + import torch_geometric + + from ogb.nodeproppred import PygNodePropPredDataset + dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() data = dataset[0] @@ -169,11 +332,23 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, args.hidden_channels, args.num_layers, dataset.num_classes) + print("Data =", data) world_size = torch.cuda.device_count() print('Let\'s use', world_size, 'GPUs!') - mp.spawn( - run_train, args=(data, world_size, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start), - nprocs=world_size, join=True) + + import tempfile + with tempfile.TemporaryDirectory() as tempdir: + mp.spawn( + run_train, + args=(data, world_size, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes, + args.cugraph_data_loader, + None if cluster is None else cluster.scheduler_address, + tempdir + ), + nprocs=world_size, + join=True) + + if cluster is not None: + cluster.close() From 9cc4d8e62c5418c110a7b549d75cac68aab8cc4b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 21:34:16 +0000 Subject: [PATCH 075/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../singlenode_multigpu_papers100m_gcn.py | 90 +++++++++---------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 5ab0b7cb6a0c..fb01a731bca6 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -1,13 +1,14 @@ import argparse import os + import numpy as np os.environ['CUDF_SPILL'] = '1' os.environ['RAPIDS_NO_INITIALIZE'] = '1' + def start_dask_cluster(): from cugraph.testing.mg_utils import enable_spilling - from dask_cuda import LocalCUDACluster cluster = LocalCUDACluster( @@ -25,9 +26,10 @@ def start_dask_cluster(): del client return cluster + def create_dask_client(scheduler_address): - from dask.distributed import Client, Lock from cugraph.dask.comms import comms as Comms + from dask.distributed import Client, Lock client = Client(scheduler_address) lock = Lock('comms_init') @@ -62,10 +64,9 @@ def pyg_num_work(world_size): def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): if cugraph_data_loader: - import rmm import cupy + import rmm import torch - """ rmm.reinitialize( devices=[rank], @@ -91,17 +92,17 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): os.environ['MASTER_PORT'] = '12355' import torch.distributed as dist - dist.init_process_group('nccl', rank=rank, world_size=world_size) + dist.init_process_group('nccl', rank=rank, world_size=world_size) def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, - split_idx, num_classes, cugraph_data_loader, scheduler_address=None, tempdir=None): - import torch - from torch.nn.parallel import DistributedDataParallel - import torch.nn.functional as F - + split_idx, 
num_classes, cugraph_data_loader, + scheduler_address=None, tempdir=None): import time + import torch + import torch.nn.functional as F + from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy init_pytorch_worker( @@ -130,7 +131,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if cugraph_data_loader: import cugraph from cugraph_pyg.data import CuGraphStore - from cugraph_pyg.loader import CuGraphNeighborLoader, BulkSampleLoader + from cugraph_pyg.loader import BulkSampleLoader, CuGraphNeighborLoader G = {("N", "E", "N"): data.edge_index} N = {"N": data.num_nodes} fs = cugraph.gnn.FeatureStore(backend="torch") @@ -140,13 +141,16 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, from distributed import Event as Dask_Event event = Dask_Event("cugraph_store_creation_event") - import torch_geometric import torch.distributed as dist from torch.distributed.algorithms.join import Join + + import torch_geometric dist.barrier() if rank == 0: - print("Rank 0 creating its cugraph store and initializing distributed graph") + print( + "Rank 0 creating its cugraph store and initializing distributed graph" + ) cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) event.set() print("Distributed graph initialization complete.") @@ -154,13 +158,16 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, print(f"Rank {rank} waiting for distributed graph initialization") if event.wait(timeout=1000): print(f"Rank {rank} proceeding with store creation") - cugraph_store = CuGraphStore(fs, {k:len(v) for k,v in G.items()}, N, multi_gpu=False) + cugraph_store = CuGraphStore(fs, { + k: len(v) + for k, v in G.items() + }, N, multi_gpu=False) print(f"Rank {rank} created store") dist.barrier() if rank == 0: for epoch in range(epochs): - train_path=os.path.join(tempdir, f'samples_{epoch}') + train_path = os.path.join(tempdir, f'samples_{epoch}') os.mkdir(train_path) # runs sampling for the training epoch BulkSampleLoader( @@ -169,8 +176,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, input_nodes=split_idx['train'], directory=train_path, #shuffle=True, drop_last=True, - **kwargs - ) + **kwargs) print('validation', len(split_idx['valid'])) eval_loader = CuGraphNeighborLoader(cugraph_store, @@ -179,7 +185,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, test_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['test'], **kwargs) - + dist.barrier() else: from torch_geometric.loader import NeighborLoader @@ -204,17 +210,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if cugraph_data_loader: train_path = os.path.join(tempdir, f'samples_{epoch}') - input_files=np.array_split( - np.array(os.listdir(train_path)), - world_size - )[rank] + input_files = np.array_split(np.array(os.listdir(train_path)), + world_size)[rank] - train_loader = BulkSampleLoader( - cugraph_store, - cugraph_store, - directory=train_path, - input_files=input_files - ) + train_loader = BulkSampleLoader(cugraph_store, cugraph_store, + directory=train_path, + input_files=input_files) with Join([model]): for i, batch in enumerate(train_loader): if i >= warmup_steps: @@ -233,18 +234,18 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, optimizer.step() if rank == 0 and i % 10 == 0: print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + - ", Loss: " + str(loss)) + ", Loss: " + str(loss)) dist.barrier() with 
Join([model]): if rank == 0: print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") + (time.time() - start) / (i - warmup_steps), "s/iter") acc_sum = 0.0 with torch.no_grad(): for i, batch in enumerate(eval_loader): if i >= eval_steps: break - + batch = batch.to(rank) if isinstance(batch, torch_geometric.data.HeteroData): batch = batch.to_homogeneous() @@ -253,10 +254,10 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) dist.barrier() - + with Join([model]): if rank == 0: acc_sum = 0.0 @@ -270,10 +271,10 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) dist.barrier() - + if cugraph_data_loader: shutdown_dask_client(client) dist.barrier() @@ -313,10 +314,10 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, import torch import torch.multiprocessing as mp - import torch_geometric - from ogb.nodeproppred import PygNodePropPredDataset + import torch_geometric + dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() data = dataset[0] @@ -340,15 +341,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, import tempfile with tempfile.TemporaryDirectory() as tempdir: mp.spawn( - run_train, + run_train, args=(data, world_size, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, - None if cluster is None else cluster.scheduler_address, - tempdir - ), - nprocs=world_size, - join=True) - + args.fan_out, split_idx, dataset.num_classes, + args.cugraph_data_loader, + None if cluster is None else cluster.scheduler_address, + tempdir), nprocs=world_size, join=True) + if cluster is not None: cluster.close() From bde8c82eb1e9c8dcc231d8a90033933f00764b3a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 30 Oct 2023 16:36:55 -0700 Subject: [PATCH 076/197] cleaning up the code shared by cugraph team --- .../multi_gpu/singlenode_multigpu_papers100m_gcn.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index fb01a731bca6..4903df579422 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -2,7 +2,12 @@ import os import numpy as np +import torch +import torch.multiprocessing as mp +from ogb.nodeproppred import PygNodePropPredDataset +import torch_geometric +import torch.distributed as dist os.environ['CUDF_SPILL'] = '1' os.environ['RAPIDS_NO_INITIALIZE'] = '1' @@ -90,8 +95,6 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12355' - - import torch.distributed as dist dist.init_process_group('nccl', rank=rank, world_size=world_size) @@ -312,12 +315,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, cluster = start_dask_cluster() if args.cugraph_data_loader else None - 
import torch - import torch.multiprocessing as mp - from ogb.nodeproppred import PygNodePropPredDataset - - import torch_geometric - dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() data = dataset[0] From 579d47e99dacb80b8731df604c2d6fcb29048b20 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 23:37:49 +0000 Subject: [PATCH 077/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 4903df579422..23763faf02e6 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -3,11 +3,12 @@ import numpy as np import torch +import torch.distributed as dist import torch.multiprocessing as mp from ogb.nodeproppred import PygNodePropPredDataset import torch_geometric -import torch.distributed as dist + os.environ['CUDF_SPILL'] = '1' os.environ['RAPIDS_NO_INITIALIZE'] = '1' From ac1df3c03dca97012b16c4775bdae8fc7bdf46af Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 30 Oct 2023 16:58:19 -0700 Subject: [PATCH 078/197] doesnt work... --- .../singlenode_multigpu_papers100m_gcn.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 23763faf02e6..fa55182c22dc 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -4,10 +4,17 @@ import numpy as np import torch import torch.distributed as dist +from torch.distributed.algorithms.join import Join import torch.multiprocessing as mp from ogb.nodeproppred import PygNodePropPredDataset +import time +import torch +import torch.nn.functional as F +from torch.nn.parallel import DistributedDataParallel +from torchmetrics import Accuracy import torch_geometric +import tempfile os.environ['CUDF_SPILL'] = '1' os.environ['RAPIDS_NO_INITIALIZE'] = '1' @@ -72,7 +79,6 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): if cugraph_data_loader: import cupy import rmm - import torch """ rmm.reinitialize( devices=[rank], @@ -102,13 +108,6 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx, num_classes, cugraph_data_loader, scheduler_address=None, tempdir=None): - import time - - import torch - import torch.nn.functional as F - from torch.nn.parallel import DistributedDataParallel - from torchmetrics import Accuracy - init_pytorch_worker( rank, world_size, @@ -144,11 +143,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, from distributed import Event as Dask_Event event = Dask_Event("cugraph_store_creation_event") - - import torch.distributed as dist - from torch.distributed.algorithms.join import Join - - import torch_geometric dist.barrier() if rank == 0: @@ -336,7 +330,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, world_size = torch.cuda.device_count() print('Let\'s use', world_size, 'GPUs!') - import tempfile with tempfile.TemporaryDirectory() as tempdir: 
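The launch code in this revision reduces to a small reusable pattern: the parent process creates one temporary directory, then starts one worker per visible GPU with `torch.multiprocessing.spawn`, so every rank joins the same NCCL group and sees the same staging directory for pre-sampled batches. The sketch below is a minimal standalone version of that pattern; `worker` and `staging_dir` are illustrative names, not identifiers from the example script.

import os
import tempfile

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank: int, world_size: int, staging_dir: str):
    # Placeholder worker: bind this process to one GPU and join the NCCL
    # process group; training would run here.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    torch.cuda.set_device(rank)
    dist.init_process_group('nccl', rank=rank, world_size=world_size)
    # `staging_dir` is the same path on every rank, so files written by one
    # rank can be read back by the others.
    dist.barrier()
    dist.destroy_process_group()


if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    with tempfile.TemporaryDirectory() as staging_dir:
        mp.spawn(worker, args=(world_size, staging_dir), nprocs=world_size,
                 join=True)

Keeping `mp.spawn(..., join=True)` inside the `TemporaryDirectory` context guarantees the staged samples are only cleaned up after every worker has exited.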
mp.spawn( run_train, From 5097b7e20b9bc4e9873818232af7cd419d63d26d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 23:59:14 +0000 Subject: [PATCH 079/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multi_gpu/singlenode_multigpu_papers100m_gcn.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index fa55182c22dc..2396b5322303 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -1,20 +1,19 @@ import argparse import os +import tempfile +import time import numpy as np import torch import torch.distributed as dist -from torch.distributed.algorithms.join import Join import torch.multiprocessing as mp -from ogb.nodeproppred import PygNodePropPredDataset -import time - -import torch import torch.nn.functional as F +from ogb.nodeproppred import PygNodePropPredDataset +from torch.distributed.algorithms.join import Join from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy + import torch_geometric -import tempfile os.environ['CUDF_SPILL'] = '1' os.environ['RAPIDS_NO_INITIALIZE'] = '1' From 700713f668aa9a5ba14123bb44ca3c270edb0012 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 31 Oct 2023 10:47:03 -0700 Subject: [PATCH 080/197] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index afd395909c10..21e1dba497f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
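For context on the `all_reduce` calls re-wrapped in the hunks above: each rank evaluates only its own shard of validation and test batches, so the per-rank accuracy has to be combined across the process group before it is printed. Below is a minimal sketch of that reduction using a sum followed by division by the world size, which matches an average whenever every rank contributes the same number of batches; the `gloo` backend and all names are assumptions chosen so the sketch runs on CPU, not details taken from the example.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def evaluate_and_reduce(rank: int, world_size: int):
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '29501')
    dist.init_process_group('gloo', rank=rank, world_size=world_size)

    # Stand-in for the accuracy this rank measured on its shard of batches.
    local_accuracy = torch.tensor([0.40 + 0.01 * rank])

    # Sum across ranks, then divide: an average of the per-rank values.
    dist.all_reduce(local_accuracy, op=dist.ReduceOp.SUM)
    global_accuracy = local_accuracy / world_size
    if rank == 0:
        print(f'Validation Accuracy: {global_accuracy.item() * 100.0:.4f}%')

    dist.destroy_process_group()


if __name__ == '__main__':
    world_size = 2
    mp.spawn(evaluate_and_reduce, args=(world_size, ), nprocs=world_size,
             join=True)

The single-node script additionally wraps its loops in `Join([model])`, which keeps the DDP collectives from hanging when ranks run out of batches at slightly different times.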
## [2.5.0] - 2023-MM-DD ### Added - +- Upgrades for multinode-multigpu example ([#8173](https://github.com/pyg-team/pytorch_geometric/pull/8173)) - Added a tutorial for multi-node multi-GPU training with pure PyTorch ([#8071](https://github.com/pyg-team/pytorch_geometric/pull/8071)) - Added a multinode-multigpu example on `ogbn-papers100M` ([#8070](https://github.com/pyg-team/pytorch_geometric/pull/8070)) - Added support for `to_hetero_with_bases` on static graphs ([#8247](https://github.com/pyg-team/pytorch_geometric/pull/8247)) From 8722ccee724dcedef3fed54fe2d8779b7aea74de Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 17:49:54 +0000 Subject: [PATCH 081/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multi_gpu/multinode_multigpu_papers100m_gcn.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 66e315195ba2..bf731d825997 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -29,9 +29,9 @@ def get_num_workers(world_size: int) -> int: num_workers = os.cpu_count() // (2 * world_size) return num_workers -def run_train(data, world_size, model, epochs, - batch_size, fan_out, split_idx, num_classes, - cugraph_data_loader): + +def run_train(data, world_size, model, epochs, batch_size, fan_out, split_idx, + num_classes, cugraph_data_loader): local_id = int(os.environ['LOCAL_RANK']) rank = torch.distributed.get_rank() torch.cuda.set_device(local_id) @@ -189,6 +189,6 @@ def run_train(data, world_size, model, epochs, args.hidden_channels, args.num_layers, dataset.num_classes) - run_train(data, nprocs, model, args.epochs, - args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start) + run_train(data, nprocs, model, args.epochs, args.batch_size, args.fan_out, + split_idx, dataset.num_classes, args.cugraph_data_loader, + wall_clock_start) From 35b559734dd14c924f9b93c1ce412fbd2e7a217a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 31 Oct 2023 11:06:01 -0700 Subject: [PATCH 082/197] cleaning --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index bf731d825997..1d67a0663b7e 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -7,6 +7,7 @@ """ import os import time +import argparse import torch import torch.distributed as dist From d7d7812d9b29d0f2775fcf7d58514349eecdffd0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 18:06:56 +0000 Subject: [PATCH 083/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 1d67a0663b7e..8b5f75c45be7 100644 --- 
a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -5,9 +5,9 @@ --container-mounts=/ogb-papers100m/:/workspace/dataset python3 path_to_script.py """ +import argparse import os import time -import argparse import torch import torch.distributed as dist From 5662921f7eb727b65dc4f9cbd98e49c3c2b850ad Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 31 Oct 2023 11:12:11 -0700 Subject: [PATCH 084/197] cleaning SNMG example --- .../singlenode_multigpu_papers100m_gcn.py | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 2396b5322303..4cb09eeb78c1 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -77,19 +77,6 @@ def pyg_num_work(world_size): def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): if cugraph_data_loader: import cupy - import rmm - """ - rmm.reinitialize( - devices=[rank], - pool_allocator=False, - managed_memory=False - ) - """ - - #if rank == 0: - # from rmm.allocators.torch import rmm_torch_allocator - # torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - cupy.cuda.Device(rank).use() from rmm.allocators.cupy import rmm_cupy_allocator cupy.cuda.set_allocator(rmm_cupy_allocator) @@ -146,7 +133,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if rank == 0: print( - "Rank 0 creating its cugraph store and initializing distributed graph" + "Rank 0 creating its cugraph store and \ + initializing distributed graph" ) cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) event.set() @@ -163,16 +151,22 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, dist.barrier() if rank == 0: + for epoch in range(epochs): train_path = os.path.join(tempdir, f'samples_{epoch}') os.mkdir(train_path) - # runs sampling for the training epoch + # Runs sampling for the training epoch. + # Note that train dataloader SHOULD have shuffle and drop_last as True. + # However, this feature is not yet available. + # Coming early 2024. + # CuGraph can produce huge speed ups but not shuffling + # can have negative impacts on val/test accuracy. 
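The `init_pytorch_worker` hunk above trims the per-rank RAPIDS setup to its essentials: pin CuPy to this rank's GPU, route CuPy allocations through RMM, enable spilling, and only then join the NCCL process group. Condensed into one helper using the same calls that appear in the diff (an illustrative sketch that assumes the RAPIDS packages are installed):

import os


def init_rapids_worker(rank: int, world_size: int) -> None:
    # Imports stay inside the function so the sketch only needs RAPIDS when
    # the cuGraph loader path is actually taken.
    import cupy
    import torch
    import torch.distributed as dist
    from cugraph.testing.mg_utils import enable_spilling
    from rmm.allocators.cupy import rmm_cupy_allocator

    cupy.cuda.Device(rank).use()                 # pin CuPy to this rank's GPU
    cupy.cuda.set_allocator(rmm_cupy_allocator)  # CuPy memory comes from RMM
    enable_spilling()                            # allow cuDF to spill to host

    torch.cuda.set_device(rank)                  # pin PyTorch to the same GPU
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    dist.init_process_group('nccl', rank=rank, world_size=world_size)

The ordering is deliberate: the allocator swap only affects CuPy allocations made after the call, and binding each process to its own device before `init_process_group` helps avoid every rank initializing NCCL against GPU 0.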
BulkSampleLoader( cugraph_store, cugraph_store, input_nodes=split_idx['train'], directory=train_path, - #shuffle=True, drop_last=True, + # shuffle=True, drop_last=True, **kwargs) print('validation', len(split_idx['valid'])) From 48094f92c09075f1413fb1c546bf07f610992009 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 18:13:05 +0000 Subject: [PATCH 085/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 4cb09eeb78c1..8f60f3b6487b 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -132,10 +132,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, dist.barrier() if rank == 0: - print( - "Rank 0 creating its cugraph store and \ - initializing distributed graph" - ) + print("Rank 0 creating its cugraph store and \ + initializing distributed graph") cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) event.set() print("Distributed graph initialization complete.") @@ -151,7 +149,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, dist.barrier() if rank == 0: - + for epoch in range(epochs): train_path = os.path.join(tempdir, f'samples_{epoch}') os.mkdir(train_path) From 7b701ca470dacb1e611a89d3166dbe6696cc9e5e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 1 Nov 2023 08:55:31 -0700 Subject: [PATCH 086/197] cleaning --- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 8f60f3b6487b..32711c41d773 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -154,7 +154,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, train_path = os.path.join(tempdir, f'samples_{epoch}') os.mkdir(train_path) # Runs sampling for the training epoch. - # Note that train dataloader SHOULD have shuffle and drop_last as True. + # Note that train dataloader SHOULD have shuffle + # and drop_last as True. # However, this feature is not yet available. # Coming early 2024. 
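These comments record a real limitation at the time: the bulk sampler could not yet shuffle, and later patches in this series simply switch `shuffle=True` and `drop_last=True` back on once it could. In the meantime, one possible workaround (not what the example does) is to permute the training seed nodes on the host each epoch before handing them to the sampler:

import torch


def shuffled_seeds(train_idx: torch.Tensor, epoch: int) -> torch.Tensor:
    # Re-permute the training seed nodes every epoch so the pre-sampled
    # batches differ between epochs even without loader-side shuffling.
    generator = torch.Generator().manual_seed(epoch)
    perm = torch.randperm(train_idx.numel(), generator=generator)
    return train_idx[perm]


# Toy check: the permutation for epoch 0 of a ten-node index tensor.
print(shuffled_seeds(torch.arange(10), epoch=0))

Seeding the permutation with the epoch number keeps every rank's view of the seed ordering identical without any extra communication.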
# CuGraph can produce huge speed ups but not shuffling From f5ebbc4d787dfc2a3289c660c25d34aaa443d603 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 13 Nov 2023 10:12:18 -0800 Subject: [PATCH 087/197] fixing worldsize issue on get_num_workers --- examples/multi_gpu/multinode_multigpu_papers100m_gcn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py index 8b5f75c45be7..648746cd9a11 100644 --- a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py @@ -19,15 +19,15 @@ import torch_geometric -def get_num_workers(world_size: int) -> int: +def get_num_workers() -> int: num_workers = None if hasattr(os, "sched_getaffinity"): try: - num_workers = len(os.sched_getaffinity(0)) // (2 * world_size) + num_workers = len(os.sched_getaffinity(0)) // 2 except Exception: pass if num_workers is None: - num_workers = os.cpu_count() // (2 * world_size) + num_workers = os.cpu_count() // 2 return num_workers @@ -79,7 +79,7 @@ def run_train(data, world_size, model, epochs, batch_size, fan_out, split_idx, **kwargs) else: NeighborLoader = torch_geometric.loader.NeighborLoader - num_work = get_num_workers(world_size) + num_work = get_num_workers() train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, drop_last=True, **kwargs) From d68359760e784177c60782a148802d54d2d926a2 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 14 Nov 2023 11:57:28 -0800 Subject: [PATCH 088/197] SNMG needed a timer --- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 32711c41d773..63c8b61628a3 100644 --- a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -263,6 +263,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + if rank == 0: + print("Total Program Runtime =", + round(time.perf_counter() - wall_clock_start, 2), "seconds") dist.barrier() if cugraph_data_loader: @@ -299,7 +302,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, ) args = parser.parse_args() - + wall_clock_start = time.perf_counter() cluster = start_dask_cluster() if args.cugraph_data_loader else None dataset = PygNodePropPredDataset(name='ogbn-papers100M') @@ -329,7 +332,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader, None if cluster is None else cluster.scheduler_address, - tempdir), nprocs=world_size, join=True) + tempdir, wall_clock_start), nprocs=world_size, join=True) if cluster is not None: cluster.close() From 29f4abe9a20ab448aa4038a468221d9a24ec2472 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 14 Nov 2023 12:03:10 -0800 Subject: [PATCH 089/197] fixing --- examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py index 63c8b61628a3..6add3938e608 100644 --- 
a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py +++ b/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py @@ -92,7 +92,7 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, - split_idx, num_classes, cugraph_data_loader, + split_idx, num_classes, cugraph_data_loader, wall_clock_start, scheduler_address=None, tempdir=None): init_pytorch_worker( rank, @@ -330,9 +330,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, + args.cugraph_data_loader, wall_clock_start, None if cluster is None else cluster.scheduler_address, - tempdir, wall_clock_start), nprocs=world_size, join=True) + tempdir), nprocs=world_size, join=True) if cluster is not None: cluster.close() From 89f6535a104bdb0da6157569e0c5e845d61d982a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 7 Dec 2023 16:08:13 -0800 Subject: [PATCH 090/197] renaming to match mag240m PR --- ...ode_multigpu_papers100m_gcn.py => papers100m_gcn_multinode.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/multi_gpu/{multinode_multigpu_papers100m_gcn.py => papers100m_gcn_multinode.py} (100%) diff --git a/examples/multi_gpu/multinode_multigpu_papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn_multinode.py similarity index 100% rename from examples/multi_gpu/multinode_multigpu_papers100m_gcn.py rename to examples/multi_gpu/papers100m_gcn_multinode.py From 5d4c54ae1372e38958f2c6aff92e099c6a961072 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 8 Dec 2023 11:45:55 -0800 Subject: [PATCH 091/197] renaming --- .../{singlenode_multigpu_papers100m_gcn.py => papers100m_gcn.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/multi_gpu/{singlenode_multigpu_papers100m_gcn.py => papers100m_gcn.py} (100%) diff --git a/examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py similarity index 100% rename from examples/multi_gpu/singlenode_multigpu_papers100m_gcn.py rename to examples/multi_gpu/papers100m_gcn.py From c7bf8dfb1600183adb27a471933f751ea184a429 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 8 Dec 2023 12:23:07 -0800 Subject: [PATCH 092/197] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a5c0391e627..47128183105a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,7 +80,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
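Patches 088 and 089 thread a wall-clock start time through to the workers and then fix where it sits in the argument tuple: `mp.spawn` forwards `args` positionally after the rank, so the timestamp has to occupy the slot that matches the corresponding parameter of the target function. A minimal CPU-only sketch of that timing pattern, with illustrative names:

import time

import torch.multiprocessing as mp


def worker(rank: int, world_size: int, wall_clock_start: float) -> None:
    # Stand-in for training plus evaluation on this rank.
    time.sleep(0.1)
    if rank == 0:
        print("Total Program Runtime =",
              round(time.perf_counter() - wall_clock_start, 2), "seconds")


if __name__ == '__main__':
    wall_clock_start = time.perf_counter()
    world_size = 2
    # The order inside `args` must mirror the worker's parameters after rank.
    mp.spawn(worker, args=(world_size, wall_clock_start), nprocs=world_size,
             join=True)

Only rank 0 reports, which matches how the examples print their summary lines.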
### Added -- Added multinode-multigpu Papers100m GCN example ([#8070](https://github.com/pyg-team/pytorch_geometric/pull/8070)) +- Add the `ogc` method as example ([#8168](https://github.com/pyg-team/pytorch_geometric/pull/8168)) - Added a tutorial on `NeighborLoader` ([#7931](https://github.com/pyg-team/pytorch_geometric/pull/7931)) - Added the option to override usage of `segment_matmul`/`grouped_matmul` via the `torch_geometric.backend.use_segment_matmul` flag ([#8148](https://github.com/pyg-team/pytorch_geometric/pull/8148)) - Added support for PyTorch 2.1.0 ([#8134](https://github.com/pyg-team/pytorch_geometric/pull/8134)) From 887b0a5deca436f23bc7eb5fb2863047d357d81d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:23:20 +0000 Subject: [PATCH 093/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 3d13087a2e10..6add3938e608 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -335,4 +335,4 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, tempdir), nprocs=world_size, join=True) if cluster is not None: - cluster.close() \ No newline at end of file + cluster.close() From 9f3c3f551d0c920fc956f7f9fc5b39d5837585ef Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 11 Dec 2023 14:35:32 -0800 Subject: [PATCH 094/197] adding drop_last=True and shuffle=True --- examples/multi_gpu/papers100m_gcn.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 6add3938e608..cd0b6f54e406 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -153,19 +153,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, for epoch in range(epochs): train_path = os.path.join(tempdir, f'samples_{epoch}') os.mkdir(train_path) - # Runs sampling for the training epoch. - # Note that train dataloader SHOULD have shuffle - # and drop_last as True. - # However, this feature is not yet available. - # Coming early 2024. - # CuGraph can produce huge speed ups but not shuffling - # can have negative impacts on val/test accuracy. 
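The `BulkSampleLoader` call below writes each epoch's samples into `tempdir/samples_{epoch}` once, and the training side of the example (shown in an earlier patch) then shards the files in that directory across ranks with `np.array_split`. That sharding step isolated into a helper (`sample_dir` and the file names are placeholders):

import os
import tempfile

import numpy as np


def files_for_rank(sample_dir: str, rank: int, world_size: int) -> list:
    # Split the bulk-sampled files so every rank reads a disjoint, roughly
    # equal share of the epoch's batches. Sorting keeps the split
    # deterministic, since os.listdir() order is not guaranteed.
    all_files = sorted(os.listdir(sample_dir))
    return np.array_split(np.array(all_files), world_size)[rank].tolist()


if __name__ == '__main__':
    # Toy demonstration with empty placeholder files.
    with tempfile.TemporaryDirectory() as sample_dir:
        for i in range(10):
            open(os.path.join(sample_dir, f'batch_{i}.parquet'), 'w').close()
        for rank in range(4):
            print(rank, files_for_rank(sample_dir, rank, world_size=4))

Sampling once on rank 0 and re-reading the shards on every rank lets the example pay the neighborhood-sampling cost a single time per epoch.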
BulkSampleLoader( cugraph_store, cugraph_store, input_nodes=split_idx['train'], directory=train_path, - # shuffle=True, drop_last=True, + shuffle=True, drop_last=True, **kwargs) print('validation', len(split_idx['valid'])) From fd68f4f2590ce8a509fbb0e00e925eb80c3dbe1e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 11 Dec 2023 14:36:13 -0800 Subject: [PATCH 095/197] fixing --- examples/multi_gpu/papers100m_gcn_multinode.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index d8dc58152305..5563dbb18875 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -62,15 +62,10 @@ def run_train(data, world_size, model, epochs, batch_size, fan_out, split_idx, fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, G, N) - # Note that train dataloader SHOULD have shuffle and drop_last as True. - # However, this feature is not yet available in CuGraphNeighborLoader. - # Coming early 2024. - # CuGraphNeighborLoader can produce huge speed ups but not shuffling - # can have negative impacts on val/test accuracy. train_loader = CuGraphNeighborLoader( cugraph_store, input_nodes=split_idx['train'], - # shuffle=True, drop_last=True, + shuffle=True, drop_last=True, **kwargs) eval_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], From 0612dbf12be48d395ea288d659340e052610331d Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 11 Dec 2023 14:36:33 -0800 Subject: [PATCH 096/197] fixing --- examples/ogbn_papers_100m.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index ac14fc07df11..41f0ed30ada5 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -81,15 +81,10 @@ def get_num_workers() -> int: fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, G, N) - # Note that train dataloader SHOULD have shuffle and drop_last as True. - # However, this feature is not yet available in CuGraphNeighborLoader. - # Coming early 2024. - # CuGraphNeighborLoader can produce huge speed ups but not shuffling - # can have negative impacts on val/test accuracy. 
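Re-enabling `shuffle=True` and `drop_last=True` here matters for correctness as well as convergence: the training loops slice predictions and labels with a fixed `[:batch_size]`, which assumes every mini-batch puts exactly `batch_size` seed nodes at the front. Dropping a short final batch preserves that assumption; the alternative, used by the rewritten example later in this series, is to slice by the per-batch seed count. A small sketch of that alternative on a synthetic graph with a plain PyG `NeighborLoader` (the toy sizes are arbitrary):

import torch

from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader

data = Data(
    x=torch.randn(100, 16),
    edge_index=torch.randint(0, 100, (2, 400)),
    y=torch.randint(0, 3, (100, )),
)

loader = NeighborLoader(
    data,
    num_neighbors=[5, 5],
    batch_size=32,
    input_nodes=torch.arange(70),  # 70 seeds, so the last batch is short
)

for batch in loader:
    seed_count = batch.batch_size  # seed nodes come first in the mini-batch
    out = batch.x[:seed_count]     # stand-in for model(batch.x, batch.edge_index)
    y = batch.y[:seed_count]
    assert out.size(0) == y.size(0) <= 32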
train_loader = CuGraphNeighborLoader( cugraph_store, input_nodes=split_idx['train'], - # shuffle=True, drop_last=True, + shuffle=True, drop_last=True, **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], From d1db8b973bbcd9126749321f4ac944820209d628 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 4 Jan 2024 10:14:46 -0800 Subject: [PATCH 097/197] removing the multinode changes, making it a seperate PR --- .../multi_gpu/papers100m_gcn_multinode.py | 215 +++++++----------- 1 file changed, 87 insertions(+), 128 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_multinode.py b/examples/multi_gpu/papers100m_gcn_multinode.py index 5563dbb18875..f827700ac73c 100644 --- a/examples/multi_gpu/papers100m_gcn_multinode.py +++ b/examples/multi_gpu/papers100m_gcn_multinode.py @@ -6,7 +6,6 @@ --container-mounts=/ogb-papers100m/:/workspace/dataset python3 path_to_script.py """ -import argparse import os import time @@ -15,9 +14,9 @@ import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel -from torchmetrics import Accuracy -import torch_geometric +from torch_geometric.loader import NeighborLoader +from torch_geometric.nn import GCNConv def get_num_workers() -> int: @@ -32,160 +31,120 @@ def get_num_workers() -> int: return num_workers -def run_train(data, world_size, model, epochs, batch_size, fan_out, split_idx, - num_classes, cugraph_data_loader): +class GCN(torch.nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels): + super().__init__() + self.conv1 = GCNConv(in_channels, hidden_channels) + self.conv2 = GCNConv(hidden_channels, out_channels) + + def forward(self, x, edge_index): + x = F.dropout(x, p=0.5, training=self.training) + x = self.conv1(x, edge_index).relu() + x = F.dropout(x, p=0.5, training=self.training) + x = self.conv2(x, edge_index) + return x + + +def run(world_size, data, split_idx, model): local_id = int(os.environ['LOCAL_RANK']) rank = torch.distributed.get_rank() torch.cuda.set_device(local_id) device = torch.device(local_id) if rank == 0: - print("Data =", data) - print('Using', world_size, 'GPUs...') + print(f'Using {nprocs} GPUs...') + split_idx['train'] = split_idx['train'].split( - split_idx['train'].size(0) // world_size, dim=0)[rank].clone() - model = model.to(device) - model = DistributedDataParallel(model, device_ids=[local_id]) - optimizer = torch.optim.Adam(model.parameters(), lr=0.01, - weight_decay=0.0005) + split_idx['train'].size(0) // world_size, + dim=0, + )[rank].clone() + + model = DistributedDataParallel(model.to(device), device_ids=[local_id]) + optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + kwargs = dict( - num_neighbors=[fan_out, fan_out], - batch_size=batch_size, + data=data, + batch_size=128, + num_workers=get_num_workers(), + num_neighbors=[50, 50], ) - # Set Up Neighbor Loading - if cugraph_data_loader: - import cugraph - from cugraph_pyg.data import CuGraphStore - from cugraph_pyg.loader import CuGraphNeighborLoader - G = {("N", "E", "N"): data.edge_index} - N = {"N": data.num_nodes} - fs = cugraph.gnn.FeatureStore(backend="torch") - fs.add_data(data.x, "N", "x") - fs.add_data(data.y, "N", "y") - cugraph_store = CuGraphStore(fs, G, N) - train_loader = CuGraphNeighborLoader( - cugraph_store, - input_nodes=split_idx['train'], - shuffle=True, drop_last=True, - **kwargs) - eval_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['valid'], - **kwargs) - test_loader = 
CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['test'], - **kwargs) - else: - NeighborLoader = torch_geometric.loader.NeighborLoader - num_work = get_num_workers() - train_loader = NeighborLoader(data, input_nodes=split_idx['train'], - num_workers=num_work, shuffle=True, - drop_last=True, **kwargs) - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) - - eval_steps = 1000 - warmup_steps = 20 - acc = Accuracy(task="multiclass", num_classes=num_classes).to(device) + + train_loader = NeighborLoader( + input_nodes=split_idx['train'], + shuffle=True, + **kwargs, + ) + if rank == 0: + val_loader = NeighborLoader(input_nodes=split_idx['valid'], **kwargs) + test_loader = NeighborLoader(input_nodes=split_idx['test'], **kwargs) + + val_steps = 1000 + warmup_steps = 100 if rank == 0: print("Beginning training...") - for epoch in range(epochs): + + for epoch in range(1, 4): + model.train() for i, batch in enumerate(train_loader): if i == warmup_steps: start = time.time() batch = batch.to(device) - batch.y = batch.y.to(torch.long) optimizer.zero_grad() - out = model(batch.x, batch.edge_index) - loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) + y = batch.y[:batch.batch_size].view(-1).to(torch.long) + out = model(batch.x, batch.edge_index)[:batch.batch_size] + loss = F.cross_entropy(out, y) loss.backward() optimizer.step() + if rank == 0 and i % 10 == 0: - print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + - ", Loss: " + str(loss)) - print("Average Training Iteration Time:", - (time.time() - start) / (i - 10), "s/iter") - acc_sum = 0.0 - with torch.no_grad(): - for i, batch in enumerate(eval_loader): - if i >= eval_steps: + print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') + + if rank == 0: + sec_per_iter = (time.time() - start) / (i - warmup_steps) + print(f"Avg Training Iteration Time: {sec_per_iter:.6f} s/iter") + + model.eval() + total_correct = total_examples = 0 + for i, batch in enumerate(val_loader): + if i >= val_steps: break + if i == warmup_steps: + start = time.time() + batch = batch.to(device) - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - torch.distributed.all_reduce(acc_sum, - op=torch.distributed.ReduceOp.MEAN) - print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - acc_sum = 0.0 - with torch.no_grad(): + with torch.no_grad(): + out = model(batch.x, batch.edge_index)[:batch.batch_size] + pred = out.argmax(dim=-1) + y = batch.y[:batch.batch_size].view(-1).to(torch.long) + + total_correct += int((pred == y).sum()) + total_examples += y.size(0) + + print(f"Val Acc: {total_correct / total_examples:.4f}") + sec_per_iter = (time.time() - start) / (i - warmup_steps) + print(f"Avg Inference Iteration Time: {sec_per_iter:.6f} s/iter") + + if rank == 0: + model.eval() + total_correct = total_examples = 0 for i, batch in enumerate(test_loader): batch = batch.to(device) - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - torch.distributed.all_reduce(acc_sum, - op=torch.distributed.ReduceOp.MEAN) - print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - if rank == 0: - print("Total Program Runtime =", - round(time.perf_counter() - wall_clock_start, 2), "seconds") + with torch.no_grad(): + out 
= model(batch.x, batch.edge_index)[:batch.batch_size] + pred = out.argmax(dim=-1) + y = batch.y[:batch.batch_size].view(-1).to(torch.long) + total_correct += int((pred == y).sum()) + total_examples += y.size(0) + print(f"Test Acc: {total_correct / total_examples:.4f}") -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--hidden_channels', type=int, default=128) - parser.add_argument('--num_layers', type=int, default=2) - parser.add_argument('--lr', type=float, default=0.001) - parser.add_argument('--epochs', type=int, default=3) - parser.add_argument('--batch_size', type=int, default=2048) - parser.add_argument('--fan_out', type=int, default=16) - parser.add_argument( - "--use_gat_conv", - action='store_true', - help="Wether or not to use GATConv. (Defaults to using GCNConv)", - ) - parser.add_argument( - "--n_gat_conv_heads", - type=int, - default=4, - help="If using GATConv, number of attention heads to use", - ) - parser.add_argument( - "--cugraph_data_loader", - action='store_true', - help="Wether or not to use CuGraph for Neighbor Loading. \ - \nNote that this requires more GPU memory or \ - a reduction in batch_size/fan_out/hidden_channels/num_layers", - ) - args = parser.parse_args() - wall_clock_start = time.perf_counter() - if args.cugraph_data_loader: - from cugraph.testing.mg_utils import enable_spilling - enable_spilling() +if __name__ == '__main__': # Setup multi-node: torch.distributed.init_process_group("nccl") nprocs = dist.get_world_size() assert dist.is_initialized(), "Distributed cluster not initialized" dataset = PygNodePropPredDataset(name='ogbn-papers100M') split_idx = dataset.get_idx_split() + model = GCN(dataset.num_features, 64, dataset.num_classes) - data = dataset[0] - data.y = data.y.reshape(-1) - if args.use_gat_conv: - model = torch_geometric.nn.models.GAT(dataset.num_features, - args.hidden_channels, - args.num_layers, - dataset.num_classes, - heads=args.n_gat_conv_heads) - else: - model = torch_geometric.nn.models.GCN(dataset.num_features, - args.hidden_channels, - args.num_layers, - dataset.num_classes) - run_train(data, nprocs, model, args.epochs, args.batch_size, args.fan_out, - split_idx, dataset.num_classes, args.cugraph_data_loader, - wall_clock_start) + run(nprocs, dataset[0], split_idx, model) From 4ddb2fe03529b620023372b718b3a84dba04ee35 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 5 Jan 2024 09:18:49 -0800 Subject: [PATCH 098/197] Update examples/multi_gpu/papers100m_gcn.py Co-authored-by: Akihiro Nitta --- examples/multi_gpu/papers100m_gcn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index cd0b6f54e406..e9dd9b2d0ee3 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -15,6 +15,8 @@ import torch_geometric +# Allow computation on objects that are larger than GPU memory +# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory os.environ['CUDF_SPILL'] = '1' os.environ['RAPIDS_NO_INITIALIZE'] = '1' From c4df7bbd662756c276699e0051c5a34eea881a5b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 5 Jan 2024 17:21:35 +0000 Subject: [PATCH 099/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 11 ++++------- examples/ogbn_papers_100m.py | 9 ++++----- 2 files changed, 8 insertions(+), 
12 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index e9dd9b2d0ee3..ab103e182e69 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -155,13 +155,10 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, for epoch in range(epochs): train_path = os.path.join(tempdir, f'samples_{epoch}') os.mkdir(train_path) - BulkSampleLoader( - cugraph_store, - cugraph_store, - input_nodes=split_idx['train'], - directory=train_path, - shuffle=True, drop_last=True, - **kwargs) + BulkSampleLoader(cugraph_store, cugraph_store, + input_nodes=split_idx['train'], + directory=train_path, shuffle=True, + drop_last=True, **kwargs) print('validation', len(split_idx['valid'])) eval_loader = CuGraphNeighborLoader(cugraph_store, diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 41f0ed30ada5..143de766245f 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -81,11 +81,10 @@ def get_num_workers() -> int: fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") cugraph_store = CuGraphStore(fs, G, N) - train_loader = CuGraphNeighborLoader( - cugraph_store, - input_nodes=split_idx['train'], - shuffle=True, drop_last=True, - **kwargs) + train_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['train'], + shuffle=True, drop_last=True, + **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['valid'], **kwargs) From 3ff46cd2e082c6b1e478cfaf1a9791fa2e49d506 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 5 Jan 2024 09:22:20 -0800 Subject: [PATCH 100/197] addressing review --- examples/multi_gpu/papers100m_gcn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index ab103e182e69..92719d08f9cc 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -255,14 +255,14 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - if rank == 0: - print("Total Program Runtime =", - round(time.perf_counter() - wall_clock_start, 2), "seconds") dist.barrier() if cugraph_data_loader: shutdown_dask_client(client) dist.barrier() + if rank == 0: + print("Total Program Runtime =", + round(time.perf_counter() - wall_clock_start, 2), "seconds") if __name__ == '__main__': From b409c2c4325b66ef4bfd6283c35574132dabfe65 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 5 Jan 2024 10:54:57 -0800 Subject: [PATCH 101/197] adding comment for rapids no init --- examples/multi_gpu/papers100m_gcn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 92719d08f9cc..92ccc1f608c9 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -18,6 +18,9 @@ # Allow computation on objects that are larger than GPU memory # https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory os.environ['CUDF_SPILL'] = '1' + +# Ensures that a CUDA context is not created on import of rapids. 
+# Allows pytorch to create the context instead os.environ['RAPIDS_NO_INITIALIZE'] = '1' From 81522be0666d7eaf79336efb06a0060f9212be52 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 10 Jan 2024 18:24:51 -0800 Subject: [PATCH 102/197] eval on all ranks --- examples/multi_gpu/papers100m_gcn.py | 184 ++++++++++++++------------- 1 file changed, 98 insertions(+), 86 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 92ccc1f608c9..830191ac5463 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -40,25 +40,7 @@ def start_dask_cluster(): client.run(enable_spilling) print("Dask Cluster Setup Complete") - del client - return cluster - - -def create_dask_client(scheduler_address): - from cugraph.dask.comms import comms as Comms - from dask.distributed import Client, Lock - - client = Client(scheduler_address) - lock = Lock('comms_init') - if lock.acquire(timeout=100): - try: - Comms.initialize(p2p=True) - finally: - lock.release() - else: - raise RuntimeError("Failed to acquire lock to initialize comms") - - return client + return client, cluster def shutdown_dask_client(client): @@ -98,17 +80,17 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx, num_classes, cugraph_data_loader, wall_clock_start, - scheduler_address=None, tempdir=None): + tempdir=None): init_pytorch_worker( rank, world_size, cugraph_data_loader=cugraph_data_loader, ) - if cugraph_data_loader: - print(f'creating dask client on rank {rank}') - client = create_dask_client(scheduler_address) - print(f'created dask client on rank {rank}') + if cugraph_data_loader and rank == 0: + client, cluster = start_dask_cluster() + from cugraph.dask.comms import comms as Comms + Comms.initialize(p2p=True) else: split_idx['train'] = split_idx['train'].split( split_idx['train'].size(0) // world_size, dim=0)[rank].clone() @@ -125,36 +107,36 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if cugraph_data_loader: import cugraph from cugraph_pyg.data import CuGraphStore - from cugraph_pyg.loader import BulkSampleLoader, CuGraphNeighborLoader + from cugraph_pyg.loader import BulkSampleLoader G = {("N", "E", "N"): data.edge_index} N = {"N": data.num_nodes} fs = cugraph.gnn.FeatureStore(backend="torch") fs.add_data(data.x, "N", "x") fs.add_data(data.y, "N", "y") - - from distributed import Event as Dask_Event - event = Dask_Event("cugraph_store_creation_event") dist.barrier() if rank == 0: print("Rank 0 creating its cugraph store and \ initializing distributed graph") cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) - event.set() print("Distributed graph initialization complete.") - else: + + if rank != 0: print(f"Rank {rank} waiting for distributed graph initialization") - if event.wait(timeout=1000): - print(f"Rank {rank} proceeding with store creation") - cugraph_store = CuGraphStore(fs, { - k: len(v) - for k, v in G.items() - }, N, multi_gpu=False) - print(f"Rank {rank} created store") + dist.barrier() + if rank != 0: + print(f"Rank {rank} proceeding with store creation") + cugraph_store = CuGraphStore(fs, { + k: len(v) + for k, v in G.items() + }, N, multi_gpu=False) + print(f"Rank {rank} created store") dist.barrier() - if rank == 0: + if rank == 0: + # Direct cuGraph to sample offline prior to the training loop + # Sampling will occur in parallel but will be initiated on rank 0 for epoch in range(epochs): 
train_path = os.path.join(tempdir, f'samples_{epoch}') os.mkdir(train_path) @@ -163,13 +145,19 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, directory=train_path, shuffle=True, drop_last=True, **kwargs) - print('validation', len(split_idx['valid'])) - eval_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['valid'], - **kwargs) - test_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['test'], - **kwargs) + print('validation', len(split_idx['valid'])) + eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') + BulkSampleLoader(cugraph_store, cugraph_store, + input_nodes=split_idx['valid'], + directory=eval_path, + **kwargs) + + print('test', len(split_idx['test'])) + test_path = os.path.join(tempdir, f'samples_test') + BulkSampleLoader(cugraph_store, cugraph_store, + input_nodes=split_idx['test'], + directory=test_path, + **kwargs) dist.barrier() else: @@ -178,12 +166,10 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, drop_last=True, **kwargs) - - if rank == 0: - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], + num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) dist.barrier() eval_steps = 1000 @@ -201,7 +187,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, train_loader = BulkSampleLoader(cugraph_store, cugraph_store, directory=train_path, input_files=input_files) - with Join([model]): + with Join([model], divide_by_initial_world_size=False): for i, batch in enumerate(train_loader): if i >= warmup_steps: start = time.time() @@ -221,33 +207,25 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + ", Loss: " + str(loss)) dist.barrier() - with Join([model]): - if rank == 0: - print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") - acc_sum = 0.0 - with torch.no_grad(): - for i, batch in enumerate(eval_loader): - if i >= eval_steps: - break - - batch = batch.to(rank) - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() - batch_size = batch.num_sampled_nodes[0] - - batch.y = batch.y.to(torch.long) - out = model.module(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - print(f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) - dist.barrier() + if cugraph_data_loader: + eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') + + input_files = np.array_split(np.array(os.listdir(eval_path)), + world_size)[rank] - with Join([model]): + eval_loader = BulkSampleLoader(cugraph_store, cugraph_store, + directory=eval_path, + input_files=input_files) if rank == 0: + print("Average Training Iteration Time:", + (time.time() - start) / (i - warmup_steps), "s/iter") + with Join([model], divide_by_initial_world_size=False): acc_sum = 0.0 with torch.no_grad(): - for i, batch in enumerate(test_loader): + for i, batch in enumerate(eval_loader): + if i >= eval_steps: + break + batch = batch.to(rank) if isinstance(batch, torch_geometric.data.HeteroData): batch = 
batch.to_homogeneous() @@ -256,11 +234,50 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - print(f"Test Accuracy: {acc_sum/(i) * 100.0:.4f}%", ) + batch.y[:batch_size]) + + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) + dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) + nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) + dist.all_reduce(nb, op=dist.ReduceOp.SUM) + dist.barrier() + if rank == 0: + print(f"Validation Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) + + with Join([model], divide_by_initial_world_size=False): + if cugraph_data_loader: + test_path = os.path.join(tempdir, f'samples_test') + + input_files = np.array_split(np.array(os.listdir(test_path)), + world_size)[rank] + test_loader = BulkSampleLoader(cugraph_store, cugraph_store, + directory=test_path, + input_files=input_files) + acc_sum = 0.0 + with torch.no_grad(): + for i, batch in enumerate(test_loader): + batch = batch.to(rank) + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch_size = batch.num_sampled_nodes[0] + + batch.y = batch.y.to(torch.long) + out = model.module(batch.x, batch.edge_index) + acc_sum += acc(out[:batch_size].softmax(dim=-1), + batch.y[:batch_size]) + + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) + dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) + nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) + dist.all_reduce(nb, op=dist.ReduceOp.SUM) dist.barrier() + if rank == 0: + print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) - if cugraph_data_loader: + if cugraph_data_loader and rank == 0: + import gc + del cugraph_store + gc.collect() shutdown_dask_client(client) dist.barrier() if rank == 0: @@ -280,7 +297,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, parser.add_argument( "--use_gat_conv", action='store_true', - help="Wether or not to use GATConv. (Defaults to using GCNConv)", + help="Whether or not to use GATConv. (Defaults to using GCNConv)", ) parser.add_argument( "--n_gat_conv_heads", @@ -291,16 +308,15 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, parser.add_argument( "--cugraph_data_loader", action='store_true', - help="Wether or not to use CuGraph for Neighbor Loading. \ + help="Whether or not to use CuGraph for Neighbor Loading. 
\ \nNote that this requires more GPU memory or \ a reduction in batch_size/fan_out/hidden_channels/num_layers", ) args = parser.parse_args() wall_clock_start = time.perf_counter() - cluster = start_dask_cluster() if args.cugraph_data_loader else None - dataset = PygNodePropPredDataset(name='ogbn-papers100M') + dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/abarghi/ogb_datasets') split_idx = dataset.get_idx_split() data = dataset[0] data.y = data.y.reshape(-1) @@ -326,8 +342,4 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader, wall_clock_start, - None if cluster is None else cluster.scheduler_address, tempdir), nprocs=world_size, join=True) - - if cluster is not None: - cluster.close() From d9d02c97a73fb3358ec1a7ab7927e4ffe5355637 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Jan 2024 02:26:15 +0000 Subject: [PATCH 103/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 47 +++++++++++++++------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 830191ac5463..d914a3f8799b 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -120,7 +120,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, initializing distributed graph") cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) print("Distributed graph initialization complete.") - + if rank != 0: print(f"Rank {rank} waiting for distributed graph initialization") dist.barrier() @@ -149,15 +149,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') BulkSampleLoader(cugraph_store, cugraph_store, input_nodes=split_idx['valid'], - directory=eval_path, - **kwargs) + directory=eval_path, **kwargs) print('test', len(split_idx['test'])) test_path = os.path.join(tempdir, f'samples_test') BulkSampleLoader(cugraph_store, cugraph_store, input_nodes=split_idx['test'], - directory=test_path, - **kwargs) + directory=test_path, **kwargs) dist.barrier() else: @@ -167,9 +165,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, num_workers=num_work, shuffle=True, drop_last=True, **kwargs) eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) + num_workers=num_work, **kwargs) test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) + num_workers=num_work, **kwargs) dist.barrier() eval_steps = 1000 @@ -214,11 +212,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, world_size)[rank] eval_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=eval_path, - input_files=input_files) + directory=eval_path, + input_files=input_files) if rank == 0: print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") + (time.time() - start) / (i - warmup_steps), "s/iter") with Join([model], divide_by_initial_world_size=False): acc_sum = 0.0 with torch.no_grad(): @@ -234,11 +232,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out 
= model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) + nb = torch.tensor(float(i), dtype=torch.float32, + device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) dist.barrier() if rank == 0: @@ -251,8 +251,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, input_files = np.array_split(np.array(os.listdir(test_path)), world_size)[rank] test_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=test_path, - input_files=input_files) + directory=test_path, + input_files=input_files) acc_sum = 0.0 with torch.no_grad(): for i, batch in enumerate(test_loader): @@ -264,11 +264,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) + batch.y[:batch_size]) + + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) + nb = torch.tensor(float(i), dtype=torch.float32, + device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) dist.barrier() if rank == 0: @@ -316,7 +318,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, args = parser.parse_args() wall_clock_start = time.perf_counter() - dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/abarghi/ogb_datasets') + dataset = PygNodePropPredDataset(name='ogbn-papers100M', + root='/datasets/abarghi/ogb_datasets') split_idx = dataset.get_idx_split() data = dataset[0] data.y = data.y.reshape(-1) @@ -341,5 +344,5 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start, - tempdir), nprocs=world_size, join=True) + args.cugraph_data_loader, wall_clock_start, tempdir), + nprocs=world_size, join=True) From 6c36fd1fc778ee8e36f19288c9bf34d0e9240716 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 11 Jan 2024 12:29:32 +0900 Subject: [PATCH 104/197] Apply suggestions from code review --- CHANGELOG.md | 2 +- examples/multi_gpu/papers100m_gcn.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 574298d4aeda..c45964a1acda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added -- Upgrades for single node Papers100m examples (CuGraph, better default hyperparams, and the choice of GCN or GAT) ([#8173](https://github.com/pyg-team/pytorch_geometric/pull/8173)) +- Added support for cuGraph data loading and `GAT` in single node Papers100m examples ([#8173](https://github.com/pyg-team/pytorch_geometric/pull/8173)) - Added `TreeGraph` and `GridMotif` generators ([#8736](https://github.com/pyg-team/pytorch_geometric/pull/8736)) - Added an example for edge-level temporal sampling on a heterogenous graph ([#8383](https://github.com/pyg-team/pytorch_geometric/pull/8383)) - Added the `num_graphs` option to the `StochasticBlockModelDataset` ([#8648](https://github.com/pyg-team/pytorch_geometric/pull/8648)) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index d914a3f8799b..fff535b26714 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -19,8 +19,9 @@ # https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory os.environ['CUDF_SPILL'] = '1' -# Ensures that a CUDA context is not created on import of rapids. -# Allows pytorch to create the context instead +# Ensures that a CUDA context is not created on import of rapids to +# let PyTorch create one instead +# https://docs.rapids.ai/api/dask-cuda/stable/ucx/#software-requirements os.environ['RAPIDS_NO_INITIALIZE'] = '1' From 66dd97b3fa9022396938dcebb35be264fc79fbb4 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 11 Jan 2024 14:12:58 -0800 Subject: [PATCH 105/197] apply alexandria's fix Co-authored-by: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> --- examples/multi_gpu/papers100m_gcn.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index fff535b26714..92a15dc6d6b8 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -64,6 +64,11 @@ def pyg_num_work(world_size): def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): if cugraph_data_loader: + import rmm + rmm.reinitialize( + pool_allocator=True, + devices=[rank] + ) import cupy cupy.cuda.Device(rank).use() from rmm.allocators.cupy import rmm_cupy_allocator From 7ef00fed9b5aa78a34d5d87e8b65bae3d4d1590e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Jan 2024 22:13:58 +0000 Subject: [PATCH 106/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 92a15dc6d6b8..afff11dff13f 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -65,10 +65,7 @@ def pyg_num_work(world_size): def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): if cugraph_data_loader: import rmm - rmm.reinitialize( - pool_allocator=True, - devices=[rank] - ) + rmm.reinitialize(pool_allocator=True, devices=[rank]) import cupy cupy.cuda.Device(rank).use() from rmm.allocators.cupy import rmm_cupy_allocator From 4df8f10c01de9e6122a0bd7925bde005b0776de9 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 11 Jan 2024 16:22:04 -0800 Subject: [PATCH 107/197] cleaning up for 
pre-commit --- examples/multi_gpu/papers100m_gcn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index afff11dff13f..8ebc28635312 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -155,7 +155,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, directory=eval_path, **kwargs) print('test', len(split_idx['test'])) - test_path = os.path.join(tempdir, f'samples_test') + test_path = os.path.join(tempdir, 'samples_test') BulkSampleLoader(cugraph_store, cugraph_store, input_nodes=split_idx['test'], directory=test_path, **kwargs) @@ -249,7 +249,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, with Join([model], divide_by_initial_world_size=False): if cugraph_data_loader: - test_path = os.path.join(tempdir, f'samples_test') + test_path = os.path.join(tempdir, 'samples_test') input_files = np.array_split(np.array(os.listdir(test_path)), world_size)[rank] From e39aade6c941ccc0a51cbb77f26d3d761bc6baff Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 12 Jan 2024 10:48:53 -0800 Subject: [PATCH 108/197] alexandria's fix --- examples/multi_gpu/papers100m_gcn.py | 87 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 8ebc28635312..ad1bbfa4419e 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -19,9 +19,8 @@ # https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory os.environ['CUDF_SPILL'] = '1' -# Ensures that a CUDA context is not created on import of rapids to -# let PyTorch create one instead -# https://docs.rapids.ai/api/dask-cuda/stable/ucx/#software-requirements +# Ensures that a CUDA context is not created on import of rapids. 
+# Allows pytorch to create the context instead os.environ['RAPIDS_NO_INITIALIZE'] = '1' @@ -33,6 +32,7 @@ def start_dask_cluster(): protocol="tcp", rmm_pool_size=None, memory_limit=None, + rmm_async=True, ) from dask.distributed import Client @@ -65,7 +65,9 @@ def pyg_num_work(world_size): def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): if cugraph_data_loader: import rmm - rmm.reinitialize(pool_allocator=True, devices=[rank]) + if rank > 0: + rmm.reinitialize(devices=rank) + import cupy cupy.cuda.Device(rank).use() from rmm.allocators.cupy import rmm_cupy_allocator @@ -84,16 +86,18 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx, num_classes, cugraph_data_loader, wall_clock_start, tempdir=None): + init_pytorch_worker( rank, world_size, cugraph_data_loader=cugraph_data_loader, ) - if cugraph_data_loader and rank == 0: - client, cluster = start_dask_cluster() - from cugraph.dask.comms import comms as Comms - Comms.initialize(p2p=True) + if cugraph_data_loader: + if rank == 0: + client, cluster = start_dask_cluster() + from cugraph.dask.comms import comms as Comms + Comms.initialize(p2p=True) else: split_idx['train'] = split_idx['train'].split( split_idx['train'].size(0) // world_size, dim=0)[rank].clone() @@ -123,7 +127,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, initializing distributed graph") cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) print("Distributed graph initialization complete.") - + if rank != 0: print(f"Rank {rank} waiting for distributed graph initialization") dist.barrier() @@ -152,13 +156,15 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') BulkSampleLoader(cugraph_store, cugraph_store, input_nodes=split_idx['valid'], - directory=eval_path, **kwargs) + directory=eval_path, + **kwargs) print('test', len(split_idx['test'])) - test_path = os.path.join(tempdir, 'samples_test') + test_path = os.path.join(tempdir, f'samples_test') BulkSampleLoader(cugraph_store, cugraph_store, input_nodes=split_idx['test'], - directory=test_path, **kwargs) + directory=test_path, + **kwargs) dist.barrier() else: @@ -168,9 +174,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, num_workers=num_work, shuffle=True, drop_last=True, **kwargs) eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) + num_workers=num_work, **kwargs) test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) + num_workers=num_work, **kwargs) dist.barrier() eval_steps = 1000 @@ -211,16 +217,14 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if cugraph_data_loader: eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') - input_files = np.array_split(np.array(os.listdir(eval_path)), - world_size)[rank] + input_files = np.array(os.listdir(eval_path)) eval_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=eval_path, - input_files=input_files) - if rank == 0: - print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") + directory=eval_path, + input_files=input_files) with Join([model], divide_by_initial_world_size=False): + print("Average Training Iteration Time:", + (time.time() - start) / (i - warmup_steps), "s/iter") acc_sum = 0.0 with torch.no_grad(): for i, batch in 
enumerate(eval_loader): @@ -235,27 +239,24 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i), dtype=torch.float32, - device=acc_sum.device) + nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) - dist.barrier() - if rank == 0: print(f"Validation Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) + dist.barrier() with Join([model], divide_by_initial_world_size=False): if cugraph_data_loader: - test_path = os.path.join(tempdir, 'samples_test') + test_path = os.path.join(tempdir, f'samples_test') - input_files = np.array_split(np.array(os.listdir(test_path)), - world_size)[rank] + input_files = np.array(os.listdir(test_path)) + test_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=test_path, - input_files=input_files) + directory=test_path, + input_files=input_files) acc_sum = 0.0 with torch.no_grad(): for i, batch in enumerate(test_loader): @@ -267,17 +268,14 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) + batch.y[:batch_size]) + + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i), dtype=torch.float32, - device=acc_sum.device) + nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) + print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() - if rank == 0: - print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) if cugraph_data_loader and rank == 0: import gc @@ -321,8 +319,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, args = parser.parse_args() wall_clock_start = time.perf_counter() - dataset = PygNodePropPredDataset(name='ogbn-papers100M', - root='/datasets/abarghi/ogb_datasets') + dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/abarghi/ogb_datasets') split_idx = dataset.get_idx_split() data = dataset[0] data.y = data.y.reshape(-1) @@ -347,5 +344,5 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start, tempdir), - nprocs=world_size, join=True) + args.cugraph_data_loader, wall_clock_start, + tempdir), nprocs=world_size, join=True) From dff6fe3e64dd7088a0e8052538a8e3bb175073a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 18:49:52 +0000 Subject: [PATCH 109/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 51 +++++++++++++++------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py 
b/examples/multi_gpu/papers100m_gcn.py index ad1bbfa4419e..ec16f6921712 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -86,7 +86,7 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx, num_classes, cugraph_data_loader, wall_clock_start, tempdir=None): - + init_pytorch_worker( rank, world_size, @@ -127,7 +127,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, initializing distributed graph") cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) print("Distributed graph initialization complete.") - + if rank != 0: print(f"Rank {rank} waiting for distributed graph initialization") dist.barrier() @@ -156,15 +156,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') BulkSampleLoader(cugraph_store, cugraph_store, input_nodes=split_idx['valid'], - directory=eval_path, - **kwargs) + directory=eval_path, **kwargs) print('test', len(split_idx['test'])) test_path = os.path.join(tempdir, f'samples_test') BulkSampleLoader(cugraph_store, cugraph_store, input_nodes=split_idx['test'], - directory=test_path, - **kwargs) + directory=test_path, **kwargs) dist.barrier() else: @@ -174,9 +172,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, num_workers=num_work, shuffle=True, drop_last=True, **kwargs) eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) + num_workers=num_work, **kwargs) test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) + num_workers=num_work, **kwargs) dist.barrier() eval_steps = 1000 @@ -220,11 +218,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, input_files = np.array(os.listdir(eval_path)) eval_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=eval_path, - input_files=input_files) + directory=eval_path, + input_files=input_files) with Join([model], divide_by_initial_world_size=False): print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") + (time.time() - start) / (i - warmup_steps), "s/iter") acc_sum = 0.0 with torch.no_grad(): for i, batch in enumerate(eval_loader): @@ -239,11 +237,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) + nb = torch.tensor(float(i), dtype=torch.float32, + device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) print(f"Validation Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() @@ -253,10 +253,10 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, test_path = os.path.join(tempdir, f'samples_test') input_files = np.array(os.listdir(test_path)) - + test_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=test_path, - input_files=input_files) + directory=test_path, + input_files=input_files) acc_sum = 0.0 with torch.no_grad(): for i, batch in 
enumerate(test_loader): @@ -268,11 +268,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) - - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device=rank) + batch.y[:batch_size]) + + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) + nb = torch.tensor(float(i), dtype=torch.float32, + device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() @@ -319,7 +321,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, args = parser.parse_args() wall_clock_start = time.perf_counter() - dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/abarghi/ogb_datasets') + dataset = PygNodePropPredDataset(name='ogbn-papers100M', + root='/datasets/abarghi/ogb_datasets') split_idx = dataset.get_idx_split() data = dataset[0] data.y = data.y.reshape(-1) @@ -344,5 +347,5 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start, - tempdir), nprocs=world_size, join=True) + args.cugraph_data_loader, wall_clock_start, tempdir), + nprocs=world_size, join=True) From 15486988e72e850f0e52cb13e553adef32bee18c Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 12 Jan 2024 10:55:27 -0800 Subject: [PATCH 110/197] fixing precommit ci --- examples/multi_gpu/papers100m_gcn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index ec16f6921712..0513a7187d03 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -159,7 +159,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, directory=eval_path, **kwargs) print('test', len(split_idx['test'])) - test_path = os.path.join(tempdir, f'samples_test') + test_path = os.path.join(tempdir, 'samples_test') BulkSampleLoader(cugraph_store, cugraph_store, input_nodes=split_idx['test'], directory=test_path, **kwargs) @@ -250,7 +250,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, with Join([model], divide_by_initial_world_size=False): if cugraph_data_loader: - test_path = os.path.join(tempdir, f'samples_test') + test_path = os.path.join(tempdir, 'samples_test') input_files = np.array(os.listdir(test_path)) From 76602e7a1e9460b2c145890133f39469619a5797 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 16 Jan 2024 12:08:10 -0800 Subject: [PATCH 111/197] fixing eval on all ranks for native pyg --- examples/multi_gpu/papers100m_gcn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 0513a7187d03..86f21691513e 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -101,6 +101,10 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, else: split_idx['train'] = split_idx['train'].split( split_idx['train'].size(0) // world_size, dim=0)[rank].clone() + split_idx['valid'] = split_idx['valid'].split( + split_idx['valid'].size(0) // 
world_size, dim=0)[rank].clone() + split_idx['test'] = split_idx['test'].split( + split_idx['test'].size(0) // world_size, dim=0)[rank].clone() model = model.to(rank) model = DistributedDataParallel(model, device_ids=[rank]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, From 869f6084f7108d9470c78723ed5e741157243bac Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 22 Jan 2024 12:58:46 -0800 Subject: [PATCH 112/197] cleaning --- examples/multi_gpu/papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 86f21691513e..679af657af5c 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -326,7 +326,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, wall_clock_start = time.perf_counter() dataset = PygNodePropPredDataset(name='ogbn-papers100M', - root='/datasets/abarghi/ogb_datasets') + root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() data = dataset[0] data.y = data.y.reshape(-1) From 54a9cf59648a139e2b5fdcdf1d2676b64d39b6ac Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 22 Jan 2024 17:09:20 -0800 Subject: [PATCH 113/197] cleaning --- examples/ogbn_papers_100m.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 143de766245f..55979244f717 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -39,7 +39,8 @@ wall_clock_start = time.perf_counter() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -dataset = PygNodePropPredDataset(name='ogbn-papers100M') +dataset = PygNodePropPredDataset(name='ogbn-papers100M', + root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() if args.cugraph_data_loader: from cugraph.testing.mg_utils import enable_spilling From 7f0dc202dace4878835ac908b1425248d6715e5e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 12:44:49 -0800 Subject: [PATCH 114/197] new better hyperparams increasing test acc to 45% --- examples/ogbn_papers_100m.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 55979244f717..a33d7e8b4577 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -11,12 +11,12 @@ from torch_geometric.loader import NeighborLoader parser = argparse.ArgumentParser() -parser.add_argument('--hidden_channels', type=int, default=128) -parser.add_argument('--num_layers', type=int, default=2) +parser.add_argument('--hidden_channels', type=int, default=256) +parser.add_argument('--num_layers', type=int, default=3) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=3) -parser.add_argument('--batch_size', type=int, default=2048) -parser.add_argument('--fan_out', type=int, default=16) +parser.add_argument('--batch_size', type=int, default=1024) +parser.add_argument('--fan_out', type=int, default=10) parser.add_argument( "--use_gat_conv", action='store_true', @@ -55,7 +55,7 @@ def get_num_workers() -> int: kwargs = dict( - num_neighbors=[args.fan_out, args.fan_out], + num_neighbors=[args.fan_out] * args.num_layers, batch_size=args.batch_size, ) # Set Up Neighbor Loading From 48e7e2af4b30292aebe0bd1ebbd9044477cc3fda Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 12:47:39 -0800 Subject: [PATCH 115/197] new 45% test acc hyperparams 
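[Editor's note, not part of the original commit] The change below threads the new `num_layers` argument into neighbor sampling, so the fan-out list always has one entry per GNN layer. A minimal sketch of the resulting loader configuration, assuming the defaults introduced in this patch (fan_out=10, num_layers=3, batch_size=1024); names mirror the diff that follows:

    fan_out, num_layers, batch_size = 10, 3, 1024
    kwargs = dict(
        num_neighbors=[fan_out] * num_layers,  # one fan-out value per GNN layer
        batch_size=batch_size,
    )
    # e.g. NeighborLoader(data, input_nodes=split_idx['train'], **kwargs)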
--- examples/multi_gpu/papers100m_gcn.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 679af657af5c..0c8d0fd4a3d4 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -85,7 +85,7 @@ def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx, num_classes, cugraph_data_loader, wall_clock_start, - tempdir=None): + tempdir=None, num_layers=3): init_pytorch_worker( rank, @@ -111,7 +111,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, weight_decay=0.0005) kwargs = dict( - num_neighbors=[fan_out, fan_out], + num_neighbors=[fan_out] * num_layers, batch_size=batch_size, ) # Set Up Neighbor Loading @@ -297,12 +297,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--hidden_channels', type=int, default=128) - parser.add_argument('--num_layers', type=int, default=2) + parser.add_argument('--hidden_channels', type=int, default=256) + parser.add_argument('--num_layers', type=int, default=3) parser.add_argument('--lr', type=float, default=0.001) - parser.add_argument('--epochs', type=int, default=3) - parser.add_argument('--batch_size', type=int, default=2048) - parser.add_argument('--fan_out', type=int, default=16) + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--batch_size', type=int, default=1024) + parser.add_argument('--fan_out', type=int, default=10) parser.add_argument( "--use_gat_conv", action='store_true', @@ -351,5 +351,5 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start, tempdir), + args.cugraph_data_loader, wall_clock_start, tempdir, args.num_layers), nprocs=world_size, join=True) From 87d923306a09183428c1c2060606be8bec815ec8 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 12:48:00 -0800 Subject: [PATCH 116/197] 20 epochs --- examples/ogbn_papers_100m.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index a33d7e8b4577..ee3e7600205e 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -12,7 +12,7 @@ parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=256) -parser.add_argument('--num_layers', type=int, default=3) +parser.add_argument('--num_layers', type=int, default=20) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=3) parser.add_argument('--batch_size', type=int, default=1024) From 6d5b057c22a793f85da87e9876b75841732918e8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Jan 2024 20:50:11 +0000 Subject: [PATCH 117/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 0c8d0fd4a3d4..4408e2559754 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ 
b/examples/multi_gpu/papers100m_gcn.py @@ -348,8 +348,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, with tempfile.TemporaryDirectory() as tempdir: mp.spawn( - run_train, - args=(data, world_size, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start, tempdir, args.num_layers), + run_train, args=(data, world_size, model, args.epochs, + args.batch_size, args.fan_out, split_idx, + dataset.num_classes, args.cugraph_data_loader, + wall_clock_start, tempdir, args.num_layers), nprocs=world_size, join=True) From 183d6713e6d46beee268e6ef4ae3d492ed73887b Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 14:25:06 -0800 Subject: [PATCH 118/197] adding cuda synchronize --- examples/multi_gpu/papers100m_gcn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 4408e2559754..1be3a2760fa8 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -216,6 +216,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + ", Loss: " + str(loss)) dist.barrier() + torch.cuda.synchronize() if cugraph_data_loader: eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') From 4b13b307e2b173a4a47482986eeba6ebd6838e22 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 14:25:43 -0800 Subject: [PATCH 119/197] adding cuda sync --- examples/ogbn_papers_100m.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index ee3e7600205e..3a2d53f59c6f 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -137,7 +137,7 @@ def train(): if i % 10 == 0: print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') - + torch.cuda.synchronize() print(f'Average Training Iteration Time (s/iter): \ {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}') From a0bd85b77ad3fb74ca542bed153197ddcd4b476a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 14:28:41 -0800 Subject: [PATCH 120/197] fixing timing --- examples/multi_gpu/papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 1be3a2760fa8..31a68b9c75d3 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -198,7 +198,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, input_files=input_files) with Join([model], divide_by_initial_world_size=False): for i, batch in enumerate(train_loader): - if i >= warmup_steps: + if i == warmup_steps: start = time.time() batch = batch.to(rank) From 4a15b7ac3b38b8e654a056914ff4f34b9ab5d378 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 15:09:44 -0800 Subject: [PATCH 121/197] cleaning --- examples/multi_gpu/papers100m_gcn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 31a68b9c75d3..c8c2544c4eeb 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -217,6 +217,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, ", Loss: " + str(loss)) dist.barrier() torch.cuda.synchronize() + if rank == 0: + print("Average Training Iteration Time:", + 
(time.time() - start) / (i - warmup_steps), "s/iter") if cugraph_data_loader: eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') @@ -226,8 +229,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, directory=eval_path, input_files=input_files) with Join([model], divide_by_initial_world_size=False): - print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") acc_sum = 0.0 with torch.no_grad(): for i, batch in enumerate(eval_loader): From e047aa229af46f58a49d17753dfc6c2b9ca414cc Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 29 Jan 2024 17:27:13 -0800 Subject: [PATCH 122/197] cleaning --- examples/multi_gpu/papers100m_gcn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index c8c2544c4eeb..07749971344f 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -251,7 +251,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) - print(f"Validation Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) + if rank == 0: + print(f"Validation Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() with Join([model], divide_by_initial_world_size=False): @@ -282,7 +283,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, nb = torch.tensor(float(i), dtype=torch.float32, device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) - print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) + if rank == 0: + print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() if cugraph_data_loader and rank == 0: From 704f974481b7a08c01df1307017d14475c828541 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 10:35:51 -0800 Subject: [PATCH 123/197] fixing typo --- examples/ogbn_papers_100m.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 3a2d53f59c6f..ce38e750dde6 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -12,9 +12,9 @@ parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=256) -parser.add_argument('--num_layers', type=int, default=20) +parser.add_argument('--num_layers', type=int, default=3) parser.add_argument('--lr', type=float, default=0.001) -parser.add_argument('--epochs', type=int, default=3) +parser.add_argument('--epochs', type=int, default=20) parser.add_argument('--batch_size', type=int, default=1024) parser.add_argument('--fan_out', type=int, default=10) parser.add_argument( From 1706a037fbb24626aa6439d95c2dd506affa60fe Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 13:58:03 -0800 Subject: [PATCH 124/197] new hyperparams --- examples/multi_gpu/papers100m_gcn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 07749971344f..e11864b3d238 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -302,11 +302,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=256) - parser.add_argument('--num_layers', type=int, default=3) + parser.add_argument('--num_layers', type=int, 
default=2) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=20) parser.add_argument('--batch_size', type=int, default=1024) - parser.add_argument('--fan_out', type=int, default=10) + parser.add_argument('--fan_out', type=int, default=30) parser.add_argument( "--use_gat_conv", action='store_true', From 074e7ae9dc7210c2c05fa0a7eee4621db9f69149 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 13:58:21 -0800 Subject: [PATCH 125/197] new hyperparams --- examples/ogbn_papers_100m.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index ce38e750dde6..1c953a407e65 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -12,11 +12,11 @@ parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=256) -parser.add_argument('--num_layers', type=int, default=3) +parser.add_argument('--num_layers', type=int, default=2) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=20) parser.add_argument('--batch_size', type=int, default=1024) -parser.add_argument('--fan_out', type=int, default=10) +parser.add_argument('--fan_out', type=int, default=30) parser.add_argument( "--use_gat_conv", action='store_true', From e0fc93369ae47357aa713696c98d0d81ea42412e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 16:32:48 -0800 Subject: [PATCH 126/197] cuda sync for timing --- examples/multi_gpu/papers100m_gcn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index e11864b3d238..46e2c618a585 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -199,6 +199,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, with Join([model], divide_by_initial_world_size=False): for i, batch in enumerate(train_loader): if i == warmup_steps: + torch.cuda.synchronize() start = time.time() batch = batch.to(rank) From 5fb709442a073e279f54b117c40bcdfb2d082c21 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 16:35:59 -0800 Subject: [PATCH 127/197] cuda sync for timing --- examples/ogbn_papers_100m.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 1c953a407e65..f7a1900db31e 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -125,6 +125,7 @@ def train(): batch = batch.to_homogeneous() if i == warmup_steps: + torch.cuda.synchronize() start_avg_time = time.perf_counter() batch = batch.to(device) optimizer.zero_grad() From 15e63f373c7720bd9e3818ebec6484f30a9e4036 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 30 Jan 2024 16:42:17 -0800 Subject: [PATCH 128/197] adding n_devices flag --- examples/multi_gpu/papers100m_gcn.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 46e2c618a585..2af6d70dc3f8 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -326,6 +326,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, \nNote that this requires more GPU memory or \ a reduction in batch_size/fan_out/hidden_channels/num_layers", ) + parser.add_argument("--n_devices", type=int, default=-1, + help="1-8 to use that many GPUs. 
Defaults to all available GPUs") args = parser.parse_args() wall_clock_start = time.perf_counter() @@ -348,13 +350,21 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, dataset.num_classes) print("Data =", data) - world_size = torch.cuda.device_count() + if args.n_devices == -1: + world_size = torch.cuda.device_count() + else: + world_size = args.n_devices print('Let\'s use', world_size, 'GPUs!') - - with tempfile.TemporaryDirectory() as tempdir: - mp.spawn( - run_train, args=(data, world_size, model, args.epochs, - args.batch_size, args.fan_out, split_idx, - dataset.num_classes, args.cugraph_data_loader, - wall_clock_start, tempdir, args.num_layers), - nprocs=world_size, join=True) + if world_size > 1: + with tempfile.TemporaryDirectory() as tempdir: + mp.spawn( + run_train, args=(data, world_size, model, args.epochs, + args.batch_size, args.fan_out, split_idx, + dataset.num_classes, args.cugraph_data_loader, + wall_clock_start, tempdir, args.num_layers), + nprocs=world_size, join=True) + else: + run_train(data, world_size, model, args.epochs, + args.batch_size, args.fan_out, split_idx, + dataset.num_classes, args.cugraph_data_loader, + wall_clock_start, tempdir, args.num_layers) From 74ab468329e71f4491f57214bca0c26886105d81 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Jan 2024 00:44:37 +0000 Subject: [PATCH 129/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 2af6d70dc3f8..af8d02c69b3b 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -326,8 +326,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, \nNote that this requires more GPU memory or \ a reduction in batch_size/fan_out/hidden_channels/num_layers", ) - parser.add_argument("--n_devices", type=int, default=-1, - help="1-8 to use that many GPUs. Defaults to all available GPUs") + parser.add_argument( + "--n_devices", type=int, default=-1, + help="1-8 to use that many GPUs. 
Defaults to all available GPUs") args = parser.parse_args() wall_clock_start = time.perf_counter() @@ -364,7 +365,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, wall_clock_start, tempdir, args.num_layers), nprocs=world_size, join=True) else: - run_train(data, world_size, model, args.epochs, - args.batch_size, args.fan_out, split_idx, - dataset.num_classes, args.cugraph_data_loader, - wall_clock_start, tempdir, args.num_layers) + run_train(data, world_size, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes, + args.cugraph_data_loader, wall_clock_start, tempdir, + args.num_layers) From 301420c12cefbbc94d0427b71cf81933dd0762a9 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 31 Jan 2024 10:07:28 -0800 Subject: [PATCH 130/197] adding timer for training prep --- examples/multi_gpu/papers100m_gcn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index af8d02c69b3b..134ed4781f9a 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -180,11 +180,14 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, test_loader = NeighborLoader(data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) - dist.barrier() eval_steps = 1000 warmup_steps = 20 acc = Accuracy(task="multiclass", num_classes=num_classes).to(rank) + dist.barrier() + torch.cuda.synchronize() if rank == 0: + print("Total time before training begins=", + round(time.perf_counter() - wall_clock_start, 2), "seconds") print("Beginning training...") for epoch in range(epochs): if cugraph_data_loader: From 133fbf663bc19a1fa5fb171cc042311f55674242 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 31 Jan 2024 10:09:04 -0800 Subject: [PATCH 131/197] training prep timer --- examples/ogbn_papers_100m.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index f7a1900db31e..07c8285401ac 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -164,7 +164,9 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): return total_correct / total_examples - +torch.cuda.synchronize() +print("Total time before training begins=", round(time.perf_counter() - wall_clock_start, 2), "seconds") +print("Beginning training...") for epoch in range(1, 1 + args.epochs): train() val_acc = test(val_loader, val_steps=100) From 370ab980247fa4ad8c100c9459c47fff5854e8bf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Jan 2024 18:10:10 +0000 Subject: [PATCH 132/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 07c8285401ac..617909f99a2f 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -164,8 +164,10 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): return total_correct / total_examples + torch.cuda.synchronize() -print("Total time before training begins=", round(time.perf_counter() - wall_clock_start, 2), "seconds") +print("Total time before training begins=", + round(time.perf_counter() - wall_clock_start, 2), "seconds") print("Beginning training...") for epoch in range(1, 1 + 
args.epochs): train() From ddc4262fd568e92cb4f33a77be14911c2c16f375 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 31 Jan 2024 12:54:58 -0800 Subject: [PATCH 133/197] fixing syntax --- examples/multi_gpu/papers100m_gcn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 134ed4781f9a..da6d3ecbe3a5 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -359,16 +359,16 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, else: world_size = args.n_devices print('Let\'s use', world_size, 'GPUs!') - if world_size > 1: - with tempfile.TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir: + if world_size > 1: mp.spawn( run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader, wall_clock_start, tempdir, args.num_layers), nprocs=world_size, join=True) - else: - run_train(data, world_size, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start, tempdir, - args.num_layers) + else: + run_train(data, world_size, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes, + args.cugraph_data_loader, wall_clock_start, tempdir, + args.num_layers) From 38c65bfdf674894f31153dc40452d75a5b13ae9a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 31 Jan 2024 12:58:31 -0800 Subject: [PATCH 134/197] fixing typo --- examples/multi_gpu/papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index da6d3ecbe3a5..e25ec6601dac 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -368,7 +368,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, wall_clock_start, tempdir, args.num_layers), nprocs=world_size, join=True) else: - run_train(data, world_size, model, args.epochs, args.batch_size, + run_train(0, data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, args.cugraph_data_loader, wall_clock_start, tempdir, args.num_layers) From 6190e1689ec2a6273a3552e0d7841557e0f03360 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 5 Feb 2024 12:02:57 -0800 Subject: [PATCH 135/197] clean up --- examples/multi_gpu/papers100m_gcn.py | 34 ++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index e25ec6601dac..944499eddbdc 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -219,11 +219,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if rank == 0 and i % 10 == 0: print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + ", Loss: " + str(loss)) + nb = i + 1 dist.barrier() torch.cuda.synchronize() if rank == 0: print("Average Training Iteration Time:", - (time.time() - start) / (i - warmup_steps), "s/iter") + (time.time() - start) / (nb - warmup_steps), "s/iter") if cugraph_data_loader: eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') @@ -248,13 +249,15 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, out = model.module(batch.x, batch.edge_index) acc_sum += acc(out[:batch_size].softmax(dim=-1), 
batch.y[:batch_size]) - - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) - dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i), dtype=torch.float32, - device=acc_sum.device) - dist.all_reduce(nb, op=dist.ReduceOp.SUM) + if world_size > 1: + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) + dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) + nb = torch.tensor(float(i + 1), dtype=torch.float32, + device=acc_sum.device) + dist.all_reduce(nb, op=dist.ReduceOp.SUM) + else: + nb = i + 1 if rank == 0: print(f"Validation Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() @@ -281,12 +284,15 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) - dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i), dtype=torch.float32, - device=acc_sum.device) - dist.all_reduce(nb, op=dist.ReduceOp.SUM) + if world_size > 1: + acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, + device=rank) + dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) + nb = torch.tensor(float(i + 1), dtype=torch.float32, + device=acc_sum.device) + dist.all_reduce(nb, op=dist.ReduceOp.SUM) + else: + nb = i + 1 if rank == 0: print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() From 7ea89d34a05189d5c3eb896ec5d3cd9e40b49db4 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 5 Feb 2024 12:32:41 -0800 Subject: [PATCH 136/197] cleaning --- examples/multi_gpu/papers100m_gcn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 944499eddbdc..724f9e25e8bd 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -219,7 +219,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if rank == 0 and i % 10 == 0: print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + ", Loss: " + str(loss)) - nb = i + 1 + nb = i + 1.0 dist.barrier() torch.cuda.synchronize() if rank == 0: @@ -257,7 +257,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) else: - nb = i + 1 + nb = i + 1.0 if rank == 0: print(f"Validation Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() @@ -292,7 +292,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, device=acc_sum.device) dist.all_reduce(nb, op=dist.ReduceOp.SUM) else: - nb = i + 1 + nb = i + 1.0 if rank == 0: print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) dist.barrier() From 88ce13a6877d7d9f0c1ac9635464da0f89fefabf Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 5 Feb 2024 13:20:52 -0800 Subject: [PATCH 137/197] better_timer --- examples/multi_gpu/papers100m_gcn.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 724f9e25e8bd..3b1b2c7facb7 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -186,8 +186,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, dist.barrier() torch.cuda.synchronize() if rank == 0: - print("Total time before training begins=", - round(time.perf_counter() - wall_clock_start, 2), "seconds") + prep_time = round(time.perf_counter() - 
wall_clock_start, 2) + print("Total time before training begins (prep_time) =", + prep_time, "seconds") print("Beginning training...") for epoch in range(epochs): if cugraph_data_loader: @@ -304,8 +305,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, shutdown_dask_client(client) dist.barrier() if rank == 0: - print("Total Program Runtime =", - round(time.perf_counter() - wall_clock_start, 2), "seconds") + total_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total Program Runtime (total_time) =", + total_time, "seconds") + print("total_time - prep_time =", + total_time - prep_time, "seconds") if __name__ == '__main__': From 9ea04c8d8dc3c5504e0407dd4e7ccec15556890a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 21:21:56 +0000 Subject: [PATCH 138/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 3b1b2c7facb7..e6b4fb8848cc 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -187,8 +187,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, torch.cuda.synchronize() if rank == 0: prep_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total time before training begins (prep_time) =", - prep_time, "seconds") + print("Total time before training begins (prep_time) =", prep_time, + "seconds") print("Beginning training...") for epoch in range(epochs): if cugraph_data_loader: @@ -306,10 +306,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, dist.barrier() if rank == 0: total_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total Program Runtime (total_time) =", - total_time, "seconds") - print("total_time - prep_time =", - total_time - prep_time, "seconds") + print("Total Program Runtime (total_time) =", total_time, "seconds") + print("total_time - prep_time =", total_time - prep_time, "seconds") if __name__ == '__main__': From 149abc68e5a4a75f62e0fb5071c35daab9ec0e80 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Mon, 5 Feb 2024 13:22:34 -0800 Subject: [PATCH 139/197] better timing --- examples/ogbn_papers_100m.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 617909f99a2f..6fc03e9cf319 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -166,8 +166,9 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): torch.cuda.synchronize() -print("Total time before training begins=", - round(time.perf_counter() - wall_clock_start, 2), "seconds") +prep_time = round(time.perf_counter() - wall_clock_start, 2) +print("Total time before training begins (prep_time)=", + prep_time, "seconds") print("Beginning training...") for epoch in range(1, 1 + args.epochs): train() @@ -176,5 +177,7 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): test_acc = test(test_loader) print(f'Test Acc: {test_acc:.4f}') -print("Total Program Runtime =", - round(time.perf_counter() - wall_clock_start, 2), "seconds") +total_time = round(time.perf_counter() - wall_clock_start, 2) +print("Total Program Runtime (total_time) =", + total_time, "seconds") +print("total_time - 
prep_time =", total_time - prep_time, "seconds") From ffc8918f8a32e92bfbb8066f80b0ed9bbdb9a832 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 21:24:01 +0000 Subject: [PATCH 140/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 6fc03e9cf319..4b33cac4989e 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -167,8 +167,7 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): torch.cuda.synchronize() prep_time = round(time.perf_counter() - wall_clock_start, 2) -print("Total time before training begins (prep_time)=", - prep_time, "seconds") +print("Total time before training begins (prep_time)=", prep_time, "seconds") print("Beginning training...") for epoch in range(1, 1 + args.epochs): train() @@ -178,6 +177,5 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): test_acc = test(test_loader) print(f'Test Acc: {test_acc:.4f}') total_time = round(time.perf_counter() - wall_clock_start, 2) -print("Total Program Runtime (total_time) =", - total_time, "seconds") +print("Total Program Runtime (total_time) =", total_time, "seconds") print("total_time - prep_time =", total_time - prep_time, "seconds") From 00d04e24e64cf9400e9e6a871472bb274b5733cd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:58:34 +0000 Subject: [PATCH 141/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5de29f8b1785..b3dd6c359d5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added - Added support for cuGraph data loading and `GAT` in single node Papers100m examples ([#8173](https://github.com/pyg-team/pytorch_geometric/pull/8173)) -======= + \======= + ### Changed - Breaking Change: Added support for `EdgeIndex` in `cugraph` GNN layers ([#8938](https://github.com/pyg-team/pytorch_geometric/pull/8937)) From aeb40006d14ad3a341a48b87eb94db242a5cde06 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 13:31:17 -0800 Subject: [PATCH 142/197] cleanup accuracy eval --- examples/multi_gpu/papers100m_gcn.py | 31 ++++++---------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index e6b4fb8848cc..6beb58889421 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -235,7 +235,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, directory=eval_path, input_files=input_files) with Join([model], divide_by_initial_world_size=False): - acc_sum = 0.0 with torch.no_grad(): for i, batch in enumerate(eval_loader): if i >= eval_steps: @@ -248,19 +247,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), + acc_i = acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - if world_size > 1: - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) - dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i + 1), dtype=torch.float32, - device=acc_sum.device) - dist.all_reduce(nb, op=dist.ReduceOp.SUM) - else: - nb = i + 1.0 + acc_sum = acc.compute() if rank == 0: - print(f"Validation Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) + print(f"Validation Accuracy: {acc_sum * 100.0:.4f}%", ) dist.barrier() with Join([model], divide_by_initial_world_size=False): @@ -272,7 +263,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, test_loader = BulkSampleLoader(cugraph_store, cugraph_store, directory=test_path, input_files=input_files) - acc_sum = 0.0 with torch.no_grad(): for i, batch in enumerate(test_loader): batch = batch.to(rank) @@ -282,20 +272,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_sum += acc(out[:batch_size].softmax(dim=-1), + acc_i = acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - - if world_size > 1: - acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, - device=rank) - dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) - nb = torch.tensor(float(i + 1), dtype=torch.float32, - device=acc_sum.device) - dist.all_reduce(nb, op=dist.ReduceOp.SUM) - else: - nb = i + 1.0 + acc_sum = acc.compute() if rank == 0: - print(f"Test Accuracy: {acc_sum/(nb) * 100.0:.4f}%", ) + print(f"Test Accuracy: {acc_sum * 100.0:.4f}%", ) dist.barrier() if cugraph_data_loader and rank == 0: From 5af3086eaf64be62fa43d90470ce391fd9fa9e87 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 21:32:18 +0000 Subject: [PATCH 143/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 6beb58889421..4d490cf9018f 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -248,7 +248,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_i = acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: print(f"Validation Accuracy: {acc_sum * 100.0:.4f}%", ) @@ -273,7 +273,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_i = acc(out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: print(f"Test Accuracy: {acc_sum * 100.0:.4f}%", ) From eb5fbf086cabe133fc0a81121c2542cc22a8ef9e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 13:35:47 -0800 Subject: [PATCH 144/197] precommit cleanup --- examples/multi_gpu/papers100m_gcn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 4d490cf9018f..bdee07c99f8a 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -247,7 +247,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_i = acc(out[:batch_size].softmax(dim=-1), + acc_i = acc(out[:batch_size].softmax(dim=-1), # noqa batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: @@ -272,7 +272,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_i = acc(out[:batch_size].softmax(dim=-1), + acc_i = acc(out[:batch_size].softmax(dim=-1), # noqa batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: From 7bd924cb01f769a45f42fa08c367b23bce9c76d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 21:37:44 +0000 Subject: [PATCH 145/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index bdee07c99f8a..655fba01a2c0 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -247,8 +247,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_i = acc(out[:batch_size].softmax(dim=-1), # noqa - batch.y[:batch_size]) + acc_i = acc( + out[:batch_size].softmax(dim=-1), # noqa + batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: print(f"Validation Accuracy: {acc_sum * 100.0:.4f}%", ) @@ -272,8 +273,9 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_i = acc(out[:batch_size].softmax(dim=-1), # noqa - batch.y[:batch_size]) + acc_i = acc( + out[:batch_size].softmax(dim=-1), # noqa + batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: print(f"Test Accuracy: {acc_sum * 
100.0:.4f}%", ) From 590712460434d580b317fdb6ff1fb0dfb0addf57 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 13:43:17 -0800 Subject: [PATCH 146/197] cleaning precommit ci up --- examples/multi_gpu/papers100m_gcn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 655fba01a2c0..0bbe2df1bd50 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -247,8 +247,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_i = acc( - out[:batch_size].softmax(dim=-1), # noqa + acc_i = acc( # noqa + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: @@ -273,8 +273,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_i = acc( - out[:batch_size].softmax(dim=-1), # noqa + acc_i = acc( # noqa + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: From c4316451c9498f0f1f51edd6f484b1c8e1ed81b2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 21:44:17 +0000 Subject: [PATCH 147/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 0bbe2df1bd50..b1eddff4e935 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -248,8 +248,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) acc_i = acc( # noqa - out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: print(f"Validation Accuracy: {acc_sum * 100.0:.4f}%", ) @@ -273,9 +272,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch.y = batch.y.to(torch.long) out = model.module(batch.x, batch.edge_index) - acc_i = acc( # noqa - out[:batch_size].softmax(dim=-1), - batch.y[:batch_size]) + acc_i = acc( # noqa + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: print(f"Test Accuracy: {acc_sum * 100.0:.4f}%", ) From 816ac8b20478085b311936d5c4e22c6dfad465ad Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 14:47:29 -0800 Subject: [PATCH 148/197] splitting examples up w/ and w/o cugraph --- examples/ogbn_papers_100m.py | 61 +++++------------------------------- 1 file changed, 8 insertions(+), 53 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 4b33cac4989e..592ab009ff93 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -28,13 +28,6 @@ default=4, help="If using GATConv, number of attention heads to use", ) -parser.add_argument( - "--cugraph_data_loader", - action='store_true', - help="Wether or not to use CuGraph for Neighbor Loading. 
\ - \nNote that this requires more GPU memory or \ - a reduction in batch_size/fan_out/hidden_channels/num_layers", -) args = parser.parse_args() wall_clock_start = time.perf_counter() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @@ -42,11 +35,6 @@ dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() -if args.cugraph_data_loader: - from cugraph.testing.mg_utils import enable_spilling - enable_spilling() - - def get_num_workers() -> int: try: return len(os.sched_getaffinity(0)) // 2 @@ -60,47 +48,14 @@ def get_num_workers() -> int: ) # Set Up Neighbor Loading data = dataset[0] -if args.cugraph_data_loader: - import cupy - import rmm - - rmm.reinitialize(devices=[0], pool_allocator=True, initial_pool_size=78e9, - managed_memory=True) - - from rmm.allocators.torch import rmm_torch_allocator - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - - from rmm.allocators.cupy import rmm_cupy_allocator - cupy.cuda.set_allocator(rmm_cupy_allocator) - - import cugraph - from cugraph_pyg.data import CuGraphStore - from cugraph_pyg.loader import CuGraphNeighborLoader - G = {("N", "E", "N"): data.edge_index} - N = {"N": data.num_nodes} - fs = cugraph.gnn.FeatureStore(backend="torch") - fs.add_data(data.x, "N", "x") - fs.add_data(data.y, "N", "y") - cugraph_store = CuGraphStore(fs, G, N) - train_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['train'], - shuffle=True, drop_last=True, - **kwargs) - val_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['valid'], - **kwargs) - test_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['test'], - **kwargs) -else: - num_work = get_num_workers() - train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], - num_workers=num_work, drop_last=True, - shuffle=False, **kwargs) - val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) +num_work = get_num_workers() +train_loader = NeighborLoader(data=data, input_nodes=split_idx['train'], + num_workers=num_work, drop_last=True, + shuffle=False, **kwargs) +val_loader = NeighborLoader(data=data, input_nodes=split_idx['valid'], + num_workers=num_work, **kwargs) +test_loader = NeighborLoader(data=data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) if args.use_gat_conv: model = torch_geometric.nn.models.GAT( From 2868caaa990db386eaa02d408eaf905fb2a9162c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:48:30 +0000 Subject: [PATCH 149/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 592ab009ff93..9defae99330d 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -35,6 +35,8 @@ dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() + + def get_num_workers() -> int: try: return len(os.sched_getaffinity(0)) // 2 From 68db875b27308f5f6328efeabdbc7e6539c8b642 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 14:48:46 -0800 Subject: [PATCH 150/197] splitting single gpu examples w/ and w/o 
cugraph --- examples/ogbn_papers_100m_cugraph.py | 159 +++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 examples/ogbn_papers_100m_cugraph.py diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py new file mode 100644 index 000000000000..f2328f1d19d9 --- /dev/null +++ b/examples/ogbn_papers_100m_cugraph.py @@ -0,0 +1,159 @@ +import argparse +import os +import time +from typing import Optional + +import torch +import torch.nn.functional as F +from ogb.nodeproppred import PygNodePropPredDataset + +import torch_geometric +from torch_geometric.loader import NeighborLoader +import cupy +import rmm +import cugraph +from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.loader import CuGraphNeighborLoader +from rmm.allocators.torch import rmm_torch_allocator +from rmm.allocators.cupy import rmm_cupy_allocator + +parser = argparse.ArgumentParser() +parser.add_argument('--hidden_channels', type=int, default=256) +parser.add_argument('--num_layers', type=int, default=2) +parser.add_argument('--lr', type=float, default=0.001) +parser.add_argument('--epochs', type=int, default=20) +parser.add_argument('--batch_size', type=int, default=1024) +parser.add_argument('--fan_out', type=int, default=30) +parser.add_argument( + "--use_gat_conv", + action='store_true', + help="Wether or not to use GATConv. (Defaults to using GCNConv)", +) +parser.add_argument( + "--n_gat_conv_heads", + type=int, + default=4, + help="If using GATConv, number of attention heads to use", +) +args = parser.parse_args() +wall_clock_start = time.perf_counter() +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +dataset = PygNodePropPredDataset(name='ogbn-papers100M', + root='/datasets/ogb_datasets') +split_idx = dataset.get_idx_split() +from cugraph.testing.mg_utils import enable_spilling +enable_spilling() + + +def get_num_workers() -> int: + try: + return len(os.sched_getaffinity(0)) // 2 + except Exception: + return os.cpu_count() // 2 + + +kwargs = dict( + num_neighbors=[args.fan_out] * args.num_layers, + batch_size=args.batch_size, +) +# Set Up Neighbor Loading +data = dataset[0] +rmm.reinitialize(devices=[0], pool_allocator=True, initial_pool_size=78e9, + managed_memory=True) +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) +cupy.cuda.set_allocator(rmm_cupy_allocator) +G = {("N", "E", "N"): data.edge_index} +N = {"N": data.num_nodes} +fs = cugraph.gnn.FeatureStore(backend="torch") +fs.add_data(data.x, "N", "x") +fs.add_data(data.y, "N", "y") +cugraph_store = CuGraphStore(fs, G, N) +train_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['train'], + shuffle=True, drop_last=True, + **kwargs) +val_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['valid'], + **kwargs) +test_loader = CuGraphNeighborLoader(cugraph_store, + input_nodes=split_idx['test'], + **kwargs) + +if args.use_gat_conv: + model = torch_geometric.nn.models.GAT( + dataset.num_features, args.hidden_channels, args.num_layers, + dataset.num_classes, heads=args.n_gat_conv_heads).to(device) +else: + model = torch_geometric.nn.models.GCN(dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes).to(device) + +optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, + weight_decay=0.0005) + +warmup_steps = 20 + + +def train(): + model.train() + for i, batch in enumerate(train_loader): + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + + if i == 
warmup_steps: + torch.cuda.synchronize() + start_avg_time = time.perf_counter() + batch = batch.to(device) + optimizer.zero_grad() + batch_size = batch.num_sampled_nodes[0] + out = model(batch.x, batch.edge_index)[:batch_size] + y = batch.y[:batch_size].view(-1).to(torch.long) + loss = F.cross_entropy(out, y) + loss.backward() + optimizer.step() + + if i % 10 == 0: + print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') + torch.cuda.synchronize() + print(f'Average Training Iteration Time (s/iter): \ + {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}') + + +@torch.no_grad() +def test(loader: NeighborLoader, val_steps: Optional[int] = None): + model.eval() + + total_correct = total_examples = 0 + for i, batch in enumerate(loader): + if val_steps is not None and i >= val_steps: + break + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch = batch.to(device) + batch_size = batch.num_sampled_nodes[0] + out = model(batch.x, batch.edge_index)[:batch_size] + pred = out.argmax(dim=-1) + y = batch.y[:batch_size].view(-1).to(torch.long) + + total_correct += int((pred == y).sum()) + total_examples += y.size(0) + + return total_correct / total_examples + + +torch.cuda.synchronize() +prep_time = round(time.perf_counter() - wall_clock_start, 2) +print("Total time before training begins (prep_time)=", prep_time, "seconds") +print("Beginning training...") +for epoch in range(1, 1 + args.epochs): + train() + val_acc = test(val_loader, val_steps=100) + print(f'Val Acc: ~{val_acc:.4f}') + +test_acc = test(test_loader) +print(f'Test Acc: {test_acc:.4f}') +total_time = round(time.perf_counter() - wall_clock_start, 2) +print("Total Program Runtime (total_time) =", total_time, "seconds") +print("total_time - prep_time =", total_time - prep_time, "seconds") \ No newline at end of file From 727e5a515c73bf2730553b1afe517b380d71554d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:50:32 +0000 Subject: [PATCH 151/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m_cugraph.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index f2328f1d19d9..d9a7cba5a283 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -3,19 +3,19 @@ import time from typing import Optional +import cugraph +import cupy +import rmm import torch import torch.nn.functional as F +from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.loader import CuGraphNeighborLoader from ogb.nodeproppred import PygNodePropPredDataset +from rmm.allocators.cupy import rmm_cupy_allocator +from rmm.allocators.torch import rmm_torch_allocator import torch_geometric from torch_geometric.loader import NeighborLoader -import cupy -import rmm -import cugraph -from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.loader import CuGraphNeighborLoader -from rmm.allocators.torch import rmm_torch_allocator -from rmm.allocators.cupy import rmm_cupy_allocator parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=256) @@ -43,6 +43,7 @@ root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() from cugraph.testing.mg_utils import enable_spilling + enable_spilling() @@ -71,14 +72,11 @@ def get_num_workers() -> int: cugraph_store = 
CuGraphStore(fs, G, N) train_loader = CuGraphNeighborLoader(cugraph_store, input_nodes=split_idx['train'], - shuffle=True, drop_last=True, - **kwargs) + shuffle=True, drop_last=True, **kwargs) val_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['valid'], - **kwargs) + input_nodes=split_idx['valid'], **kwargs) test_loader = CuGraphNeighborLoader(cugraph_store, - input_nodes=split_idx['test'], - **kwargs) + input_nodes=split_idx['test'], **kwargs) if args.use_gat_conv: model = torch_geometric.nn.models.GAT( @@ -156,4 +154,4 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): print(f'Test Acc: {test_acc:.4f}') total_time = round(time.perf_counter() - wall_clock_start, 2) print("Total Program Runtime (total_time) =", total_time, "seconds") -print("total_time - prep_time =", total_time - prep_time, "seconds") \ No newline at end of file +print("total_time - prep_time =", total_time - prep_time, "seconds") From 4ddcf485b322200676b3a325975cb72fe1bf68f5 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 14:50:48 -0800 Subject: [PATCH 152/197] splitting single node multigpu examples --- examples/multi_gpu/papers100m_gcn.py | 267 ++++++--------------------- 1 file changed, 54 insertions(+), 213 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index b1eddff4e935..ffb2d02bffd8 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -9,46 +9,12 @@ import torch.multiprocessing as mp import torch.nn.functional as F from ogb.nodeproppred import PygNodePropPredDataset -from torch.distributed.algorithms.join import Join from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy +from torch_geometric.loader import NeighborLoader import torch_geometric -# Allow computation on objects that are larger than GPU memory -# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory -os.environ['CUDF_SPILL'] = '1' - -# Ensures that a CUDA context is not created on import of rapids. 
-# Allows pytorch to create the context instead -os.environ['RAPIDS_NO_INITIALIZE'] = '1' - - -def start_dask_cluster(): - from cugraph.testing.mg_utils import enable_spilling - from dask_cuda import LocalCUDACluster - - cluster = LocalCUDACluster( - protocol="tcp", - rmm_pool_size=None, - memory_limit=None, - rmm_async=True, - ) - - from dask.distributed import Client - client = Client(cluster) - client.wait_for_workers(n_workers=len(cluster.workers)) - client.run(enable_spilling) - - print("Dask Cluster Setup Complete") - return client, cluster - - -def shutdown_dask_client(client): - from cugraph.dask.comms import comms as Comms - Comms.destroy() - client.close() - def pyg_num_work(world_size): num_work = None @@ -62,49 +28,27 @@ def pyg_num_work(world_size): return int(num_work) -def init_pytorch_worker(rank, world_size, cugraph_data_loader=False): - if cugraph_data_loader: - import rmm - if rank > 0: - rmm.reinitialize(devices=rank) - - import cupy - cupy.cuda.Device(rank).use() - from rmm.allocators.cupy import rmm_cupy_allocator - cupy.cuda.set_allocator(rmm_cupy_allocator) - - from cugraph.testing.mg_utils import enable_spilling - enable_spilling() - - torch.cuda.set_device(rank) - +def init_pytorch_worker(rank, world_size): os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12355' dist.init_process_group('nccl', rank=rank, world_size=world_size) def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, - split_idx, num_classes, cugraph_data_loader, wall_clock_start, + split_idx, num_classes, wall_clock_start, tempdir=None, num_layers=3): init_pytorch_worker( rank, world_size, - cugraph_data_loader=cugraph_data_loader, ) - if cugraph_data_loader: - if rank == 0: - client, cluster = start_dask_cluster() - from cugraph.dask.comms import comms as Comms - Comms.initialize(p2p=True) - else: - split_idx['train'] = split_idx['train'].split( - split_idx['train'].size(0) // world_size, dim=0)[rank].clone() - split_idx['valid'] = split_idx['valid'].split( - split_idx['valid'].size(0) // world_size, dim=0)[rank].clone() - split_idx['test'] = split_idx['test'].split( - split_idx['test'].size(0) // world_size, dim=0)[rank].clone() + split_idx['train'] = split_idx['train'].split( + split_idx['train'].size(0) // world_size, dim=0)[rank].clone() + split_idx['valid'] = split_idx['valid'].split( + split_idx['valid'].size(0) // world_size, dim=0)[rank].clone() + split_idx['test'] = split_idx['test'].split( + split_idx['test'].size(0) // world_size, dim=0)[rank].clone() model = model.to(rank) model = DistributedDataParallel(model, device_ids=[rank]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, @@ -114,71 +58,14 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, num_neighbors=[fan_out] * num_layers, batch_size=batch_size, ) - # Set Up Neighbor Loading - if cugraph_data_loader: - import cugraph - from cugraph_pyg.data import CuGraphStore - from cugraph_pyg.loader import BulkSampleLoader - G = {("N", "E", "N"): data.edge_index} - N = {"N": data.num_nodes} - fs = cugraph.gnn.FeatureStore(backend="torch") - fs.add_data(data.x, "N", "x") - fs.add_data(data.y, "N", "y") - dist.barrier() - - if rank == 0: - print("Rank 0 creating its cugraph store and \ - initializing distributed graph") - cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) - print("Distributed graph initialization complete.") - - if rank != 0: - print(f"Rank {rank} waiting for distributed graph initialization") - dist.barrier() - - if rank != 0: - print(f"Rank {rank} 
proceeding with store creation") - cugraph_store = CuGraphStore(fs, { - k: len(v) - for k, v in G.items() - }, N, multi_gpu=False) - print(f"Rank {rank} created store") - dist.barrier() - - if rank == 0: - # Direct cuGraph to sample offline prior to the training loop - # Sampling will occur in parallel but will be initiated on rank 0 - for epoch in range(epochs): - train_path = os.path.join(tempdir, f'samples_{epoch}') - os.mkdir(train_path) - BulkSampleLoader(cugraph_store, cugraph_store, - input_nodes=split_idx['train'], - directory=train_path, shuffle=True, - drop_last=True, **kwargs) - - print('validation', len(split_idx['valid'])) - eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') - BulkSampleLoader(cugraph_store, cugraph_store, - input_nodes=split_idx['valid'], - directory=eval_path, **kwargs) - - print('test', len(split_idx['test'])) - test_path = os.path.join(tempdir, 'samples_test') - BulkSampleLoader(cugraph_store, cugraph_store, - input_nodes=split_idx['test'], - directory=test_path, **kwargs) - - dist.barrier() - else: - from torch_geometric.loader import NeighborLoader - num_work = pyg_num_work(world_size) - train_loader = NeighborLoader(data, input_nodes=split_idx['train'], - num_workers=num_work, shuffle=True, - drop_last=True, **kwargs) - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) - test_loader = NeighborLoader(data, input_nodes=split_idx['test'], - num_workers=num_work, **kwargs) + num_work = pyg_num_work(world_size) + train_loader = NeighborLoader(data, input_nodes=split_idx['train'], + num_workers=num_work, shuffle=True, + drop_last=True, **kwargs) + eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], + num_workers=num_work, **kwargs) + test_loader = NeighborLoader(data, input_nodes=split_idx['test'], + num_workers=num_work, **kwargs) eval_steps = 1000 warmup_steps = 20 @@ -191,80 +78,32 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, "seconds") print("Beginning training...") for epoch in range(epochs): - if cugraph_data_loader: - train_path = os.path.join(tempdir, f'samples_{epoch}') - - input_files = np.array_split(np.array(os.listdir(train_path)), - world_size)[rank] - - train_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=train_path, - input_files=input_files) - with Join([model], divide_by_initial_world_size=False): - for i, batch in enumerate(train_loader): - if i == warmup_steps: - torch.cuda.synchronize() - start = time.time() - batch = batch.to(rank) - - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() - batch_size = batch.num_sampled_nodes[0] - - batch.y = batch.y.to(torch.long) - optimizer.zero_grad() - out = model(batch.x, batch.edge_index) - loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) - loss.backward() - optimizer.step() - if rank == 0 and i % 10 == 0: - print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + - ", Loss: " + str(loss)) + for i, batch in enumerate(train_loader): + if i == warmup_steps: + torch.cuda.synchronize() + start = time.time() + batch = batch.to(rank) + batch_size = batch.num_sampled_nodes[0] + batch.y = batch.y.to(torch.long) + optimizer.zero_grad() + out = model(batch.x, batch.edge_index) + loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) + loss.backward() + optimizer.step() + if rank == 0 and i % 10 == 0: + print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + + ", Loss: " + str(loss)) nb = i + 1.0 dist.barrier() 
torch.cuda.synchronize() if rank == 0: print("Average Training Iteration Time:", (time.time() - start) / (nb - warmup_steps), "s/iter") - if cugraph_data_loader: - eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') - - input_files = np.array(os.listdir(eval_path)) - - eval_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=eval_path, - input_files=input_files) - with Join([model], divide_by_initial_world_size=False): - with torch.no_grad(): - for i, batch in enumerate(eval_loader): - if i >= eval_steps: - break - - batch = batch.to(rank) - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() - batch_size = batch.num_sampled_nodes[0] - - batch.y = batch.y.to(torch.long) - out = model.module(batch.x, batch.edge_index) - acc_i = acc( # noqa - out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) - acc_sum = acc.compute() - if rank == 0: - print(f"Validation Accuracy: {acc_sum * 100.0:.4f}%", ) - dist.barrier() - - with Join([model], divide_by_initial_world_size=False): - if cugraph_data_loader: - test_path = os.path.join(tempdir, 'samples_test') - - input_files = np.array(os.listdir(test_path)) - - test_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=test_path, - input_files=input_files) with torch.no_grad(): - for i, batch in enumerate(test_loader): + for i, batch in enumerate(eval_loader): + if i >= eval_steps: + break + batch = batch.to(rank) if isinstance(batch, torch_geometric.data.HeteroData): batch = batch.to_homogeneous() @@ -276,14 +115,23 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() if rank == 0: - print(f"Test Accuracy: {acc_sum * 100.0:.4f}%", ) - dist.barrier() + print(f"Validation Accuracy: {acc_sum * 100.0:.4f}%", ) + dist.barrier() - if cugraph_data_loader and rank == 0: - import gc - del cugraph_store - gc.collect() - shutdown_dask_client(client) + with torch.no_grad(): + for i, batch in enumerate(test_loader): + batch = batch.to(rank) + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch_size = batch.num_sampled_nodes[0] + + batch.y = batch.y.to(torch.long) + out = model.module(batch.x, batch.edge_index) + acc_i = acc( # noqa + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) + acc_sum = acc.compute() + if rank == 0: + print(f"Test Accuracy: {acc_sum * 100.0:.4f}%", ) dist.barrier() if rank == 0: total_time = round(time.perf_counter() - wall_clock_start, 2) @@ -311,13 +159,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, default=4, help="If using GATConv, number of attention heads to use", ) - parser.add_argument( - "--cugraph_data_loader", - action='store_true', - help="Whether or not to use CuGraph for Neighbor Loading. \ - \nNote that this requires more GPU memory or \ - a reduction in batch_size/fan_out/hidden_channels/num_layers", - ) parser.add_argument( "--n_devices", type=int, default=-1, help="1-8 to use that many GPUs. 
Defaults to all available GPUs") @@ -353,11 +194,11 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, mp.spawn( run_train, args=(data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, - dataset.num_classes, args.cugraph_data_loader, + dataset.num_classes, wall_clock_start, tempdir, args.num_layers), nprocs=world_size, join=True) else: run_train(0, data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - args.cugraph_data_loader, wall_clock_start, tempdir, + wall_clock_start, tempdir, args.num_layers) From 2f9a744ade49cfdd770afa3133053e10cbb0a52b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:52:37 +0000 Subject: [PATCH 153/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index ffb2d02bffd8..bb15b77a09e7 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -11,9 +11,9 @@ from ogb.nodeproppred import PygNodePropPredDataset from torch.nn.parallel import DistributedDataParallel from torchmetrics import Accuracy -from torch_geometric.loader import NeighborLoader import torch_geometric +from torch_geometric.loader import NeighborLoader def pyg_num_work(world_size): @@ -35,8 +35,8 @@ def init_pytorch_worker(rank, world_size): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, - split_idx, num_classes, wall_clock_start, - tempdir=None, num_layers=3): + split_idx, num_classes, wall_clock_start, tempdir=None, + num_layers=3): init_pytorch_worker( rank, @@ -192,13 +192,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, with tempfile.TemporaryDirectory() as tempdir: if world_size > 1: mp.spawn( - run_train, args=(data, world_size, model, args.epochs, - args.batch_size, args.fan_out, split_idx, - dataset.num_classes, - wall_clock_start, tempdir, args.num_layers), + run_train, + args=(data, world_size, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes, + wall_clock_start, tempdir, args.num_layers), nprocs=world_size, join=True) else: run_train(0, data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - wall_clock_start, tempdir, - args.num_layers) + wall_clock_start, tempdir, args.num_layers) From e84c3a89c4e5afb28b52c9e98272d969481dab5e Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 15:00:51 -0800 Subject: [PATCH 154/197] Create papers100m_gcn_cugraph.py --- examples/multi_gpu/papers100m_gcn_cugraph.py | 332 +++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 examples/multi_gpu/papers100m_gcn_cugraph.py diff --git a/examples/multi_gpu/papers100m_gcn_cugraph.py b/examples/multi_gpu/papers100m_gcn_cugraph.py new file mode 100644 index 000000000000..1922fe81c61e --- /dev/null +++ b/examples/multi_gpu/papers100m_gcn_cugraph.py @@ -0,0 +1,332 @@ +import argparse +import os +import tempfile +import time + +import numpy as np +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn.functional as F +from ogb.nodeproppred import PygNodePropPredDataset +from torch.distributed.algorithms.join import Join +from 
torch.nn.parallel import DistributedDataParallel +from torchmetrics import Accuracy + +import torch_geometric + +# Allow computation on objects that are larger than GPU memory +# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory +os.environ['CUDF_SPILL'] = '1' + +# Ensures that a CUDA context is not created on import of rapids. +# Allows pytorch to create the context instead +os.environ['RAPIDS_NO_INITIALIZE'] = '1' + + +def start_dask_cluster(): + from cugraph.testing.mg_utils import enable_spilling + from dask_cuda import LocalCUDACluster + + cluster = LocalCUDACluster( + protocol="tcp", + rmm_pool_size=None, + memory_limit=None, + rmm_async=True, + ) + + from dask.distributed import Client + client = Client(cluster) + client.wait_for_workers(n_workers=len(cluster.workers)) + client.run(enable_spilling) + + print("Dask Cluster Setup Complete") + return client, cluster + + +def shutdown_dask_client(client): + from cugraph.dask.comms import comms as Comms + Comms.destroy() + client.close() + + +def pyg_num_work(world_size): + num_work = None + if hasattr(os, "sched_getaffinity"): + try: + num_work = len(os.sched_getaffinity(0)) / (2 * world_size) + except Exception: + pass + if num_work is None: + num_work = os.cpu_count() / (2 * world_size) + return int(num_work) + + +def init_pytorch_worker(rank, world_size): + import rmm + if rank > 0: + rmm.reinitialize(devices=rank) + + import cupy + cupy.cuda.Device(rank).use() + from rmm.allocators.cupy import rmm_cupy_allocator + cupy.cuda.set_allocator(rmm_cupy_allocator) + + from cugraph.testing.mg_utils import enable_spilling + enable_spilling() + + torch.cuda.set_device(rank) + + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + dist.init_process_group('nccl', rank=rank, world_size=world_size) + + +def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, + split_idx, num_classes, wall_clock_start, + tempdir=None, num_layers=3): + + init_pytorch_worker( + rank, + world_size, + ) + + if rank == 0: + client, cluster = start_dask_cluster() + from cugraph.dask.comms import comms as Comms + Comms.initialize(p2p=True) + model = model.to(rank) + model = DistributedDataParallel(model, device_ids=[rank]) + optimizer = torch.optim.Adam(model.parameters(), lr=0.01, + weight_decay=0.0005) + + kwargs = dict( + num_neighbors=[fan_out] * num_layers, + batch_size=batch_size, + ) + # Set Up Neighbor Loading + import cugraph + from cugraph_pyg.data import CuGraphStore + from cugraph_pyg.loader import BulkSampleLoader + G = {("N", "E", "N"): data.edge_index} + N = {"N": data.num_nodes} + fs = cugraph.gnn.FeatureStore(backend="torch") + fs.add_data(data.x, "N", "x") + fs.add_data(data.y, "N", "y") + dist.barrier() + + if rank == 0: + print("Rank 0 creating its cugraph store and \ + initializing distributed graph") + cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) + print("Distributed graph initialization complete.") + + if rank != 0: + print(f"Rank {rank} waiting for distributed graph initialization") + dist.barrier() + + if rank != 0: + print(f"Rank {rank} proceeding with store creation") + cugraph_store = CuGraphStore(fs, { + k: len(v) + for k, v in G.items() + }, N, multi_gpu=False) + print(f"Rank {rank} created store") + dist.barrier() + + if rank == 0: + # Direct cuGraph to sample offline prior to the training loop + # Sampling will occur in parallel but will be initiated on rank 0 + for epoch in range(epochs): + train_path = os.path.join(tempdir, 
f'samples_{epoch}') + os.mkdir(train_path) + BulkSampleLoader(cugraph_store, cugraph_store, + input_nodes=split_idx['train'], + directory=train_path, shuffle=True, + drop_last=True, **kwargs) + + print('validation', len(split_idx['valid'])) + eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') + BulkSampleLoader(cugraph_store, cugraph_store, + input_nodes=split_idx['valid'], + directory=eval_path, **kwargs) + + print('test', len(split_idx['test'])) + test_path = os.path.join(tempdir, 'samples_test') + BulkSampleLoader(cugraph_store, cugraph_store, + input_nodes=split_idx['test'], + directory=test_path, **kwargs) + + dist.barrier() + + eval_steps = 1000 + warmup_steps = 20 + acc = Accuracy(task="multiclass", num_classes=num_classes).to(rank) + dist.barrier() + torch.cuda.synchronize() + if rank == 0: + prep_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total time before training begins (prep_time) =", prep_time, + "seconds") + print("Beginning training...") + for epoch in range(epochs): + train_path = os.path.join(tempdir, f'samples_{epoch}') + + input_files = np.array_split(np.array(os.listdir(train_path)), + world_size)[rank] + + train_loader = BulkSampleLoader(cugraph_store, cugraph_store, + directory=train_path, + input_files=input_files) + with Join([model], divide_by_initial_world_size=False): + for i, batch in enumerate(train_loader): + if i == warmup_steps: + torch.cuda.synchronize() + start = time.time() + batch = batch.to(rank) + + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch_size = batch.num_sampled_nodes[0] + + batch.y = batch.y.to(torch.long) + optimizer.zero_grad() + out = model(batch.x, batch.edge_index) + loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) + loss.backward() + optimizer.step() + if rank == 0 and i % 10 == 0: + print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + + ", Loss: " + str(loss)) + nb = i + 1.0 + dist.barrier() + torch.cuda.synchronize() + if rank == 0: + print("Average Training Iteration Time:", + (time.time() - start) / (nb - warmup_steps), "s/iter") + eval_path = os.path.join(tempdir, f'samples_eval_{epoch}') + + input_files = np.array(os.listdir(eval_path)) + + eval_loader = BulkSampleLoader(cugraph_store, cugraph_store, + directory=eval_path, + input_files=input_files) + with Join([model], divide_by_initial_world_size=False): + with torch.no_grad(): + for i, batch in enumerate(eval_loader): + if i >= eval_steps: + break + + batch = batch.to(rank) + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch_size = batch.num_sampled_nodes[0] + + batch.y = batch.y.to(torch.long) + out = model.module(batch.x, batch.edge_index) + acc_i = acc( # noqa + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) + acc_sum = acc.compute() + if rank == 0: + print(f"Validation Accuracy: {acc_sum * 100.0:.4f}%", ) + dist.barrier() + + with Join([model], divide_by_initial_world_size=False): + if cugraph_data_loader: + test_path = os.path.join(tempdir, 'samples_test') + + input_files = np.array(os.listdir(test_path)) + + test_loader = BulkSampleLoader(cugraph_store, cugraph_store, + directory=test_path, + input_files=input_files) + with torch.no_grad(): + for i, batch in enumerate(test_loader): + batch = batch.to(rank) + if isinstance(batch, torch_geometric.data.HeteroData): + batch = batch.to_homogeneous() + batch_size = batch.num_sampled_nodes[0] + + batch.y = batch.y.to(torch.long) + out = model.module(batch.x, batch.edge_index) + 
acc_i = acc( # noqa + out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) + acc_sum = acc.compute() + if rank == 0: + print(f"Test Accuracy: {acc_sum * 100.0:.4f}%", ) + dist.barrier() + + import gc + del cugraph_store + gc.collect() + shutdown_dask_client(client) + dist.barrier() + if rank == 0: + total_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total Program Runtime (total_time) =", total_time, "seconds") + print("total_time - prep_time =", total_time - prep_time, "seconds") + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--hidden_channels', type=int, default=256) + parser.add_argument('--num_layers', type=int, default=2) + parser.add_argument('--lr', type=float, default=0.001) + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--batch_size', type=int, default=1024) + parser.add_argument('--fan_out', type=int, default=30) + parser.add_argument( + "--use_gat_conv", + action='store_true', + help="Whether or not to use GATConv. (Defaults to using GCNConv)", + ) + parser.add_argument( + "--n_gat_conv_heads", + type=int, + default=4, + help="If using GATConv, number of attention heads to use", + ) + parser.add_argument( + "--n_devices", type=int, default=-1, + help="1-8 to use that many GPUs. Defaults to all available GPUs") + + args = parser.parse_args() + wall_clock_start = time.perf_counter() + + dataset = PygNodePropPredDataset(name='ogbn-papers100M', + root='/datasets/ogb_datasets') + split_idx = dataset.get_idx_split() + data = dataset[0] + data.y = data.y.reshape(-1) + if args.use_gat_conv: + model = torch_geometric.nn.models.GAT(dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, + heads=args.n_gat_conv_heads) + else: + model = torch_geometric.nn.models.GCN(dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes) + + print("Data =", data) + if args.n_devices == -1: + world_size = torch.cuda.device_count() + else: + world_size = args.n_devices + print('Let\'s use', world_size, 'GPUs!') + with tempfile.TemporaryDirectory() as tempdir: + if world_size > 1: + mp.spawn( + run_train, args=(data, world_size, model, args.epochs, + args.batch_size, args.fan_out, split_idx, + dataset.num_classes, + wall_clock_start, tempdir, args.num_layers), + nprocs=world_size, join=True) + else: + run_train(0, data, world_size, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes, + wall_clock_start, tempdir, + args.num_layers) From b9de75698932d40f954bd3814eee62dbd5849fb7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 23:02:06 +0000 Subject: [PATCH 155/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_cugraph.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_cugraph.py b/examples/multi_gpu/papers100m_gcn_cugraph.py index 1922fe81c61e..b17dd86fc91e 100644 --- a/examples/multi_gpu/papers100m_gcn_cugraph.py +++ b/examples/multi_gpu/papers100m_gcn_cugraph.py @@ -83,8 +83,8 @@ def init_pytorch_worker(rank, world_size): def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, - split_idx, num_classes, wall_clock_start, - tempdir=None, num_layers=3): + split_idx, num_classes, wall_clock_start, tempdir=None, + num_layers=3): init_pytorch_worker( rank, @@ 
-154,8 +154,8 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, print('test', len(split_idx['test'])) test_path = os.path.join(tempdir, 'samples_test') BulkSampleLoader(cugraph_store, cugraph_store, - input_nodes=split_idx['test'], - directory=test_path, **kwargs) + input_nodes=split_idx['test'], directory=test_path, + **kwargs) dist.barrier() @@ -320,13 +320,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, with tempfile.TemporaryDirectory() as tempdir: if world_size > 1: mp.spawn( - run_train, args=(data, world_size, model, args.epochs, - args.batch_size, args.fan_out, split_idx, - dataset.num_classes, - wall_clock_start, tempdir, args.num_layers), + run_train, + args=(data, world_size, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes, + wall_clock_start, tempdir, args.num_layers), nprocs=world_size, join=True) else: run_train(0, data, world_size, model, args.epochs, args.batch_size, args.fan_out, split_idx, dataset.num_classes, - wall_clock_start, tempdir, - args.num_layers) + wall_clock_start, tempdir, args.num_layers) From f78737e2bd3c711e8609587df4d3b75daaf8cd09 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 18:07:39 -0800 Subject: [PATCH 156/197] cleaning --- examples/multi_gpu/papers100m_gcn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index bb15b77a09e7..c72546c1dab9 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -3,7 +3,6 @@ import tempfile import time -import numpy as np import torch import torch.distributed as dist import torch.multiprocessing as mp From fd27a341853e98843e0b6550af04b88705d9a29a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 18:09:56 -0800 Subject: [PATCH 157/197] clean up --- examples/multi_gpu/papers100m_gcn_cugraph.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_cugraph.py b/examples/multi_gpu/papers100m_gcn_cugraph.py index b17dd86fc91e..12ee257328a5 100644 --- a/examples/multi_gpu/papers100m_gcn_cugraph.py +++ b/examples/multi_gpu/papers100m_gcn_cugraph.py @@ -232,14 +232,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, dist.barrier() with Join([model], divide_by_initial_world_size=False): - if cugraph_data_loader: - test_path = os.path.join(tempdir, 'samples_test') + test_path = os.path.join(tempdir, 'samples_test') - input_files = np.array(os.listdir(test_path)) + input_files = np.array(os.listdir(test_path)) - test_loader = BulkSampleLoader(cugraph_store, cugraph_store, - directory=test_path, - input_files=input_files) + test_loader = BulkSampleLoader(cugraph_store, cugraph_store, + directory=test_path, + input_files=input_files) with torch.no_grad(): for i, batch in enumerate(test_loader): batch = batch.to(rank) From 3400ad133ca6df1a9827790a1798ce1fbb3ce1aa Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 20 Feb 2024 18:10:31 -0800 Subject: [PATCH 158/197] clean up --- examples/ogbn_papers_100m_cugraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index d9a7cba5a283..a266d8fe1081 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -13,6 +13,7 @@ from ogb.nodeproppred import PygNodePropPredDataset from rmm.allocators.cupy import rmm_cupy_allocator from 
rmm.allocators.torch import rmm_torch_allocator +from cugraph.testing.mg_utils import enable_spilling import torch_geometric from torch_geometric.loader import NeighborLoader @@ -42,7 +43,6 @@ dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() -from cugraph.testing.mg_utils import enable_spilling enable_spilling() From 7136d1d8e0e422188ed2976e60875feb72f9e4ae Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 Feb 2024 02:11:55 +0000 Subject: [PATCH 159/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m_cugraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index a266d8fe1081..b08a7758fab7 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -8,12 +8,12 @@ import rmm import torch import torch.nn.functional as F +from cugraph.testing.mg_utils import enable_spilling from cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import CuGraphNeighborLoader from ogb.nodeproppred import PygNodePropPredDataset from rmm.allocators.cupy import rmm_cupy_allocator from rmm.allocators.torch import rmm_torch_allocator -from cugraph.testing.mg_utils import enable_spilling import torch_geometric from torch_geometric.loader import NeighborLoader From d8923991f9e389877eddfbb1c7412faec87391be Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 29 Feb 2024 09:14:26 -0800 Subject: [PATCH 160/197] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc4c70e298d9..c3b41d29b12b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added - Added support for cuGraph data loading and `GAT` in single node Papers100m examples ([#8173](https://github.com/pyg-team/pytorch_geometric/pull/8173)) -- Added `EdgeIndex.resize_` functionality ([#8983](https://github.com/pyg-team/pytorch_geometric/pull/8983)) - Added a `ogbn-mag240m` example ([#8249](https://github.com/pyg-team/pytorch_geometric/pull/8249/)) - Added `EdgeIndex.sparse_resize_` functionality ([#8983](https://github.com/pyg-team/pytorch_geometric/pull/8983)) - Added approximate `faiss`-based KNN-search ([#8952](https://github.com/pyg-team/pytorch_geometric/pull/8952)) From 6fcf2082a2fbe26cb4cb43578b82760cd3acb1e9 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Thu, 29 Feb 2024 10:55:00 -0800 Subject: [PATCH 161/197] cleanup --- examples/multi_gpu/papers100m_gcn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index c72546c1dab9..6fc8baade3a4 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -41,13 +41,13 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, rank, world_size, ) - - split_idx['train'] = split_idx['train'].split( - split_idx['train'].size(0) // world_size, dim=0)[rank].clone() - split_idx['valid'] = split_idx['valid'].split( - split_idx['valid'].size(0) // world_size, dim=0)[rank].clone() - split_idx['test'] = split_idx['test'].split( - split_idx['test'].size(0) // world_size, dim=0)[rank].clone() + if world_size > 1: + split_idx['train'] = split_idx['train'].split( + split_idx['train'].size(0) // world_size, dim=0)[rank].clone() + split_idx['valid'] = split_idx['valid'].split( + split_idx['valid'].size(0) // world_size, dim=0)[rank].clone() + split_idx['test'] = split_idx['test'].split( + split_idx['test'].size(0) // world_size, dim=0)[rank].clone() model = model.to(rank) model = DistributedDataParallel(model, device_ids=[rank]) optimizer = torch.optim.Adam(model.parameters(), lr=0.01, From 7dd879de7c70e8f9e043b9f44fd1a70c703d2ab4 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 5 Mar 2024 14:13:12 -0800 Subject: [PATCH 162/197] Update README.md --- examples/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/README.md b/examples/README.md index 25fe7a7cb66d..43cf5e8d41ba 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,6 +13,8 @@ For examples on [Open Graph Benchmark](https://ogb.stanford.edu/) datasets, see - [`ogbn_proteins_deepgcn.py`](./ogbn_proteins_deepgcn.py) is an example to showcase how to train deep GNNs on the `ogbn-proteins` dataset. - [`ogbn_papers_100m.py`](./ogbn_papers_100m.py) is an example for training a GNN on the large-scale `ogbn-papers100m` dataset, containing approximately ~1.6B edges. +- [`ogbn_papers_100m_cugraph.py`](./ogbn_papers_100m_cugraph.py) shows how to accelerate the OGB-Papers100m workflow using CuGraph. + For examples on using `torch.compile`, see the examples under [`examples/compile`](./compile). For examples on scaling PyG up via multi-GPUs, see the examples under [`examples/multi_gpu`](./multi_gpu). 
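For reference, the per-rank sharding that `papers100m_gcn.py` applies to `split_idx` when `world_size > 1` (see the cleanup patch above) boils down to a plain `torch.Tensor.split` call. The sketch below is illustrative only; the `world_size`, `rank`, and tensor sizes are assumed values, not taken from the example itself.

    import torch

    world_size = 4                 # assumed number of GPUs
    rank = 1                       # assumed rank of this process
    train_idx = torch.arange(12)   # stand-in for split_idx['train']

    if world_size > 1:
        # Each rank keeps one contiguous chunk of the training node indices,
        # mirroring split_idx['train'].split(...)[rank].clone() in the example.
        train_idx = train_idx.split(train_idx.size(0) // world_size,
                                    dim=0)[rank].clone()

    print(train_idx)  # rank 1 sees tensor([3, 4, 5]) with the assumed sizes
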
From be579bb190f8300d2a86eee5ec200aabd8db52cd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Mar 2024 22:14:13 +0000 Subject: [PATCH 163/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/README.md b/examples/README.md index 43cf5e8d41ba..df9d8e3bfe08 100644 --- a/examples/README.md +++ b/examples/README.md @@ -10,7 +10,9 @@ For a simple link prediction example, see [`link_pred.py`](./link_pred.py). For examples on [Open Graph Benchmark](https://ogb.stanford.edu/) datasets, see the `ogbn_*.py` examples: - [`ogbn_products_sage.py`](./ogbn_products_sage.py) and [`ogbn_products_gat.py`](./ogbn_products_gat.py) show how to train [`GraphSAGE`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.GraphSAGE.html) and [`GAT`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.GAT.html) models on the `ogbn-products` dataset. + - [`ogbn_proteins_deepgcn.py`](./ogbn_proteins_deepgcn.py) is an example to showcase how to train deep GNNs on the `ogbn-proteins` dataset. + - [`ogbn_papers_100m.py`](./ogbn_papers_100m.py) is an example for training a GNN on the large-scale `ogbn-papers100m` dataset, containing approximately ~1.6B edges. - [`ogbn_papers_100m_cugraph.py`](./ogbn_papers_100m_cugraph.py) shows how to accelerate the OGB-Papers100m workflow using CuGraph. From 45200ab39a48b0e2bc52a0a371972f8483c60159 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 5 Mar 2024 14:18:04 -0800 Subject: [PATCH 164/197] Update README.md --- examples/multi_gpu/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/README.md b/examples/multi_gpu/README.md index ac55bdd12acd..163f5f4a81b8 100644 --- a/examples/multi_gpu/README.md +++ b/examples/multi_gpu/README.md @@ -7,8 +7,9 @@ | [`distributed_batching.py`](./distributed_batching.py) | single-node | Example for training GNNs on multiple graphs. | | [`distributed_sampling.py`](./distributed_sampling.py) | single-node | Example for training GNNs on a homogeneous graph with neighbor sampling. | | [`distributed_sampling_multinode.py`](./distributed_sampling_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph with neighbor sampling on multiple nodes. | -| [`distributed_sampling_multinode.sbatch`](./distributed_sampling_multinode.sbatch) | multi-node | Example for submitting a training job to a Slurm cluster using [`distributed_sampling_multi_node.py`](./distributed_sampling_multinode.py). | -| [`papers100m_gcn.py`](./papers100m_gcn.py) | single-node | Example for training GNNs on a homogeneous graph. | +| [`distributed_sampling_multinode.sbatch`](./distributed_sampling_multinode.sbatch) | multi-node | Example for submitting a training job to a Slurm cluster using [`distributed_sampling_multi_node.py`](./distributed_sampling_multinode.py). | +| [`papers100m_gcn.py`](./papers100m_gcn.py) | single-node | Example for training GNNs on the Papers100m homogeneous graph w/ ~1.6B edges. | +| [`papers100m_gcn_cugraph.py`](./papers100m_gcn_cugraph.py`) | single-node | Example for accelerating GNN training on Papers100m using CuGraph. | | [`papers100m_gcn_multinode.py`](./papers100m_gcn_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph on multiple nodes. 
| | [`mag240m_graphsage.py`](./mag240m_graphsage.py) | single-node | Example for training GNNs on a large heterogeneous graph. | | [`taobao.py`](./taobao.py) | single-node | Example for training link prediction GNNs on a heterogeneous graph. | From 4af9464451b0d004affc295bd1cef1836d9c6630 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Mar 2024 22:19:04 +0000 Subject: [PATCH 165/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/README.md b/examples/multi_gpu/README.md index 163f5f4a81b8..8fdf553a27be 100644 --- a/examples/multi_gpu/README.md +++ b/examples/multi_gpu/README.md @@ -7,9 +7,9 @@ | [`distributed_batching.py`](./distributed_batching.py) | single-node | Example for training GNNs on multiple graphs. | | [`distributed_sampling.py`](./distributed_sampling.py) | single-node | Example for training GNNs on a homogeneous graph with neighbor sampling. | | [`distributed_sampling_multinode.py`](./distributed_sampling_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph with neighbor sampling on multiple nodes. | -| [`distributed_sampling_multinode.sbatch`](./distributed_sampling_multinode.sbatch) | multi-node | Example for submitting a training job to a Slurm cluster using [`distributed_sampling_multi_node.py`](./distributed_sampling_multinode.py). | +| [`distributed_sampling_multinode.sbatch`](./distributed_sampling_multinode.sbatch) | multi-node | Example for submitting a training job to a Slurm cluster using [`distributed_sampling_multi_node.py`](./distributed_sampling_multinode.py). | | [`papers100m_gcn.py`](./papers100m_gcn.py) | single-node | Example for training GNNs on the Papers100m homogeneous graph w/ ~1.6B edges. | -| [`papers100m_gcn_cugraph.py`](./papers100m_gcn_cugraph.py`) | single-node | Example for accelerating GNN training on Papers100m using CuGraph. | +| [`papers100m_gcn_cugraph.py`](./papers100m_gcn_cugraph.py%60) | single-node | Example for accelerating GNN training on Papers100m using CuGraph. | | [`papers100m_gcn_multinode.py`](./papers100m_gcn_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph on multiple nodes. | | [`mag240m_graphsage.py`](./mag240m_graphsage.py) | single-node | Example for training GNNs on a large heterogeneous graph. | | [`taobao.py`](./taobao.py) | single-node | Example for training link prediction GNNs on a heterogeneous graph. | From 061e1ce41eefa380d9200694250aa19b0c86d4de Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 5 Mar 2024 15:08:29 -0800 Subject: [PATCH 166/197] Update README.md --- examples/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/README.md b/examples/README.md index df9d8e3bfe08..7dc235cbb2fc 100644 --- a/examples/README.md +++ b/examples/README.md @@ -10,11 +10,8 @@ For a simple link prediction example, see [`link_pred.py`](./link_pred.py). 
For examples on [Open Graph Benchmark](https://ogb.stanford.edu/) datasets, see the `ogbn_*.py` examples: - [`ogbn_products_sage.py`](./ogbn_products_sage.py) and [`ogbn_products_gat.py`](./ogbn_products_gat.py) show how to train [`GraphSAGE`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.GraphSAGE.html) and [`GAT`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.GAT.html) models on the `ogbn-products` dataset. - - [`ogbn_proteins_deepgcn.py`](./ogbn_proteins_deepgcn.py) is an example to showcase how to train deep GNNs on the `ogbn-proteins` dataset. - - [`ogbn_papers_100m.py`](./ogbn_papers_100m.py) is an example for training a GNN on the large-scale `ogbn-papers100m` dataset, containing approximately ~1.6B edges. - - [`ogbn_papers_100m_cugraph.py`](./ogbn_papers_100m_cugraph.py) shows how to accelerate the OGB-Papers100m workflow using CuGraph. For examples on using `torch.compile`, see the examples under [`examples/compile`](./compile). From eced7f3623fc36283c11c6af0659d0e746eda8a2 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 13:54:52 -0800 Subject: [PATCH 167/197] Update examples/README.md Co-authored-by: Matthias Fey --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 7dc235cbb2fc..35a792090b2c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,7 +12,7 @@ For examples on [Open Graph Benchmark](https://ogb.stanford.edu/) datasets, see - [`ogbn_products_sage.py`](./ogbn_products_sage.py) and [`ogbn_products_gat.py`](./ogbn_products_gat.py) show how to train [`GraphSAGE`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.GraphSAGE.html) and [`GAT`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.GAT.html) models on the `ogbn-products` dataset. - [`ogbn_proteins_deepgcn.py`](./ogbn_proteins_deepgcn.py) is an example to showcase how to train deep GNNs on the `ogbn-proteins` dataset. - [`ogbn_papers_100m.py`](./ogbn_papers_100m.py) is an example for training a GNN on the large-scale `ogbn-papers100m` dataset, containing approximately ~1.6B edges. -- [`ogbn_papers_100m_cugraph.py`](./ogbn_papers_100m_cugraph.py) shows how to accelerate the OGB-Papers100m workflow using CuGraph. +- [`ogbn_papers_100m_cugraph.py`](./ogbn_papers_100m_cugraph.py) shows how to accelerate the `ogbn-papers100m` workflow using [CuGraph](...). For examples on using `torch.compile`, see the examples under [`examples/compile`](./compile). From 04b7db57342ba4a64912fde8711edcd7d65966ca Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 13:55:35 -0800 Subject: [PATCH 168/197] Update README.md --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 35a792090b2c..336b3d816a82 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,7 +12,7 @@ For examples on [Open Graph Benchmark](https://ogb.stanford.edu/) datasets, see - [`ogbn_products_sage.py`](./ogbn_products_sage.py) and [`ogbn_products_gat.py`](./ogbn_products_gat.py) show how to train [`GraphSAGE`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.GraphSAGE.html) and [`GAT`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.models.GAT.html) models on the `ogbn-products` dataset. 
- [`ogbn_proteins_deepgcn.py`](./ogbn_proteins_deepgcn.py) is an example to showcase how to train deep GNNs on the `ogbn-proteins` dataset. - [`ogbn_papers_100m.py`](./ogbn_papers_100m.py) is an example for training a GNN on the large-scale `ogbn-papers100m` dataset, containing approximately ~1.6B edges. -- [`ogbn_papers_100m_cugraph.py`](./ogbn_papers_100m_cugraph.py) shows how to accelerate the `ogbn-papers100m` workflow using [CuGraph](...). +- [`ogbn_papers_100m_cugraph.py`](./ogbn_papers_100m_cugraph.py) shows how to accelerate the `ogbn-papers100m` workflow using [CuGraph](https://github.com/rapidsai/cugraph). For examples on using `torch.compile`, see the examples under [`examples/compile`](./compile). From 5c4088a2a91a8e9b0eed6ecede48b049f4f03c6d Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 13:56:02 -0800 Subject: [PATCH 169/197] Update examples/multi_gpu/README.md Co-authored-by: Matthias Fey --- examples/multi_gpu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/README.md b/examples/multi_gpu/README.md index 8fdf553a27be..f9ce9bbf97f8 100644 --- a/examples/multi_gpu/README.md +++ b/examples/multi_gpu/README.md @@ -8,7 +8,7 @@ | [`distributed_sampling.py`](./distributed_sampling.py) | single-node | Example for training GNNs on a homogeneous graph with neighbor sampling. | | [`distributed_sampling_multinode.py`](./distributed_sampling_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph with neighbor sampling on multiple nodes. | | [`distributed_sampling_multinode.sbatch`](./distributed_sampling_multinode.sbatch) | multi-node | Example for submitting a training job to a Slurm cluster using [`distributed_sampling_multi_node.py`](./distributed_sampling_multinode.py). | -| [`papers100m_gcn.py`](./papers100m_gcn.py) | single-node | Example for training GNNs on the Papers100m homogeneous graph w/ ~1.6B edges. | +| [`papers100m_gcn.py`](./papers100m_gcn.py) | single-node | Example for training GNNs on the `ogbn-papers100M` homogeneous graph w/ ~1.6B edges. | | [`papers100m_gcn_cugraph.py`](./papers100m_gcn_cugraph.py%60) | single-node | Example for accelerating GNN training on Papers100m using CuGraph. | | [`papers100m_gcn_multinode.py`](./papers100m_gcn_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph on multiple nodes. | | [`mag240m_graphsage.py`](./mag240m_graphsage.py) | single-node | Example for training GNNs on a large heterogeneous graph. | From a110a25996569ccec74ec54c4b41f946dd36e03a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 13:56:40 -0800 Subject: [PATCH 170/197] Update examples/multi_gpu/README.md Co-authored-by: Matthias Fey --- examples/multi_gpu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/README.md b/examples/multi_gpu/README.md index f9ce9bbf97f8..d4974675f705 100644 --- a/examples/multi_gpu/README.md +++ b/examples/multi_gpu/README.md @@ -9,7 +9,7 @@ | [`distributed_sampling_multinode.py`](./distributed_sampling_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph with neighbor sampling on multiple nodes. | | [`distributed_sampling_multinode.sbatch`](./distributed_sampling_multinode.sbatch) | multi-node | Example for submitting a training job to a Slurm cluster using [`distributed_sampling_multi_node.py`](./distributed_sampling_multinode.py). 
| | [`papers100m_gcn.py`](./papers100m_gcn.py) | single-node | Example for training GNNs on the `ogbn-papers100M` homogeneous graph w/ ~1.6B edges. | -| [`papers100m_gcn_cugraph.py`](./papers100m_gcn_cugraph.py%60) | single-node | Example for accelerating GNN training on Papers100m using CuGraph. | +| [`papers100m_gcn_cugraph.py`](./papers100m_gcn_cugraph.py%60) | single-node | Example for accelerating GNN training on `ogbn-papers100M` using [CuGraph](...). | | [`papers100m_gcn_multinode.py`](./papers100m_gcn_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph on multiple nodes. | | [`mag240m_graphsage.py`](./mag240m_graphsage.py) | single-node | Example for training GNNs on a large heterogeneous graph. | | [`taobao.py`](./taobao.py) | single-node | Example for training link prediction GNNs on a heterogeneous graph. | From 887c769eb76d6e979ff74445c782cf699b533077 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 21:58:05 +0000 Subject: [PATCH 171/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/README.md b/examples/multi_gpu/README.md index d4974675f705..91defd65875d 100644 --- a/examples/multi_gpu/README.md +++ b/examples/multi_gpu/README.md @@ -8,8 +8,8 @@ | [`distributed_sampling.py`](./distributed_sampling.py) | single-node | Example for training GNNs on a homogeneous graph with neighbor sampling. | | [`distributed_sampling_multinode.py`](./distributed_sampling_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph with neighbor sampling on multiple nodes. | | [`distributed_sampling_multinode.sbatch`](./distributed_sampling_multinode.sbatch) | multi-node | Example for submitting a training job to a Slurm cluster using [`distributed_sampling_multi_node.py`](./distributed_sampling_multinode.py). | -| [`papers100m_gcn.py`](./papers100m_gcn.py) | single-node | Example for training GNNs on the `ogbn-papers100M` homogeneous graph w/ ~1.6B edges. | -| [`papers100m_gcn_cugraph.py`](./papers100m_gcn_cugraph.py%60) | single-node | Example for accelerating GNN training on `ogbn-papers100M` using [CuGraph](...). | +| [`papers100m_gcn.py`](./papers100m_gcn.py) | single-node | Example for training GNNs on the `ogbn-papers100M` homogeneous graph w/ ~1.6B edges. | +| [`papers100m_gcn_cugraph.py`](./papers100m_gcn_cugraph.py%60) | single-node | Example for accelerating GNN training on `ogbn-papers100M` using [CuGraph](...). | | [`papers100m_gcn_multinode.py`](./papers100m_gcn_multinode.py) | multi-node | Example for training GNNs on a homogeneous graph on multiple nodes. | | [`mag240m_graphsage.py`](./mag240m_graphsage.py) | single-node | Example for training GNNs on a large heterogeneous graph. | | [`taobao.py`](./taobao.py) | single-node | Example for training link prediction GNNs on a heterogeneous graph. 
| From 02ec82838dccabe17d250d8e6e54d9dab3256871 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 14:56:18 -0800 Subject: [PATCH 172/197] cleaning up num workers function --- examples/multi_gpu/papers100m_gcn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 6fc8baade3a4..3d6888f2fc3f 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -15,7 +15,7 @@ from torch_geometric.loader import NeighborLoader -def pyg_num_work(world_size): +def get_num_workers(world_size): num_work = None if hasattr(os, "sched_getaffinity"): try: @@ -57,7 +57,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, num_neighbors=[fan_out] * num_layers, batch_size=batch_size, ) - num_work = pyg_num_work(world_size) + num_work = get_num_workers(world_size) train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, drop_last=True, **kwargs) From b9ec4de6cc7220337cf296d5658ca5e2becaa5f9 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 14:59:26 -0800 Subject: [PATCH 173/197] cugraph neighborloader only returns HeteroData objects, papers100m is homo so using to_homo on minibatch every iteration --- examples/multi_gpu/papers100m_gcn_cugraph.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn_cugraph.py b/examples/multi_gpu/papers100m_gcn_cugraph.py index 12ee257328a5..4888b4db8448 100644 --- a/examples/multi_gpu/papers100m_gcn_cugraph.py +++ b/examples/multi_gpu/papers100m_gcn_cugraph.py @@ -185,8 +185,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, start = time.time() batch = batch.to(rank) - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() + batch = batch.to_homogeneous() batch_size = batch.num_sampled_nodes[0] batch.y = batch.y.to(torch.long) @@ -218,8 +217,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, break batch = batch.to(rank) - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() + batch = batch.to_homogeneous() batch_size = batch.num_sampled_nodes[0] batch.y = batch.y.to(torch.long) @@ -242,8 +240,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, with torch.no_grad(): for i, batch in enumerate(test_loader): batch = batch.to(rank) - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() + batch = batch.to_homogeneous() batch_size = batch.num_sampled_nodes[0] batch.y = batch.y.to(torch.long) From a98bede33d482e1671536bd1d4c73ec6bafc56cb Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:00:00 -0800 Subject: [PATCH 174/197] syntax cleanup Co-authored-by: Matthias Fey --- examples/ogbn_papers_100m.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 9defae99330d..c74ecd4b2c5e 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -67,7 +67,7 @@ def get_num_workers() -> int: model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes).to(device) + dataset.num_classes, ).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) From 92c9147f53d5479979864412941f1e5fecbf90ca Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 23:01:27 +0000 Subject: [PATCH 175/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index c74ecd4b2c5e..1813350ad5f9 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -64,10 +64,12 @@ def get_num_workers() -> int: dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, heads=args.n_gat_conv_heads).to(device) else: - model = torch_geometric.nn.models.GCN(dataset.num_features, - args.hidden_channels, - args.num_layers, - dataset.num_classes, ).to(device) + model = torch_geometric.nn.models.GCN( + dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, + ).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) From cbd60c255febc1a88aa28fb89c46d8ea98162ca8 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:01:53 -0800 Subject: [PATCH 176/197] cugraph always returns heterodata, need to_homo --- examples/ogbn_papers_100m_cugraph.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index b08a7758fab7..92b7fb6cc54c 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -86,7 +86,7 @@ def get_num_workers() -> int: model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes).to(device) + dataset.num_classes, ).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) @@ -97,8 +97,7 @@ def get_num_workers() -> int: def train(): model.train() for i, batch in enumerate(train_loader): - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() + batch = batch.to_homogeneous() if i == warmup_steps: torch.cuda.synchronize() @@ -127,8 +126,7 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): for i, batch in enumerate(loader): if val_steps is not None and i >= val_steps: break - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() + batch = batch.to_homogeneous() batch = batch.to(device) batch_size = batch.num_sampled_nodes[0] out = model(batch.x, batch.edge_index)[:batch_size] From ba2d9c3c4d85fc31a0c379341d285e72427e4f90 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:02:39 -0800 Subject: [PATCH 177/197] to homo not needed for vanilla pyg neighborloader --- examples/ogbn_papers_100m.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/ogbn_papers_100m.py b/examples/ogbn_papers_100m.py index 1813350ad5f9..56e55119ad49 100644 --- a/examples/ogbn_papers_100m.py +++ b/examples/ogbn_papers_100m.py @@ -80,9 +80,6 @@ def get_num_workers() -> int: def train(): model.train() for i, batch in enumerate(train_loader): - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() - if i == warmup_steps: torch.cuda.synchronize() start_avg_time = time.perf_counter() @@ -110,8 +107,6 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): for i, batch in enumerate(loader): if val_steps is not None and i >= val_steps: break - if isinstance(batch, 
torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() batch = batch.to(device) batch_size = batch.num_sampled_nodes[0] out = model(batch.x, batch.edge_index)[:batch_size] From 4d2533de33fdeecd7530f83ff17821d01311a0dd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 23:03:52 +0000 Subject: [PATCH 178/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m_cugraph.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index 92b7fb6cc54c..58d808bcd3a8 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -83,10 +83,12 @@ def get_num_workers() -> int: dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes, heads=args.n_gat_conv_heads).to(device) else: - model = torch_geometric.nn.models.GCN(dataset.num_features, - args.hidden_channels, - args.num_layers, - dataset.num_classes, ).to(device) + model = torch_geometric.nn.models.GCN( + dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, + ).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) From 235224f71f5dd90fbb429114331bc9f77162b84b Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:04:48 -0800 Subject: [PATCH 179/197] no to_homo needed for vanilla --- examples/multi_gpu/papers100m_gcn.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 3d6888f2fc3f..c2ddb538c181 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -104,8 +104,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, break batch = batch.to(rank) - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() batch_size = batch.num_sampled_nodes[0] batch.y = batch.y.to(torch.long) @@ -120,8 +118,6 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, with torch.no_grad(): for i, batch in enumerate(test_loader): batch = batch.to(rank) - if isinstance(batch, torch_geometric.data.HeteroData): - batch = batch.to_homogeneous() batch_size = batch.num_sampled_nodes[0] batch.y = batch.y.to(torch.long) From e51811dd7e9940f0315450b71fe852cfb7569c89 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:06:21 -0800 Subject: [PATCH 180/197] Update examples/multi_gpu/papers100m_gcn.py Co-authored-by: Matthias Fey --- examples/multi_gpu/papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index c2ddb538c181..b0da8e9f0c94 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -61,7 +61,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, train_loader = NeighborLoader(data, input_nodes=split_idx['train'], num_workers=num_work, shuffle=True, drop_last=True, **kwargs) - eval_loader = NeighborLoader(data, input_nodes=split_idx['valid'], + val_loader = NeighborLoader(data, input_nodes=split_idx['valid'], num_workers=num_work, **kwargs) test_loader = NeighborLoader(data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) From e4805b1cf38e3ba308fd609bc1519cecc4a5fc2d Mon Sep 17 
00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:06:51 -0800 Subject: [PATCH 181/197] eval->val loader --- examples/multi_gpu/papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index b0da8e9f0c94..524b83f4de46 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -99,7 +99,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, print("Average Training Iteration Time:", (time.time() - start) / (nb - warmup_steps), "s/iter") with torch.no_grad(): - for i, batch in enumerate(eval_loader): + for i, batch in enumerate(val_loader): if i >= eval_steps: break From 4596bcb5eaae3676f48662b81e0c2fde34ac6c1d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 23:07:28 +0000 Subject: [PATCH 182/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 524b83f4de46..fda3231d6f7c 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -62,7 +62,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, num_workers=num_work, shuffle=True, drop_last=True, **kwargs) val_loader = NeighborLoader(data, input_nodes=split_idx['valid'], - num_workers=num_work, **kwargs) + num_workers=num_work, **kwargs) test_loader = NeighborLoader(data, input_nodes=split_idx['test'], num_workers=num_work, **kwargs) From 42855d8794a33aa61dec74ac5da2578e1d097764 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:08:37 -0800 Subject: [PATCH 183/197] reset acc after each eval --- examples/multi_gpu/papers100m_gcn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index fda3231d6f7c..35546b557e9f 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -114,6 +114,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if rank == 0: print(f"Validation Accuracy: {acc_sum * 100.0:.4f}%", ) dist.barrier() + acc.reset() with torch.no_grad(): for i, batch in enumerate(test_loader): From 10c56cbc5e68c9850529d1d7d5e208b3477eac7a Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:10:44 -0800 Subject: [PATCH 184/197] acc resets and syntax cleanup --- examples/multi_gpu/papers100m_gcn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 35546b557e9f..7965a8f426b8 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -129,6 +129,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, if rank == 0: print(f"Test Accuracy: {acc_sum * 100.0:.4f}%", ) dist.barrier() + acc.reset() if rank == 0: total_time = round(time.perf_counter() - wall_clock_start, 2) print("Total Program Runtime (total_time) =", total_time, "seconds") @@ -177,7 +178,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, model = torch_geometric.nn.models.GCN(dataset.num_features, args.hidden_channels, args.num_layers, - dataset.num_classes) + dataset.num_classes, ) print("Data =", 
data) if args.n_devices == -1: From ce3dcfc2d8694199c655229d7a1b16747efa50a8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 23:11:46 +0000 Subject: [PATCH 185/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 7965a8f426b8..34f607e0b6b9 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -175,10 +175,12 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, dataset.num_classes, heads=args.n_gat_conv_heads) else: - model = torch_geometric.nn.models.GCN(dataset.num_features, - args.hidden_channels, - args.num_layers, - dataset.num_classes, ) + model = torch_geometric.nn.models.GCN( + dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, + ) print("Data =", data) if args.n_devices == -1: From d9ae4138a5f1b4c8c7bb6ac0ff9c8983ed8bb4ca Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:15:00 -0800 Subject: [PATCH 186/197] cleanup of init_pytorch_worker(no longer a function, just commenting it) --- examples/multi_gpu/papers100m_gcn.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 34f607e0b6b9..40e0528102b3 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -28,19 +28,18 @@ def get_num_workers(world_size): def init_pytorch_worker(rank, world_size): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' - dist.init_process_group('nccl', rank=rank, world_size=world_size) + def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, split_idx, num_classes, wall_clock_start, tempdir=None, num_layers=3): - init_pytorch_worker( - rank, - world_size, - ) + # init pytorch worker + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + dist.init_process_group('nccl', rank=rank, world_size=world_size) + if world_size > 1: split_idx['train'] = split_idx['train'].split( split_idx['train'].size(0) // world_size, dim=0)[rank].clone() From 37dabfa1fa15f27ff80d78e412563bd8cfe73d4f Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:15:35 -0800 Subject: [PATCH 187/197] typo cleanup --- examples/multi_gpu/papers100m_gcn.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 40e0528102b3..3074c28aff59 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -24,11 +24,7 @@ def get_num_workers(world_size): pass if num_work is None: num_work = os.cpu_count() / (2 * world_size) - return int(num_work) - - -def init_pytorch_worker(rank, world_size): - + return int(num_work) def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, From 01bd18c5a9ea8022559cc853ff1c63c9fba7edfc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 23:17:13 +0000 Subject: [PATCH 188/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index 3074c28aff59..ca55918f3b5d 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -24,7 +24,7 @@ def get_num_workers(world_size): pass if num_work is None: num_work = os.cpu_count() / (2 * world_size) - return int(num_work) + return int(num_work) def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, From 4d11c46f80f746926f0fa522c46ca1c50c799826 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:25:34 -0800 Subject: [PATCH 189/197] model.module not needed for vanilla PyG --- examples/multi_gpu/papers100m_gcn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_gpu/papers100m_gcn.py b/examples/multi_gpu/papers100m_gcn.py index ca55918f3b5d..e7f6fe0dcf4b 100644 --- a/examples/multi_gpu/papers100m_gcn.py +++ b/examples/multi_gpu/papers100m_gcn.py @@ -102,7 +102,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch_size = batch.num_sampled_nodes[0] batch.y = batch.y.to(torch.long) - out = model.module(batch.x, batch.edge_index) + out = model(batch.x, batch.edge_index) acc_i = acc( # noqa out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() @@ -117,7 +117,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, batch_size = batch.num_sampled_nodes[0] batch.y = batch.y.to(torch.long) - out = model.module(batch.x, batch.edge_index) + out = model(batch.x, batch.edge_index) acc_i = acc( # noqa out[:batch_size].softmax(dim=-1), batch.y[:batch_size]) acc_sum = acc.compute() From 84f1f55551b7117595d6d71975222b1194ba70f7 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 6 Mar 2024 15:31:15 -0800 Subject: [PATCH 190/197] comments for cugraph feature store --- examples/multi_gpu/papers100m_gcn_cugraph.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/multi_gpu/papers100m_gcn_cugraph.py b/examples/multi_gpu/papers100m_gcn_cugraph.py index 4888b4db8448..d51a3aafffa1 100644 --- a/examples/multi_gpu/papers100m_gcn_cugraph.py +++ b/examples/multi_gpu/papers100m_gcn_cugraph.py @@ -108,10 +108,15 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, import cugraph from cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import BulkSampleLoader + # define the edges of the Graph G = {("N", "E", "N"): data.edge_index} + # define the number of nodes in Graph N = {"N": data.num_nodes} + # initialize feature store fs = cugraph.gnn.FeatureStore(backend="torch") + # store node features as x fs.add_data(data.x, "N", "x") + # store node labels as y fs.add_data(data.y, "N", "y") dist.barrier() From 265f05879f4c1f0cab58e1903b96d2e0fecb69fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 23:32:47 +0000 Subject: [PATCH 191/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_gpu/papers100m_gcn_cugraph.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multi_gpu/papers100m_gcn_cugraph.py b/examples/multi_gpu/papers100m_gcn_cugraph.py index d51a3aafffa1..0473da463683 100644 --- a/examples/multi_gpu/papers100m_gcn_cugraph.py +++ b/examples/multi_gpu/papers100m_gcn_cugraph.py @@ -108,6 +108,7 @@ def run_train(rank, data, world_size, model, epochs, batch_size, fan_out, import cugraph from 
cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import BulkSampleLoader + # define the edges of the Graph G = {("N", "E", "N"): data.edge_index} # define the number of nodes in Graph From 542a71811faa3b9da1e97a2c305a88c6e2492166 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 8 Mar 2024 12:23:19 -0800 Subject: [PATCH 192/197] fix from alexandria for single gpu --- examples/ogbn_papers_100m_cugraph.py | 43 ++++++++++++++++------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index 58d808bcd3a8..d596205e82c3 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -3,17 +3,27 @@ import time from typing import Optional -import cugraph +import torch import cupy import rmm -import torch -import torch.nn.functional as F + +from rmm.allocators.cupy import rmm_cupy_allocator +from rmm.allocators.torch import rmm_torch_allocator + +# Must change allocators immediately upon import +# or else other imports will cause memory to be +# allocated and prevent changing the allocator +rmm.reinitialize(devices=[0], pool_allocator=True, + managed_memory=True) +cupy.cuda.set_allocator(rmm_cupy_allocator) +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + +import cugraph from cugraph.testing.mg_utils import enable_spilling from cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import CuGraphNeighborLoader -from ogb.nodeproppred import PygNodePropPredDataset -from rmm.allocators.cupy import rmm_cupy_allocator -from rmm.allocators.torch import rmm_torch_allocator + +import torch.nn.functional as F import torch_geometric from torch_geometric.loader import NeighborLoader @@ -38,14 +48,6 @@ ) args = parser.parse_args() wall_clock_start = time.perf_counter() -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -dataset = PygNodePropPredDataset(name='ogbn-papers100M', - root='/datasets/ogb_datasets') -split_idx = dataset.get_idx_split() - -enable_spilling() - def get_num_workers() -> int: try: @@ -59,11 +61,16 @@ def get_num_workers() -> int: batch_size=args.batch_size, ) # Set Up Neighbor Loading +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +enable_spilling() + +from ogb.nodeproppred import PygNodePropPredDataset +dataset = PygNodePropPredDataset(name='ogbn-papers100M', + root='/datasets/ogb_datasets') +split_idx = dataset.get_idx_split() data = dataset[0] -rmm.reinitialize(devices=[0], pool_allocator=True, initial_pool_size=78e9, - managed_memory=True) -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) -cupy.cuda.set_allocator(rmm_cupy_allocator) + G = {("N", "E", "N"): data.edge_index} N = {"N": data.num_nodes} fs = cugraph.gnn.FeatureStore(backend="torch") From 0c643b7dbc166e7db727a675bd74d9e13ef23590 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Mar 2024 20:24:19 +0000 Subject: [PATCH 193/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m_cugraph.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index d596205e82c3..9991955e3930 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -3,28 +3,25 @@ import time from typing import Optional -import torch import cupy import rmm 
- +import torch from rmm.allocators.cupy import rmm_cupy_allocator from rmm.allocators.torch import rmm_torch_allocator # Must change allocators immediately upon import # or else other imports will cause memory to be # allocated and prevent changing the allocator -rmm.reinitialize(devices=[0], pool_allocator=True, - managed_memory=True) +rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True) cupy.cuda.set_allocator(rmm_cupy_allocator) torch.cuda.memory.change_current_allocator(rmm_torch_allocator) import cugraph +import torch.nn.functional as F from cugraph.testing.mg_utils import enable_spilling from cugraph_pyg.data import CuGraphStore from cugraph_pyg.loader import CuGraphNeighborLoader -import torch.nn.functional as F - import torch_geometric from torch_geometric.loader import NeighborLoader @@ -49,6 +46,7 @@ args = parser.parse_args() wall_clock_start = time.perf_counter() + def get_num_workers() -> int: try: return len(os.sched_getaffinity(0)) // 2 @@ -66,6 +64,7 @@ def get_num_workers() -> int: enable_spilling() from ogb.nodeproppred import PygNodePropPredDataset + dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/ogb_datasets') split_idx = dataset.get_idx_split() From 388f61916892de7c0b21426e20fd4f574e882f95 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 8 Mar 2024 13:39:59 -0800 Subject: [PATCH 194/197] precommit CI --- examples/ogbn_papers_100m_cugraph.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index 9991955e3930..58ecbdcbfdf8 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -16,14 +16,14 @@ cupy.cuda.set_allocator(rmm_cupy_allocator) torch.cuda.memory.change_current_allocator(rmm_torch_allocator) -import cugraph -import torch.nn.functional as F -from cugraph.testing.mg_utils import enable_spilling -from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.loader import CuGraphNeighborLoader - -import torch_geometric -from torch_geometric.loader import NeighborLoader +import cugraph # noqa +import torch.nn.functional as F # noqa +from cugraph.testing.mg_utils import enable_spilling # noqa +from cugraph_pyg.data import CuGraphStore # noqa +from cugraph_pyg.loader import CuGraphNeighborLoader # noqa + +import torch_geometric # noqa +from torch_geometric.loader import NeighborLoader # noqa parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=256) From c740a968536a1a7581565ac74fff38678de47933 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Mar 2024 21:41:10 +0000 Subject: [PATCH 195/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m_cugraph.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index 58ecbdcbfdf8..d01b410015f8 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -16,14 +16,14 @@ cupy.cuda.set_allocator(rmm_cupy_allocator) torch.cuda.memory.change_current_allocator(rmm_torch_allocator) -import cugraph # noqa -import torch.nn.functional as F # noqa -from cugraph.testing.mg_utils import enable_spilling # noqa -from cugraph_pyg.data import CuGraphStore # noqa -from cugraph_pyg.loader import CuGraphNeighborLoader # noqa 
- -import torch_geometric # noqa -from torch_geometric.loader import NeighborLoader # noqa +import cugraph # noqa +import torch.nn.functional as F # noqa +from cugraph.testing.mg_utils import enable_spilling # noqa +from cugraph_pyg.data import CuGraphStore # noqa +from cugraph_pyg.loader import CuGraphNeighborLoader # noqa + +import torch_geometric # noqa +from torch_geometric.loader import NeighborLoader # noqa parser = argparse.ArgumentParser() parser.add_argument('--hidden_channels', type=int, default=256) From 8613822c1ef92804515dda4ffb9ded1315f2449c Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Fri, 8 Mar 2024 17:10:35 -0800 Subject: [PATCH 196/197] Update ogbn_papers_100m_cugraph.py --- examples/ogbn_papers_100m_cugraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index d01b410015f8..d272354ae2bd 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -63,7 +63,7 @@ def get_num_workers() -> int: enable_spilling() -from ogb.nodeproppred import PygNodePropPredDataset +from ogb.nodeproppred import PygNodePropPredDataset # noqa dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/ogb_datasets') From c1df757f54e9ae10ae8c6b91db79cb0c826c0910 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 9 Mar 2024 01:11:35 +0000 Subject: [PATCH 197/197] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ogbn_papers_100m_cugraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ogbn_papers_100m_cugraph.py b/examples/ogbn_papers_100m_cugraph.py index d272354ae2bd..c9df4e07e597 100644 --- a/examples/ogbn_papers_100m_cugraph.py +++ b/examples/ogbn_papers_100m_cugraph.py @@ -63,7 +63,7 @@ def get_num_workers() -> int: enable_spilling() -from ogb.nodeproppred import PygNodePropPredDataset # noqa +from ogb.nodeproppred import PygNodePropPredDataset # noqa dataset = PygNodePropPredDataset(name='ogbn-papers100M', root='/datasets/ogb_datasets')
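
Note on patches 185-189: taken together they settle on two conventions for examples/multi_gpu/papers100m_gcn.py, namely initializing the process group inline at the top of run_train() rather than through a separate init_pytorch_worker() helper, and calling the DDP-wrapped model directly during evaluation instead of reaching into model.module. The sketch below is a minimal, self-contained illustration of that pattern and is not part of the patch series; ToyGCN, the random tensors, and the toy mini-batch are placeholders, and it assumes one CUDA GPU per spawned process.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel

from torch_geometric.nn import GCNConv


class ToyGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)


def run(rank, world_size):
    # Process-group setup is done inline, as in patch 186:
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group('nccl', rank=rank, world_size=world_size)

    device = torch.device(f'cuda:{rank}')
    model = DistributedDataParallel(
        ToyGCN(8, 16, 4).to(device), device_ids=[rank])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Toy graph standing in for a NeighborLoader mini-batch:
    x = torch.randn(10, 8, device=device)
    edge_index = torch.tensor([[0, 1, 2], [1, 2, 3]], device=device)
    y = torch.randint(0, 4, (10, ), device=device)

    model.train()
    optimizer.zero_grad()
    out = model(x, edge_index)  # forward through the DDP wrapper
    F.cross_entropy(out, y).backward()
    optimizer.step()

    # Evaluation also goes through the wrapper; for vanilla PyG there is
    # no need to call model.module explicitly (patch 189):
    model.eval()
    with torch.no_grad():
        pred = model(x, edge_index).argmax(dim=-1)
    if rank == 0:
        print('predictions:', pred.tolist())

    dist.destroy_process_group()


if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(run, args=(world_size, ), nprocs=world_size, join=True)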
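
Note on patches 190-197: the cuGraph examples converge on two points, namely that the RMM pool must be created and the CuPy/PyTorch allocators swapped immediately after importing rmm, cupy, and torch, before cugraph or cugraph_pyg are imported (hence the # noqa markers the pre-commit runs keep re-aligning), and that the graph is handed to cuGraph as a feature store plus G/N dictionaries. The condensed sketch below illustrates that ordering and the feature-store setup; the toy edge_index and feature tensors stand in for ogbn-papers100M and are not part of the patch series, and it assumes an environment with rmm, cupy, and cugraph installed and a CUDA device visible.

import cupy
import rmm
import torch
from rmm.allocators.cupy import rmm_cupy_allocator
from rmm.allocators.torch import rmm_torch_allocator

# Allocators must be changed before anything else allocates GPU memory,
# which is why cugraph/cugraph_pyg are only imported afterwards (patch 192):
rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True)
cupy.cuda.set_allocator(rmm_cupy_allocator)
torch.cuda.memory.change_current_allocator(rmm_torch_allocator)

import cugraph  # noqa
from cugraph.testing.mg_utils import enable_spilling  # noqa

enable_spilling()

# Toy stand-ins for the papers100M tensors used by the examples:
num_nodes = 100
edge_index = torch.randint(0, num_nodes, (2, 500))
x = torch.randn(num_nodes, 16)
y = torch.randint(0, 4, (num_nodes, ))

# Define the edges of the graph and the number of nodes per node type:
G = {("N", "E", "N"): edge_index}
N = {"N": num_nodes}

# Initialize the feature store and register node features ("x") and labels ("y"):
fs = cugraph.gnn.FeatureStore(backend="torch")
fs.add_data(x, "N", "x")
fs.add_data(y, "N", "y")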