diff --git a/benchmark/pics/cpu_scatter.png b/benchmark/pics/cpu_scatter.png new file mode 100644 index 000000000..d0729e7a6 Binary files /dev/null and b/benchmark/pics/cpu_scatter.png differ diff --git a/benchmark/pics/cpu_scatter.svg b/benchmark/pics/cpu_scatter.svg new file mode 100644 index 000000000..5f977c38d --- /dev/null +++ b/benchmark/pics/cpu_scatter.svg @@ -0,0 +1,333 @@ + + + + + + + + + + + + Matrix Size + + + + + + + 25 + + + + + 210 + + + + + 215 + + + + + 220 + + + + + + + + geometricflux + + + + + pytorch-scatter + + + + + + + + + + + + + + + framework + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 101 + + + + + 102 + + + + + 103 + + + + + 104 + + + + + 105 + + + + + 106 + + + + + 107 + + + + + + + Time (μs) + + + + + + + Scatter add performance on CPU + + + + + + + + + + diff --git a/benchmark/pics/gpu_scatter.png b/benchmark/pics/gpu_scatter.png new file mode 100644 index 000000000..9129d1be0 Binary files /dev/null and b/benchmark/pics/gpu_scatter.png differ diff --git a/benchmark/pics/gpu_scatter.svg b/benchmark/pics/gpu_scatter.svg new file mode 100644 index 000000000..e8a317ab3 --- /dev/null +++ b/benchmark/pics/gpu_scatter.svg @@ -0,0 +1,325 @@ + + + + + + + + + + + + Matrix Size + + + + + + + 25 + + + + + 210 + + + + + 215 + + + + + 220 + + + + + + + + geometricflux + + + + + pytorch-scatter + + + + + + + + + + + + + + + framework + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 100 + + + + + 101 + + + + + 102 + + + + + 103 + + + + + 104 + + + + + 105 + + + + + + + Time (μs) + + + + + + + Scatter add performance on GPU + + + + + + + + + + diff --git a/benchmark/scatter.jl b/benchmark/scatter.jl new file mode 100644 index 000000000..4b5ad56ce --- /dev/null +++ b/benchmark/scatter.jl @@ -0,0 +1,67 @@ +using CUDAdrv +using CUDAnative +using CuArrays +using GeometricFlux +using DataFrames +using CSV +using BenchmarkTools +using BenchmarkTools: Trial, TrialEstimate, median, mean +# using ProfileView + +d = 50 +nbins = 20 +getinfo(te::TrialEstimate) = te.time, te.gctime, te.memory +getstats(t::Trial) = [getinfo(minimum(t)), getinfo(median(t)), getinfo(mean(t)), + getinfo(maximum(t))] + +metadata = DataFrame(device=String[], dim=Int[], sample=Int[], bins=Int[]) +mintime = DataFrame(min_time=Float64[], min_gc=Float64[], min_mem=Int[]) +medtime = DataFrame(med_time=Float64[], med_gc=Float64[], med_mem=Int[]) +meantime = DataFrame(mean_time=Float64[], mean_gc=Float64[], mean_mem=Int[]) +maxtime = DataFrame(max_time=Float64[], max_gc=Float64[], max_mem=Int[]) + +for l = [2^5, 2^10, 2^15, 2^20] + hist = zeros(Float32, d, nbins) + δ = rand(Float32, d, l) + idx = rand(1:nbins, l) + + hist_gpu = CuArray(hist) + δ_gpu = CuArray(δ) + idx_gpu = CuArray(idx) + + scatter_add!(hist, δ, idx) + scatter_add!(hist_gpu, δ_gpu, idx_gpu) + + b_cpu = @benchmark scatter_add!($hist, $δ, $idx) + b_gpu = @benchmark scatter_add!($hist_gpu, $δ_gpu, $idx_gpu) + s_cpu = getstats(b_cpu) + s_gpu = getstats(b_gpu) + + push!(metadata, ("cpu", d, l, nbins)) + push!(mintime, s_cpu[1]) + push!(medtime, s_cpu[2]) + push!(meantime, s_cpu[3]) + push!(maxtime, s_cpu[4]) + + push!(metadata, ("gpu", d, l, nbins)) + push!(mintime, s_gpu[1]) + push!(medtime, s_gpu[2]) + push!(meantime, s_gpu[3]) + push!(maxtime, s_gpu[4]) +end + +data = hcat(metadata, mintime, medtime, meantime, maxtime) +CSV.write("benchmark_scatter_julia.tsv", data; delim="\t") + + +## Benchmark +# @benchmark scatter_add!($hist, $δ, $idx) +# CuArrays.@time scatter_add!(hist_gpu, δ_gpu, idx_gpu) + +## Profiling +# sudo nvprof --profile-from-start off julia-1.3 benchmarks/scatter.jl +# sudo nvprof --profile-from-start off --print-gpu-trace julia-1.3 --proj benchmarks/scatter.jl +# sudo chown yuehhua -R /home/yuehhua/.julia/ + +# @profview scatter_add!(hist, δ, idx) +# CUDAdrv.@profile scatter_add!(hist_gpu, δ_gpu, idx_gpu) diff --git a/benchmark/scatter_py.jl b/benchmark/scatter_py.jl new file mode 100644 index 000000000..9bc7b638d --- /dev/null +++ b/benchmark/scatter_py.jl @@ -0,0 +1,61 @@ +using PyCall +using DataFrames +using CSV +using BenchmarkTools +using BenchmarkTools: Trial, TrialEstimate, median, mean + +py""" +import torch +import torch_scatter as sc +torch.set_num_threads(12) +cuda = torch.device("cuda:0") +d = 50 +nbins = 20 +""" + +d = 50 +nbins = 20 +getinfo(te::TrialEstimate) = te.time, te.gctime, te.memory +getstats(t::Trial) = [getinfo(minimum(t)), getinfo(median(t)), getinfo(mean(t)), + getinfo(maximum(t))] + +metadata = DataFrame(device=String[], dim=Int[], sample=Int[], bins=Int[]) +mintime = DataFrame(min_time=Float64[], min_gc=Float64[], min_mem=Int[]) +medtime = DataFrame(med_time=Float64[], med_gc=Float64[], med_mem=Int[]) +meantime = DataFrame(mean_time=Float64[], mean_gc=Float64[], mean_mem=Int[]) +maxtime = DataFrame(max_time=Float64[], max_gc=Float64[], max_mem=Int[]) + +for l = [2^5, 2^10, 2^15, 2^20] + py""" + hist = torch.zeros([d, nbins], dtype=torch.float32) + delta = torch.rand([d, $(l)], dtype=torch.float32) + idx = torch.randint(0, nbins, size=($(l),)) + + hist_gpu = torch.zeros([d, nbins], dtype=torch.float32, device=cuda) + delta_gpu = torch.rand([d, $(l)], dtype=torch.float32, device=cuda) + idx_gpu = torch.randint(0, nbins, size=($(l),), device=cuda) + + sc.scatter_add(delta, idx, out=hist) + sc.scatter_add(delta_gpu, idx_gpu, out=hist_gpu) + """ + + b_cpu = @benchmark py"sc.scatter_add(delta, idx, out=hist)"; + b_gpu = @benchmark py"sc.scatter_add(delta_gpu, idx_gpu, out=hist_gpu)"; + s_cpu = getstats(b_cpu) + s_gpu = getstats(b_gpu) + + push!(metadata, ("cpu", d, l, nbins)) + push!(mintime, s_cpu[1]) + push!(medtime, s_cpu[2]) + push!(meantime, s_cpu[3]) + push!(maxtime, s_cpu[4]) + + push!(metadata, ("gpu", d, l, nbins)) + push!(mintime, s_gpu[1]) + push!(medtime, s_gpu[2]) + push!(meantime, s_gpu[3]) + push!(maxtime, s_gpu[4]) +end + +data = hcat(metadata, mintime, medtime, meantime, maxtime) +CSV.write("benchmark_scatter_pytorch.tsv", data; delim="\t")