diff --git a/benchmark/pics/cpu_scatter.png b/benchmark/pics/cpu_scatter.png
new file mode 100644
index 000000000..d0729e7a6
Binary files /dev/null and b/benchmark/pics/cpu_scatter.png differ
diff --git a/benchmark/pics/cpu_scatter.svg b/benchmark/pics/cpu_scatter.svg
new file mode 100644
index 000000000..5f977c38d
--- /dev/null
+++ b/benchmark/pics/cpu_scatter.svg
@@ -0,0 +1,333 @@
+
+
diff --git a/benchmark/pics/gpu_scatter.png b/benchmark/pics/gpu_scatter.png
new file mode 100644
index 000000000..9129d1be0
Binary files /dev/null and b/benchmark/pics/gpu_scatter.png differ
diff --git a/benchmark/pics/gpu_scatter.svg b/benchmark/pics/gpu_scatter.svg
new file mode 100644
index 000000000..e8a317ab3
--- /dev/null
+++ b/benchmark/pics/gpu_scatter.svg
@@ -0,0 +1,325 @@
+
+
diff --git a/benchmark/scatter.jl b/benchmark/scatter.jl
new file mode 100644
index 000000000..4b5ad56ce
--- /dev/null
+++ b/benchmark/scatter.jl
@@ -0,0 +1,67 @@
+using CUDAdrv
+using CUDAnative
+using CuArrays
+using GeometricFlux
+using DataFrames
+using CSV
+using BenchmarkTools
+using BenchmarkTools: Trial, TrialEstimate, median, mean
+# using ProfileView
+
+d = 50
+nbins = 20
+getinfo(te::TrialEstimate) = te.time, te.gctime, te.memory
+getstats(t::Trial) = [getinfo(minimum(t)), getinfo(median(t)), getinfo(mean(t)),
+ getinfo(maximum(t))]
+
+metadata = DataFrame(device=String[], dim=Int[], sample=Int[], bins=Int[])
+mintime = DataFrame(min_time=Float64[], min_gc=Float64[], min_mem=Int[])
+medtime = DataFrame(med_time=Float64[], med_gc=Float64[], med_mem=Int[])
+meantime = DataFrame(mean_time=Float64[], mean_gc=Float64[], mean_mem=Int[])
+maxtime = DataFrame(max_time=Float64[], max_gc=Float64[], max_mem=Int[])
+
+for l = [2^5, 2^10, 2^15, 2^20]
+ hist = zeros(Float32, d, nbins)
+ δ = rand(Float32, d, l)
+ idx = rand(1:nbins, l)
+
+ hist_gpu = CuArray(hist)
+ δ_gpu = CuArray(δ)
+ idx_gpu = CuArray(idx)
+
+ scatter_add!(hist, δ, idx)
+ scatter_add!(hist_gpu, δ_gpu, idx_gpu)
+
+ b_cpu = @benchmark scatter_add!($hist, $δ, $idx)
+ b_gpu = @benchmark scatter_add!($hist_gpu, $δ_gpu, $idx_gpu)
+ s_cpu = getstats(b_cpu)
+ s_gpu = getstats(b_gpu)
+
+ push!(metadata, ("cpu", d, l, nbins))
+ push!(mintime, s_cpu[1])
+ push!(medtime, s_cpu[2])
+ push!(meantime, s_cpu[3])
+ push!(maxtime, s_cpu[4])
+
+ push!(metadata, ("gpu", d, l, nbins))
+ push!(mintime, s_gpu[1])
+ push!(medtime, s_gpu[2])
+ push!(meantime, s_gpu[3])
+ push!(maxtime, s_gpu[4])
+end
+
+data = hcat(metadata, mintime, medtime, meantime, maxtime)
+CSV.write("benchmark_scatter_julia.tsv", data; delim="\t")
+
+
+## Benchmark
+# @benchmark scatter_add!($hist, $δ, $idx)
+# CuArrays.@time scatter_add!(hist_gpu, δ_gpu, idx_gpu)
+
+## Profiling
+# sudo nvprof --profile-from-start off julia-1.3 benchmarks/scatter.jl
+# sudo nvprof --profile-from-start off --print-gpu-trace julia-1.3 --proj benchmarks/scatter.jl
+# sudo chown yuehhua -R /home/yuehhua/.julia/
+
+# @profview scatter_add!(hist, δ, idx)
+# CUDAdrv.@profile scatter_add!(hist_gpu, δ_gpu, idx_gpu)
diff --git a/benchmark/scatter_py.jl b/benchmark/scatter_py.jl
new file mode 100644
index 000000000..9bc7b638d
--- /dev/null
+++ b/benchmark/scatter_py.jl
@@ -0,0 +1,61 @@
+using PyCall
+using DataFrames
+using CSV
+using BenchmarkTools
+using BenchmarkTools: Trial, TrialEstimate, median, mean
+
+py"""
+import torch
+import torch_scatter as sc
+torch.set_num_threads(12)
+cuda = torch.device("cuda:0")
+d = 50
+nbins = 20
+"""
+
+d = 50
+nbins = 20
+getinfo(te::TrialEstimate) = te.time, te.gctime, te.memory
+getstats(t::Trial) = [getinfo(minimum(t)), getinfo(median(t)), getinfo(mean(t)),
+ getinfo(maximum(t))]
+
+metadata = DataFrame(device=String[], dim=Int[], sample=Int[], bins=Int[])
+mintime = DataFrame(min_time=Float64[], min_gc=Float64[], min_mem=Int[])
+medtime = DataFrame(med_time=Float64[], med_gc=Float64[], med_mem=Int[])
+meantime = DataFrame(mean_time=Float64[], mean_gc=Float64[], mean_mem=Int[])
+maxtime = DataFrame(max_time=Float64[], max_gc=Float64[], max_mem=Int[])
+
+for l = [2^5, 2^10, 2^15, 2^20]
+ py"""
+ hist = torch.zeros([d, nbins], dtype=torch.float32)
+ delta = torch.rand([d, $(l)], dtype=torch.float32)
+ idx = torch.randint(0, nbins, size=($(l),))
+
+ hist_gpu = torch.zeros([d, nbins], dtype=torch.float32, device=cuda)
+ delta_gpu = torch.rand([d, $(l)], dtype=torch.float32, device=cuda)
+ idx_gpu = torch.randint(0, nbins, size=($(l),), device=cuda)
+
+ sc.scatter_add(delta, idx, out=hist)
+ sc.scatter_add(delta_gpu, idx_gpu, out=hist_gpu)
+ """
+
+ b_cpu = @benchmark py"sc.scatter_add(delta, idx, out=hist)";
+ b_gpu = @benchmark py"sc.scatter_add(delta_gpu, idx_gpu, out=hist_gpu)";
+ s_cpu = getstats(b_cpu)
+ s_gpu = getstats(b_gpu)
+
+ push!(metadata, ("cpu", d, l, nbins))
+ push!(mintime, s_cpu[1])
+ push!(medtime, s_cpu[2])
+ push!(meantime, s_cpu[3])
+ push!(maxtime, s_cpu[4])
+
+ push!(metadata, ("gpu", d, l, nbins))
+ push!(mintime, s_gpu[1])
+ push!(medtime, s_gpu[2])
+ push!(meantime, s_gpu[3])
+ push!(maxtime, s_gpu[4])
+end
+
+data = hcat(metadata, mintime, medtime, meantime, maxtime)
+CSV.write("benchmark_scatter_pytorch.tsv", data; delim="\t")