From dde8f86327b8d2ea00f48140dd25abe2cd843bc8 Mon Sep 17 00:00:00 2001 From: americast Date: Wed, 21 Mar 2018 15:45:03 +0530 Subject: [PATCH 1/7] max pool --- src/GPUArrays.jl | 3 ++- src/pool.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 src/pool.jl diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index c3516c96..a29eb271 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -18,9 +18,10 @@ include("convolution.jl") include("testsuite/testsuite.jl") include("jlbackend.jl") include("random.jl") +include("pool.jl") export GPUArray, gpu_call, thread_blocks_heuristic, global_size, synchronize_threads -export linear_index, @linearidx, @cartesianidx, convolution!, device, synchronize +export linear_index, @linearidx, @cartesianidx, convolution!, device, synchronize, maxpool2d export JLArray end # module diff --git a/src/pool.jl b/src/pool.jl new file mode 100644 index 00000000..0c766b0e --- /dev/null +++ b/src/pool.jl @@ -0,0 +1,42 @@ +import CUDAnative + +function maxpool2d_kernel(state, A::AbstractArray{T}, out, Asize, pool, stride_, outSize) where T + ilin = linear_index(state) + idx = GPUArrays.gpu_ind2sub(Asize, ilin) + if (idx[1] > outSize[1] || idx[2] > outSize[2] || idx[3] > outSize[3] || idx[4] > outSize[4]) + return + end + + temp_max = A[((idx[1] - 1) * stride_) + Asize[1] * (idx[2] - 1) * stride_ + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1] + max_pos = ((idx[1] - 1) * stride_) + Asize[1] * (idx[2] - 1) * stride_ + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1 + curr_pos = ((idx[1] - 1) * stride_) + Asize[1] * (idx[2] - 1) * stride_ + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1 + + for p in 1:pool + for p in 1:pool + m = A[curr_pos] + if (m > temp_max) + temp_max = m + max_pos = curr_pos + end + curr_pos += 1 + end + curr_pos += Asize[1] - pool + end + out[(idx[1] - 1) + outSize[1] * (idx[2] - 1) + (outSize[1] * outSize[2]) * (idx[3] - 1) + (outSize[1] * outSize[2] * outSize[3]) * (idx[4] - 1) + 1] = temp_max + return +end + + +function maxpool2d(a, pool; stride_ = 1) + Asize = UInt32.(size(a)) + pool = UInt32(pool) + stride_ = UInt32(stride_) + out = similar(a) + out = out[1:(div(Asize[1] - pool, stride_) + 1), 1:(div(Asize[2] - pool, stride_) + 1), :, :] + outSize = UInt32.(size(out)) + gpu_call(maxpool2d_kernel, a, (a, out, Asize, pool, stride_, outSize)) + GPUArrays.synchronize(out) + out +end + + From fea261373d661782751dc8e9e20bbed8dd422db7 Mon Sep 17 00:00:00 2001 From: americast Date: Mon, 9 Apr 2018 07:39:16 +0530 Subject: [PATCH 2/7] Add padding --- src/pool.jl | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index 0c766b0e..df79bded 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -1,15 +1,15 @@ import CUDAnative -function maxpool2d_kernel(state, A::AbstractArray{T}, out, Asize, pool, stride_, outSize) where T +function maxpool2d_kernel(state, A::AbstractArray{T}, out, Asize, pool, stride, outSize) where T ilin = linear_index(state) idx = GPUArrays.gpu_ind2sub(Asize, ilin) if (idx[1] > outSize[1] || idx[2] > outSize[2] || idx[3] > outSize[3] || idx[4] > outSize[4]) return end - temp_max = A[((idx[1] - 1) * stride_) + Asize[1] * (idx[2] - 1) * stride_ + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1] - max_pos = ((idx[1] - 1) * stride_) + Asize[1] * (idx[2] - 1) * stride_ + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1 - curr_pos = ((idx[1] - 1) * stride_) + Asize[1] * (idx[2] - 1) * stride_ + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1 + temp_max = A[((idx[1] - 1) * stride) + Asize[1] * (idx[2] - 1) * stride + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1] + max_pos = ((idx[1] - 1) * stride) + Asize[1] * (idx[2] - 1) * stride + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1 + curr_pos = ((idx[1] - 1) * stride) + Asize[1] * (idx[2] - 1) * stride + (Asize[1] * Asize[2]) * (idx[3] - 1) + (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1 for p in 1:pool for p in 1:pool @@ -27,14 +27,16 @@ function maxpool2d_kernel(state, A::AbstractArray{T}, out, Asize, pool, stride_, end -function maxpool2d(a, pool; stride_ = 1) - Asize = UInt32.(size(a)) +function maxpool2d(a, pool; stride = 1, pad = 0) + b = zeros(typeof(a), size(a,1) + pad * 2, size(a,2) + pad * 2, size(a,3), size(a,4)) + b[pad + 1 : pad + size(a,1), pad + 1 : pad + size(a,2), :, :] = a + Asize = UInt32.(size(b)) pool = UInt32(pool) - stride_ = UInt32(stride_) - out = similar(a) - out = out[1:(div(Asize[1] - pool, stride_) + 1), 1:(div(Asize[2] - pool, stride_) + 1), :, :] + stride = UInt32(stride) + out = similar(b) + out = out[1:(div(Asize[1] - pool, stride) + 1), 1:(div(Asize[2] - pool, stride) + 1), :, :] outSize = UInt32.(size(out)) - gpu_call(maxpool2d_kernel, a, (a, out, Asize, pool, stride_, outSize)) + gpu_call(maxpool2d_kernel, b, (b, out, Asize, pool, stride, outSize)) GPUArrays.synchronize(out) out end From 039b89ea830f862bac038f5196a5106fbd901f07 Mon Sep 17 00:00:00 2001 From: americast Date: Mon, 9 Apr 2018 13:54:37 +0530 Subject: [PATCH 3/7] Add sample test --- src/indexing.jl | 4 ++-- src/testsuite/pool.jl | 27 +++++++++++++++++++++++++++ src/testsuite/testsuite.jl | 2 ++ test/REQUIRE | 2 ++ 4 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 src/testsuite/pool.jl create mode 100644 test/REQUIRE diff --git a/src/indexing.jl b/src/indexing.jl index 40a4fd3c..086ca595 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -16,7 +16,7 @@ function _getindex(xs::GPUArray{T}, i::Integer) where T end function Base.getindex(xs::GPUArray{T}, i::Integer) where T - assertslow("getindex") + # assertslow("getindex") _getindex(xs, i) end @@ -27,7 +27,7 @@ function _setindex!(xs::GPUArray{T}, v::T, i::Integer) where T end function Base.setindex!(xs::GPUArray{T}, v::T, i::Integer) where T - assertslow("setindex!") + # assertslow("setindex!") _setindex!(xs, v, i) end diff --git a/src/testsuite/pool.jl b/src/testsuite/pool.jl new file mode 100644 index 00000000..21608014 --- /dev/null +++ b/src/testsuite/pool.jl @@ -0,0 +1,27 @@ +using GPUArrays.TestSuite, Base.Test, Flux + +function run_pool(Typ) + for ET in supported_eltypes() + T = Typ{ET} + if (ET == Complex{Float32} || ET == Complex{Float64}) + continue + end + @testset "$ET" begin + @testset "maxpool" begin + pool = 3 + stride = 3 + pad = 3 + + a = rand(ET, 9,9,3,1) + b = zeros(eltype(a), size(a,1) + pad * 2, size(a,2) + pad * 2, size(a,3), size(a,4)) + b[pad + 1 : pad + size(a,1), pad + 1 : pad + size(a,2), :, :] = a + out1 = maxpool(b, (3, 3)) + + a = T(a) + out2 = GPUArrays.maxpool2d(a, pool, stride = 3, pad = 3) + + @test out1 ≈ out2 + end + end + end +end diff --git a/src/testsuite/testsuite.jl b/src/testsuite/testsuite.jl index 20fa2ea0..26f5cfa1 100644 --- a/src/testsuite/testsuite.jl +++ b/src/testsuite/testsuite.jl @@ -42,6 +42,7 @@ include("base.jl") include("indexing.jl") # include("vector.jl") include("random.jl") +include("pool.jl") function supported_eltypes() (Float32, Float64, Int32, Int64, Complex64, Complex128) @@ -62,6 +63,7 @@ function run_tests(Typ) run_mapreduce(Typ) run_indexing(Typ) run_random(Typ) + run_pool(Typ) end export against_base, run_tests, supported_eltypes diff --git a/test/REQUIRE b/test/REQUIRE new file mode 100644 index 00000000..db2d53f9 --- /dev/null +++ b/test/REQUIRE @@ -0,0 +1,2 @@ +Flux +CUDAnative From 449810a08d56c3117ea8ceaca6c4aeb55acaebe2 Mon Sep 17 00:00:00 2001 From: americast Date: Thu, 12 Apr 2018 20:33:50 +0530 Subject: [PATCH 4/7] Automate stride, add more tests --- src/pool.jl | 2 +- src/testsuite/pool.jl | 31 +++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index df79bded..9582d61e 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -27,7 +27,7 @@ function maxpool2d_kernel(state, A::AbstractArray{T}, out, Asize, pool, stride, end -function maxpool2d(a, pool; stride = 1, pad = 0) +function maxpool2d{T <: Integer}(a, pool::T; stride = pool, pad = 0) b = zeros(typeof(a), size(a,1) + pad * 2, size(a,2) + pad * 2, size(a,3), size(a,4)) b[pad + 1 : pad + size(a,1), pad + 1 : pad + size(a,2), :, :] = a Asize = UInt32.(size(b)) diff --git a/src/testsuite/pool.jl b/src/testsuite/pool.jl index 21608014..76a9e63c 100644 --- a/src/testsuite/pool.jl +++ b/src/testsuite/pool.jl @@ -7,7 +7,7 @@ function run_pool(Typ) continue end @testset "$ET" begin - @testset "maxpool" begin + @testset "maxpool with padding" begin pool = 3 stride = 3 pad = 3 @@ -18,7 +18,34 @@ function run_pool(Typ) out1 = maxpool(b, (3, 3)) a = T(a) - out2 = GPUArrays.maxpool2d(a, pool, stride = 3, pad = 3) + out2 = GPUArrays.maxpool2d(a, pool, pad = pad) + + @test out1 ≈ out2 + end + + @testset "maxpool without padding" begin + pool = 3 + stride = 3 + + a = rand(ET, 9,9,3,1) + out1 = maxpool(a, (3, 3)) + + a = T(a) + out2 = GPUArrays.maxpool2d(a, pool) + + @test out1 ≈ out2 + end + + + @testset "maxpool with full kernel" begin + pool = 9 + stride = 1 + + a = rand(ET, 9,9,3,1) + out1 = maxpool(a, (9, 9)) + + a = T(a) + out2 = GPUArrays.maxpool2d(a, pool, stride = stride) @test out1 ≈ out2 end From 53cc937afca24c3c4103b9cea7cb5a2757f459c0 Mon Sep 17 00:00:00 2001 From: americast Date: Fri, 20 Apr 2018 20:58:30 +0530 Subject: [PATCH 5/7] Remove wrong import; adjust dims in similar --- src/pool.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index 9582d61e..7e4f5e70 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -1,5 +1,3 @@ -import CUDAnative - function maxpool2d_kernel(state, A::AbstractArray{T}, out, Asize, pool, stride, outSize) where T ilin = linear_index(state) idx = GPUArrays.gpu_ind2sub(Asize, ilin) @@ -33,9 +31,10 @@ function maxpool2d{T <: Integer}(a, pool::T; stride = pool, pad = 0) Asize = UInt32.(size(b)) pool = UInt32(pool) stride = UInt32(stride) - out = similar(b) - out = out[1:(div(Asize[1] - pool, stride) + 1), 1:(div(Asize[2] - pool, stride) + 1), :, :] - outSize = UInt32.(size(out)) + outSize = [i for i in size(b)] + outSize[1:2] = [div(Asize[1] - pool, stride) + 1, div(Asize[2] - pool, stride) + 1] + out = similar(b, outSize...) + outSize = UInt32.(tuple(outSize...)) gpu_call(maxpool2d_kernel, b, (b, out, Asize, pool, stride, outSize)) GPUArrays.synchronize(out) out From 1e1104e56e6d52127f906e251247a123dc45e922 Mon Sep 17 00:00:00 2001 From: americast Date: Sat, 21 Apr 2018 03:11:37 +0530 Subject: [PATCH 6/7] Update calling function Co-authored-by: SimonDanisch --- src/pool.jl | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index 7e4f5e70..7503a4f0 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -26,17 +26,14 @@ end function maxpool2d{T <: Integer}(a, pool::T; stride = pool, pad = 0) - b = zeros(typeof(a), size(a,1) + pad * 2, size(a,2) + pad * 2, size(a,3), size(a,4)) - b[pad + 1 : pad + size(a,1), pad + 1 : pad + size(a,2), :, :] = a - Asize = UInt32.(size(b)) - pool = UInt32(pool) - stride = UInt32(stride) - outSize = [i for i in size(b)] - outSize[1:2] = [div(Asize[1] - pool, stride) + 1, div(Asize[2] - pool, stride) + 1] - out = similar(b, outSize...) - outSize = UInt32.(tuple(outSize...)) - gpu_call(maxpool2d_kernel, b, (b, out, Asize, pool, stride, outSize)) - GPUArrays.synchronize(out) + a2 = size.((a,), (1, 2)) + b = zeros(typeof(a), (a2 .+ 2pad)..., size(a, 3), size(a, 4)) + apad = a2 .+ pad + b[pad + 1 : apad[1], pad + 1 : apad[2], :, :] = a + as = ((size(b) .- pool) .÷ stride) .+ 1 + out = similar(b, (as[1], as[2], size(b, 3), size(b, 4))) + sizes = map(x-> UInt32.(x), (size(b), pool, stride, size(out))) + gpu_call(maxpool2d_kernel, b, (b, out, sizes...)) out end From 7886e5af16c5e04c399394402608fe419d130a7e Mon Sep 17 00:00:00 2001 From: americast Date: Sat, 21 Apr 2018 03:50:54 +0530 Subject: [PATCH 7/7] Add assertslow to test --- src/indexing.jl | 4 ++-- src/testsuite/pool.jl | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/indexing.jl b/src/indexing.jl index 4b98e3e7..5ab4b0e3 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -16,7 +16,7 @@ function _getindex(xs::GPUArray{T}, i::Integer) where T end function Base.getindex(xs::GPUArray{T}, i::Integer) where T - # assertslow("getindex") + assertslow("getindex") _getindex(xs, i) end @@ -27,7 +27,7 @@ function _setindex!(xs::GPUArray{T}, v::T, i::Integer) where T end function Base.setindex!(xs::GPUArray{T}, v::T, i::Integer) where T - # assertslow("setindex!") + assertslow("setindex!") _setindex!(xs, v, i) end diff --git a/src/testsuite/pool.jl b/src/testsuite/pool.jl index 76a9e63c..7649e04e 100644 --- a/src/testsuite/pool.jl +++ b/src/testsuite/pool.jl @@ -18,6 +18,7 @@ function run_pool(Typ) out1 = maxpool(b, (3, 3)) a = T(a) + GPUArrays.allowslow(true) out2 = GPUArrays.maxpool2d(a, pool, pad = pad) @test out1 ≈ out2 @@ -31,6 +32,7 @@ function run_pool(Typ) out1 = maxpool(a, (3, 3)) a = T(a) + GPUArrays.allowslow(true) out2 = GPUArrays.maxpool2d(a, pool) @test out1 ≈ out2 @@ -45,6 +47,7 @@ function run_pool(Typ) out1 = maxpool(a, (9, 9)) a = T(a) + GPUArrays.allowslow(true) out2 = GPUArrays.maxpool2d(a, pool, stride = stride) @test out1 ≈ out2