Skip to content

Commit fc45a83

Browse files
committed
Populate AMDGPU extension
1 parent d930446 commit fc45a83

File tree

3 files changed

+102
-28
lines changed

3 files changed

+102
-28
lines changed

ext/OceananigansAMDGPUExt.jl

Lines changed: 91 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,102 @@
11
module OceananigansAMDGPUExt
22

3-
using AMDGPU
43
using Oceananigans
4+
using InteractiveUtils
5+
using AMDGPU, AMDGPU.ROCSPARSE, AMDGPU.ROCFFT
56
using Oceananigans.Utils: linear_expand, __linear_ndrange, MappedCompilerMetadata
67
using KernelAbstractions: __dynamic_checkbounds, __iterspace
7-
import KernelAbstractions: __validindex
8+
using KernelAbstractions
9+
import Oceananigans.Architectures as AC
10+
import Oceananigans.BoundaryConditions as BC
11+
import Oceananigans.DistributedComputations as DC
12+
import Oceananigans.Fields as FD
13+
import Oceananigans.Grids as GD
14+
import Oceananigans.Solvers as SO
15+
import Oceananigans.Utils as UT
16+
import SparseArrays: SparseMatrixCSC
17+
import KernelAbstractions: __iterspace, __groupindex, __dynamic_checkbounds,
18+
__validindex, CompilerMetadata
19+
import Oceananigans.DistributedComputations: Distributed
820

9-
import Oceananigans.Architectures:
10-
architecture,
11-
convert_to_device,
12-
on_architecture
21+
# Argument types accepted by the `UT.getdevice` methods defined in this extension.
# Only names that exist here are listed: the CUDA-specific `CuContext` and `CuPtr`
# are not in scope in the AMDGPU extension, and referencing them would raise an
# `UndefVarError` while the module is being loaded.
const GPUVar = Union{ROCArray, Ptr}
1322

14-
const ROCGPU = GPU{<:AMDGPU.ROCBackend}
15-
ROCGPU() = GPU(AMDGPU.ROCBackend())
23+
# Module initializer: when `AMDGPU.functional()` is true, emit one debug-level
# log line per device returned by `AMDGPU.devices()`. Always returns `nothing`.
function __init__()
    AMDGPU.functional() || return nothing

    @debug "ROCm-enabled GPU(s) detected:"
    # The positional index is not needed for the message, so iterate devices directly.
    for dev in AMDGPU.devices()
        @debug "$dev: $(AMDGPU.name(dev))"
    end

    return nothing
end
31+
32+
# Alias for an Oceananigans GPU architecture whose device is AMDGPU's `ROCBackend`,
# together with a zero-argument convenience constructor.
const ROCGPU = AC.GPU{AMDGPU.ROCBackend}
ROCGPU() = AC.GPU(ROCBackend())
1634

1735
# NOTE(review): `architecture` is no longer explicitly imported from
# Oceananigans.Architectures in this revision, so this appears to define a
# module-local function rather than extend `AC.architecture` (which has its own
# `ROCArray` method in this file) — confirm whether this definition is still needed.
function architecture(::ROCArray)
    return ROCGPU()
end

# Short display name for the ROCm architecture.
function Base.summary(::ROCGPU)
    return "ROCGPU"
end
1937

20-
on_architecture(::ROCGPU, a::Number) = a
21-
on_architecture(::ROCGPU, a::Array) = ROCArray(a)
22-
on_architecture(::ROCGPU, a::BitArray) = ROCArray(a)
23-
on_architecture(::ROCGPU, a::SubArray{<:Any, <:Any, <:Array}) = ROCArray(a)
24-
on_architecture(::CPU, a::ROCArray) = Array(a)
25-
on_architecture(::CPU, a::SubArray{<:Any, <:Any, <:ROCArray}) = Array(a)
26-
on_architecture(::ROCGPU, a::ROCArray) = a
27-
on_architecture(::ROCGPU, a::SubArray{<:Any, <:Any, <:ROCArray}) = a
28-
on_architecture(::ROCGPU, a::StepRangeLen) = a
38+
# Dense and sparse ROCm arrays both map to the `ROCGPU` architecture.
function AC.architecture(::ROCArray)
    return ROCGPU()
end

function AC.architecture(::ROCSparseMatrixCSC)
    return ROCGPU()
end

# Array type used on the ROCm architecture.
function AC.array_type(::ROCGPU)
    return ROCArray
end
41+
42+
# `on_architecture` methods for ROCm arrays.

# Values returned unchanged on a ROCm GPU.
AC.on_architecture(::ROCGPU, a::Number) = a
AC.on_architecture(::ROCGPU, a::StepRangeLen) = a
AC.on_architecture(::ROCGPU, a::ROCArray) = a
AC.on_architecture(::ROCGPU, a::SubArray{<:Any, <:Any, <:ROCArray}) = a

# Host arrays (and views of them) are converted to `ROCArray`.
AC.on_architecture(::ROCGPU, a::Array) = ROCArray(a)
AC.on_architecture(::ROCGPU, a::BitArray) = ROCArray(a)
AC.on_architecture(::ROCGPU, a::SubArray{<:Any, <:Any, <:Array}) = ROCArray(a)

# ROCm arrays (and views of them) are converted to `Array` for the CPU.
# `AC.CPU` is qualified for consistency with every other method in this file.
AC.on_architecture(::AC.CPU, a::ROCArray) = Array(a)
AC.on_architecture(::AC.CPU, a::SubArray{<:Any, <:Any, <:ROCArray}) = Array(a)

# Distributed architectures defer to their child architecture. Both methods call
# the qualified `AC.child_architecture`; the bare name is not imported here.
AC.on_architecture(arch::Distributed, a::ROCArray) =
    AC.on_architecture(AC.child_architecture(arch), a)
AC.on_architecture(arch::Distributed, a::SubArray{<:Any, <:Any, <:ROCArray}) =
    AC.on_architecture(AC.child_architecture(arch), a)
53+
54+
# Unified arrays are not available on ROCm: always throws.
# The method extends `AC.unified_array` and dispatches on the `ROCGPU` architecture
# type; `AMDGPU` itself is a module and cannot be used as an argument type.
function AC.unified_array(::ROCGPU, a::AbstractArray)
    error("unified_array is not implemented for ROCGPU.")
end
57+
58+
## GPU to GPU copy of contiguous data
59+
# Copy `src` into `dst` with `copyto!` and return `dst`.
# `async = true` is accepted but only produces a warning; the copy itself is
# performed the same way regardless of the flag.
@inline function AC.device_copy_to!(dst::ROCArray, src::ROCArray; async::Bool = false)
    async && @warn "Asynchronous copy is not supported for ROCArray. Falling back to synchronous copy."
    copyto!(dst, src)
    return dst
end
66+
67+
# Forward to `AMDGPU.unsafe_free!` for ROCm arrays.
@inline function AC.unsafe_free!(a::ROCArray)
    return AMDGPU.unsafe_free!(a)
end
68+
69+
# Host CSC matrix -> (colptr, rowval, nzval, (m, n)) with the three vectors as `ROCArray`s.
@inline function AC.constructors(::ROCGPU, A::SparseMatrixCSC)
    colptr = ROCArray(A.colptr)
    rowval = ROCArray(A.rowval)
    nzval = ROCArray(A.nzval)
    return (colptr, rowval, nzval, (A.m, A.n))
end

# ROCm CSC matrix -> (m, n, colptr, rowval, nzval) as host arrays; the index
# vectors are converted to `Int64`.
@inline function AC.constructors(::AC.CPU, A::ROCSparseMatrixCSC)
    nrows = A.dims[1]
    ncols = A.dims[2]
    colptr = Int64.(Array(A.colPtr))
    rowval = Int64.(Array(A.rowVal))
    nzval = Array(A.nzVal)
    return (nrows, ncols, colptr, rowval, nzval)
end

# ROCm CSC matrix -> its own fields, without any copy or conversion.
@inline function AC.constructors(::ROCGPU, A::ROCSparseMatrixCSC)
    return (A.colPtr, A.rowVal, A.nzVal, A.dims)
end
72+
73+
# Build a ROCm sparse matrix from a constructor tuple.
@inline AC.arch_sparse_matrix(::ROCGPU, constr::Tuple) = ROCSparseMatrixCSC(constr...)

# ROCm sparse matrix -> host `SparseMatrixCSC`.
@inline AC.arch_sparse_matrix(::AC.CPU, A::ROCSparseMatrixCSC) =
    SparseMatrixCSC(AC.constructors(AC.CPU(), A)...)

# Host `SparseMatrixCSC` -> ROCm sparse matrix. The architecture this method was
# dispatched on is forwarded to `AC.constructors`, so the ROCm-specific method is
# selected regardless of what a bare `AC.GPU()` would construct.
@inline AC.arch_sparse_matrix(arch::ROCGPU, A::SparseMatrixCSC) =
    ROCSparseMatrixCSC(AC.constructors(arch, A)...)

# Already a ROCm sparse matrix: returned unchanged.
@inline AC.arch_sparse_matrix(::ROCGPU, A::ROCSparseMatrixCSC) = A
2977

3078
# Convert kernel arguments with `AMDGPU.rocconvert`; tuples are converted element-wise.
# The methods are qualified with `AC.` so that they extend
# `Oceananigans.Architectures.convert_to_device`; the bare name is not imported
# in this module and would otherwise create an unrelated local function.
@inline AC.convert_to_device(::ROCGPU, args) = AMDGPU.rocconvert(args)
@inline AC.convert_to_device(::ROCGPU, args::Tuple) = map(AMDGPU.rocconvert, args)
3280

81+
82+
# A `ROCArray` boundary condition is accepted on GPU architectures...
function BC.validate_boundary_condition_architecture(::ROCArray, ::AC.GPU, bc, side)
    return nothing
end

# ...and rejected with an `ArgumentError` on the CPU.
function BC.validate_boundary_condition_architecture(::ROCArray, ::AC.CPU, bc, side)
    msg = "$side $bc must use `Array` rather than `ROCArray` on CPU architectures!"
    throw(ArgumentError(msg))
end
86+
87+
# Plan an in-place forward FFT of `A` along `dims` for Bounded or Periodic topologies.
# Returns `nothing` when `dims` is empty. `planner_flag` is accepted for interface
# compatibility and is not used here.
function SO.plan_forward_transform(A::ROCArray, ::Union{GD.Bounded, GD.Periodic}, dims, planner_flag)
    isempty(dims) && return nothing
    # AMDGPU.jl's FFT submodule is named `rocFFT` (lower-case prefix).
    return AMDGPU.rocFFT.plan_fft!(A, dims)
end
91+
92+
# Setting a field from a `ROCArray` forwards to the module's `_set!`.
function FD.set!(v::Field, a::ROCArray)
    return FD._set!(v, a)
end

# Same forwarding for distributed fields.
function DC.set!(v::DC.DistributedField, a::ROCArray)
    return DC._set!(v, a)
end
94+
95+
# Plan an in-place inverse FFT of `A` along `dims` for Bounded or Periodic topologies.
# Returns `nothing` when `dims` is empty. `planner_flag` is accepted for interface
# compatibility and is not used here.
function SO.plan_backward_transform(A::ROCArray, ::Union{GD.Bounded, GD.Periodic}, dims, planner_flag)
    isempty(dims) && return nothing
    # AMDGPU.jl's FFT submodule is named `rocFFT` (lower-case prefix).
    return AMDGPU.rocFFT.plan_ifft!(A, dims)
end
99+
33100
AMDGPU.Device.@device_override @inline function __validindex(ctx::MappedCompilerMetadata)
34101
if __dynamic_checkbounds(ctx)
35102
I = @inbounds linear_expand(__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
@@ -39,4 +106,11 @@ AMDGPU.Device.@device_override @inline function __validindex(ctx::MappedCompiler
39106
end
40107
end
41108

109+
# Device-management hooks for Oceananigans.Utils on ROCm.
# Every AMDGPU function is called through the `AMDGPU` module: there is no `ROC`
# binding in this extension, so `ROC.synchronize()` would fail when called.
# NOTE(review): the device type is written as `AMDGPU.HIPDevice`, assuming a
# HIP-based AMDGPU.jl release where `ROCDevice` no longer exists — confirm
# against the AMDGPU.jl version this extension targets.
@inline UT.sync_device!(::AMDGPU.HIPDevice) = AMDGPU.synchronize()
@inline UT.sync_device!(::ROCGPU) = AMDGPU.synchronize()
@inline UT.sync_device!(::ROCBackend) = AMDGPU.synchronize()

# The optional index `i` is ignored; the device is looked up from the value itself.
@inline UT.getdevice(roc::GPUVar, i) = AMDGPU.device(roc)
@inline UT.getdevice(roc::GPUVar) = AMDGPU.device(roc)

@inline UT.switch_device!(dev::AMDGPU.HIPDevice) = AMDGPU.device!(dev)
115+
42116
end # module

ext/OceananigansCUDAExt.jl

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ module OceananigansCUDAExt
33
using Oceananigans
44
using InteractiveUtils
55
using CUDA, CUDA.CUSPARSE, CUDA.CUFFT
6+
using Oceananigans.Utils: linear_expand, __linear_ndrange, MappedCompilerMetadata
7+
using KernelAbstractions: __dynamic_checkbounds, __iterspace
68
using KernelAbstractions
79
import Oceananigans.Architectures as AC
810
import Oceananigans.BoundaryConditions as BC
@@ -19,7 +21,7 @@ import Oceananigans.DistributedComputations: Distributed
1921
const GPUVar = Union{CuArray, CuContext, CuPtr, Ptr}
2022

2123
function __init__()
22-
if CUDA.has_cuda()
24+
if CUDA.functional()
2325
@debug "CUDA-enabled GPU(s) detected:"
2426
for (gpu, dev) in enumerate(CUDA.devices())
2527
@debug "$dev: $(CUDA.name(dev))"
@@ -56,15 +58,17 @@ AC.architecture(::CuArray) = CUDAGPU()
5658
AC.architecture(::CuSparseMatrixCSC) = CUDAGPU()
5759
AC.array_type(::AC.GPU{CUDABackend}) = CuArray
5860

61+
# `on_architecture` methods for CUDA arrays.

# Values returned unchanged on a CUDA GPU.
AC.on_architecture(::CUDAGPU, a::Number) = a
AC.on_architecture(::CUDAGPU, a::StepRangeLen) = a
AC.on_architecture(::CUDAGPU, a::CuArray) = a
AC.on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:CuArray}) = a

# Host arrays (and views of them) are converted to `CuArray`.
AC.on_architecture(::CUDAGPU, a::Array) = CuArray(a)
AC.on_architecture(::CUDAGPU, a::BitArray) = CuArray(a)
AC.on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:Array}) = CuArray(a)

# CUDA arrays (and views of them) are converted to `Array` for the CPU.
AC.on_architecture(::AC.CPU, a::CuArray) = Array(a)
AC.on_architecture(::AC.CPU, a::SubArray{<:Any, <:Any, <:CuArray}) = Array(a)

# Distributed architectures defer to their child architecture. Both methods call
# the qualified `AC.child_architecture`, matching each other.
AC.on_architecture(arch::Distributed, a::CuArray) =
    AC.on_architecture(AC.child_architecture(arch), a)
AC.on_architecture(arch::Distributed, a::SubArray{<:Any, <:Any, <:CuArray}) =
    AC.on_architecture(AC.child_architecture(arch), a)
6872

6973
# cu alters the type of `a`, so we convert it back to the correct type
7074
unified_array(::CUDAGPU, a::AbstractArray) = map(eltype(a), cu(a; unified = true))
@@ -86,13 +90,9 @@ end
8690
@inline AC.constructors(::AC.CPU, A::CuSparseMatrixCSC) = (A.dims[1], A.dims[2], Int64.(Array(A.colPtr)), Int64.(Array(A.rowVal)), Array(A.nzVal))
8791
@inline AC.constructors(::AC.GPU{CUDABackend}, A::CuSparseMatrixCSC) = (A.colPtr, A.rowVal, A.nzVal, A.dims)
8892

89-
@inline AC.unpack_constructors(::AC.CPU, constr::Tuple) = (constr[3], constr[4], constr[5])
90-
@inline AC.copy_unpack_constructors(::AC.CPU, constr::Tuple) = deepcopy((constr[3], constr[4], constr[5]))
91-
9293
@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, constr::Tuple) = CuSparseMatrixCSC(constr...)
9394
@inline AC.arch_sparse_matrix(::AC.CPU, A::CuSparseMatrixCSC) = SparseMatrixCSC(AC.constructors(AC.CPU(), A)...)
9495
@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, A::SparseMatrixCSC) = CuSparseMatrixCSC(AC.constructors(AC.GPU(), A)...)
95-
9696
@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, A::CuSparseMatrixCSC) = A
9797

9898
@inline AC.convert_to_device(::CUDAGPU, args) = CUDA.cudaconvert(args)
@@ -119,10 +119,10 @@ end
119119

120120
# CUDA version, the indices are passed implicitly
121121
# You must not use KA here as this code is executed in another scope
122-
CUDA.@device_override @inline function KernelAbstractions.__validindex(ctx::UT.MappedCompilerMetadata)
122+
CUDA.@device_override @inline function __validindex(ctx::MappedCompilerMetadata)
123123
if __dynamic_checkbounds(ctx)
124-
index = @inbounds UT.linear_expand(__iterspace(ctx), blockIdx().x, threadIdx().x)
125-
return index ≤ UT.__linear_ndrange(ctx)
124+
index = @inbounds linear_expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
125+
return index ≤ __linear_ndrange(ctx)
126126
else
127127
return true
128128
end
@@ -134,7 +134,5 @@ end
134134
@inline UT.switch_device!(dev::CuDevice) = device!(dev)
135135
@inline UT.sync_device!(::CUDAGPU) = CUDA.synchronize()
136136
@inline UT.sync_device!(::CUDABackend) = CUDA.synchronize()
137-
AC.on_architecture(arch::Distributed, a::CuArray) = AC.on_architecture(AC.child_architecture(arch), a)
138-
AC.on_architecture(arch::Distributed, a::SubArray{<:Any, <:Any, <:CuArray}) = AC.on_architecture(child_architecture(arch), a)
139137

140138
end # module OceananigansCUDAExt

src/Architectures.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,10 @@ unified_array(::GPU, a) = a
117117
@inline constructors(::CPU, m::Number, n::Number, constr::Tuple) = (m, n, constr...)
118118
@inline constructors(::GPU, m::Number, n::Number, constr::Tuple) = (constr..., (m, n))
119119

120+
# Select the three array entries of a constructor tuple:
# positions 3-5 for the CPU layout, positions 1-3 for the GPU layout.
@inline function unpack_constructors(::CPU, constr::Tuple)
    return (constr[3], constr[4], constr[5])
end

@inline function unpack_constructors(::GPU, constr::Tuple)
    return (constr[1], constr[2], constr[3])
end

# Same selection as `unpack_constructors`, but returning a deep copy of the selected entries.
@inline function copy_unpack_constructors(::CPU, constr::Tuple)
    selected = (constr[3], constr[4], constr[5])
    return deepcopy(selected)
end

@inline function copy_unpack_constructors(::GPU, constr::Tuple)
    selected = (constr[1], constr[2], constr[3])
    return deepcopy(selected)
end
123125

124126
@inline arch_sparse_matrix(::CPU, constr::Tuple) = SparseMatrixCSC(constr...)

0 commit comments

Comments
 (0)