[MLIR][GPU] Add support for non-portable cluster size attribute #95545
@@ -544,7 +544,8 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func", [
       Optional<LaunchIndx>:$clusterSizeZ,
       Optional<I32>:$dynamicSharedMemorySize,
       Variadic<AnyType>:$kernelOperands,
-      Optional<AnyType>:$asyncObject)>,
+      Optional<AnyType>:$asyncObject,
+      OptionalAttr<BoolAttr>:$nonPortableClusterSize)>,
   Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "Launches a function as a GPU kernel";
@@ -585,6 +586,10 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func", [
     arguments are present, the Op launches a kernel that clusters the given
     thread blocks. This feature is exclusive to certain architectures.

+    The `gpu.launch_func` also supports the following optional runtime
+    attributes:
+    - `nonPortableClusterSize` - launch kernel with non-portable cluster size
+      (only supported on certain architectures)

     Example:

     ```mlir
@@ -640,7 +645,7 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func", [
               // memory to allocate for a workgroup.
           args(%arg0 : f32,               // (Optional) Kernel arguments.
                %arg1 : memref<?xf32, 1>)
-    }
+    } { nonPortableClusterSize = true }  // Attributes
     ```
   }];
@@ -652,12 +657,14 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func", [
       "ValueRange":$kernelOperands,
       CArg<"Type", "nullptr">:$asyncTokenType,
       CArg<"ValueRange", "{}">:$asyncDependencies,
-      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>,
+      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize,
+      CArg<"BoolAttr", "{}">:$nonPortableClusterSize)>,
     OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize,
       "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
       "ValueRange":$kernelOperands,
       CArg<"Value", "nullptr">:$asyncObject,
-      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>
+      CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize,
+      CArg<"BoolAttr", "{}">:$nonPortableClusterSize)>
   ];
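For illustration, here is a sketch of how C++ code might call the extended `launch_func` builder. Only the parameter list comes from this diff; the helper name, includes, and surrounding setup are assumptions:

```cpp
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"
#include <optional>

using namespace mlir;

// Hypothetical helper: creates a gpu.launch_func op that opts into a
// non-portable cluster shape via the new builder argument.
static gpu::LaunchFuncOp createNonPortableLaunch(
    OpBuilder &b, Location loc, SymbolRefAttr kernel, gpu::KernelDim3 grid,
    gpu::KernelDim3 block, Value dynamicSharedMemorySize,
    ValueRange kernelOperands, std::optional<gpu::KernelDim3> clusterSize) {
  // A true BoolAttr requests a cluster size beyond the portable limit.
  BoolAttr nonPortable = b.getBoolAttr(true);
  return b.create<gpu::LaunchFuncOp>(
      loc, kernel, grid, block, dynamicSharedMemorySize, kernelOperands,
      /*asyncTokenType=*/Type(), /*asyncDependencies=*/ValueRange(),
      /*clusterSize=*/clusterSize, /*nonPortableClusterSize=*/nonPortable);
}
```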
  let extraClassDeclaration = [{
@@ -720,7 +727,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       Optional<Index>:$clusterSizeX,
       Optional<Index>:$clusterSizeY,
       Optional<Index>:$clusterSizeZ,
-      Optional<I32>:$dynamicSharedMemorySize)>,
+      Optional<I32>:$dynamicSharedMemorySize,
+      OptionalAttr<BoolAttr>:$nonPortableClusterSize)>,

Review comment: ditto

   Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "GPU kernel launch operation";
@@ -815,10 +823,20 @@ def GPU_LaunchOp : GPU_Op<"launch", [
            blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
            threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
 {
   // Cluster, block and thread identifiers, as well as cluster/block/grid
   // sizes are immediately usable inside body region.
   "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
 }

+// Launch with non-portable cluster size attribute.
+gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
+           blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
+           threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
+{
+  // Cluster, block and thread identifiers, as well as cluster/block/grid
+  // sizes are immediately usable inside body region.
+  "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
+} { nonPortableClusterSize = true }
 ```

 Rationale: using operation/block arguments gives analyses a clear way of
@@ -843,7 +861,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       CArg<"TypeRange", "{}">:$privateAttributions,
       CArg<"Value", "nullptr">:$clusterSizeX,
       CArg<"Value", "nullptr">:$clusterSizeY,
-      CArg<"Value", "nullptr">:$clusterSizeZ)>
+      CArg<"Value", "nullptr">:$clusterSizeZ,
+      CArg<"BoolAttr", "{}">:$nonPortableClusterSize)>
   ];

  let extraClassDeclaration = [{
@@ -92,7 +92,8 @@ void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,

 void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                            SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(1, kMaxClusterDim));
+  uint64_t max = APInt::getMaxValue(64).getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }

Review comment: let's add a comment here explaining why that is.

 void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
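A plausible answer to the reviewer's question, sketched in code. The rationale is inferred from this PR (`kMaxClusterDim` is the portable per-cluster block limit the old bound assumed), and the constant name below is illustrative:

```cpp
#include <cstdint>
#include <limits>

// Why kMaxClusterDim no longer works as the upper bound: once a launch may
// set nonPortableClusterSize = true, the number of blocks per cluster is not
// capped by the portable limit, so the range analysis has to assume the full
// unsigned 64-bit index range. Note this is exactly what
// APInt::getMaxValue(64).getZExtValue() evaluates to:
constexpr uint64_t kClusterDimBlocksUpperBound =
    std::numeric_limits<uint64_t>::max();
static_assert(kClusterDimBlocksUpperBound == UINT64_MAX,
              "same bound, simpler spelling");
```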
@@ -335,11 +335,13 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {

 #if (CUDA_VERSION >= 12000)

-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
-    CUfunction function, intptr_t clusterX, intptr_t clusterY,
-    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
-    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
-    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuLaunchClusterKernel(CUfunction function, intptr_t clusterX,
+                        intptr_t clusterY, intptr_t clusterZ, intptr_t gridX,
+                        intptr_t gridY, intptr_t gridZ, intptr_t blockX,
+                        intptr_t blockY, intptr_t blockZ, int32_t smem,
+                        bool nonPortableClusterSize, CUstream stream,
+                        void **params, void **extra, size_t /*paramsCount*/) {

Review comment: I understand that it's low cost passing a bool here. But if we want to support other arguments, we cannot pass all of them as bool. We need an alternative solution.

   ScopedContext scopedContext;
   if (smem > 0) {
     // Avoid checking driver as it's more expensive than if statement
@@ -358,6 +360,11 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
     CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
         function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
   }
+
+  if (nonPortableClusterSize)
+    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
+        function, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
+
   CUlaunchConfig config;
   config.gridDimX = gridX;
   config.gridDimY = gridY;
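The hunk is truncated just after the grid dimensions. For context, a CUDA 12 cluster launch typically finishes by attaching the cluster shape as a launch attribute and calling `cuLaunchKernelEx`; the sketch below shows the standard driver-API pattern, not necessarily this file's verbatim code:

```cpp
// Continuation sketch (assumes the wrapper's locals from the hunk above).
config.gridDimZ = gridZ;
config.blockDimX = blockX;
config.blockDimY = blockY;
config.blockDimZ = blockZ;
config.sharedMemBytes = smem;
config.hStream = stream;

// In CUDA 12, the cluster shape is passed as a launch attribute.
CUlaunchAttribute clusterAttr;
clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
clusterAttr.value.clusterDim.x = clusterX;
clusterAttr.value.clusterDim.y = clusterY;
clusterAttr.value.clusterDim.z = clusterZ;
config.attrs = &clusterAttr;
config.numAttrs = 1;

CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
```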
@@ -0,0 +1,124 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3" \
// RUN: | mlir-cpu-runner \
// RUN:   --shared-libs=%mlir_cuda_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --shared-libs=%mlir_c_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

// CHECK: clusterIdx: (3, 3, 0) in Cluster Dimension: (4, 4, 1) blockIdx: (15, 15, 0)
// CHECK: clusterIdx: (3, 3, 0) in Cluster Dimension: (4, 4, 1) blockIdx: (15, 15, 0)
module attributes {gpu.container_module} {
  gpu.module @gpumodule {
    gpu.func @kernel_cluster() kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 1>} {
      %cidX = gpu.cluster_id x
      %cidY = gpu.cluster_id y
      %cidZ = gpu.cluster_id z
      %cdimX = gpu.cluster_dim_blocks x
      %cdimY = gpu.cluster_dim_blocks y
      %cdimZ = gpu.cluster_dim_blocks z
      %bidX = gpu.block_id x
      %bidY = gpu.block_id y
      %bidZ = gpu.block_id z
      %cidX_i32 = index.casts %cidX : index to i32
      %cidY_i32 = index.casts %cidY : index to i32
      %cidZ_i32 = index.casts %cidZ : index to i32
      %cdimX_i32 = index.casts %cdimX : index to i32
      %cdimY_i32 = index.casts %cdimY : index to i32
      %cdimZ_i32 = index.casts %cdimZ : index to i32
      %bidX_i32 = index.casts %bidX : index to i32
      %bidY_i32 = index.casts %bidY : index to i32
      %bidZ_i32 = index.casts %bidZ : index to i32

      %c_1 = arith.constant -1 : index
      %cBlocksX = gpu.grid_dim x
      %cN_1 = arith.addi %cBlocksX, %c_1 : index
      %cnd1 = arith.cmpi eq, %bidX, %cN_1 : index
      %cnd2 = arith.cmpi eq, %bidY, %cN_1 : index
      scf.if %cnd1 {
        scf.if %cnd2 {
          gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n"
              %cidX_i32, %cidY_i32, %cidZ_i32,
              %cdimX_i32, %cdimY_i32, %cdimZ_i32,
              %bidX_i32, %bidY_i32, %bidZ_i32
              : i32, i32, i32, i32, i32, i32, i32, i32, i32
        }
      }
      gpu.return
    }
  }
  func.func @main() {
    %cDimX = arith.constant 4 : index
    %cDimY = arith.constant 4 : index
    %cDimZ = arith.constant 1 : index
    %gDimX = arith.constant 16 : index
    %gDimY = arith.constant 16 : index
    %gDimZ = arith.constant 1 : index
    %bDimX = arith.constant 1 : index
    %bDimY = arith.constant 1 : index
    %bDimZ = arith.constant 1 : index

Review comment: Can we actually test with cluster more than 8?

    gpu.launch clusters(%cx, %cy, %cz) in (%cluster_x = %cDimX, %cluster_y = %cDimY,
                                           %cluster_z = %cDimZ)
               blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
                                         %grid_z = %gDimZ)
               threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
                                          %block_z = %bDimZ) {
      %cidX = gpu.cluster_id x
      %cidY = gpu.cluster_id y
      %cidZ = gpu.cluster_id z
      %cdimX = gpu.cluster_dim_blocks x
      %cdimY = gpu.cluster_dim_blocks y
      %cdimZ = gpu.cluster_dim_blocks z
      %bidX = gpu.block_id x
      %bidY = gpu.block_id y
      %bidZ = gpu.block_id z
      %cidX_i32 = index.casts %cidX : index to i32
      %cidY_i32 = index.casts %cidY : index to i32
      %cidZ_i32 = index.casts %cidZ : index to i32
      %cdimX_i32 = index.casts %cdimX : index to i32
      %cdimY_i32 = index.casts %cdimY : index to i32
      %cdimZ_i32 = index.casts %cdimZ : index to i32
      %bidX_i32 = index.casts %bidX : index to i32
      %bidY_i32 = index.casts %bidY : index to i32
      %bidZ_i32 = index.casts %bidZ : index to i32

      %c_1 = arith.constant -1 : index
      %cBlocksX = gpu.grid_dim x
      %cN_1 = arith.addi %cBlocksX, %c_1 : index
      %cnd1 = arith.cmpi eq, %bidX, %cN_1 : index
      %cnd2 = arith.cmpi eq, %bidY, %cN_1 : index
      scf.if %cnd1 {
        scf.if %cnd2 {
          gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n"
              %cidX_i32, %cidY_i32, %cidZ_i32,
              %cdimX_i32, %cdimY_i32, %cdimZ_i32,
              %bidX_i32, %bidY_i32, %bidZ_i32
              : i32, i32, i32, i32, i32, i32, i32, i32, i32
        }
      }

      gpu.terminator
    } { nonPortableClusterSize = true }

    gpu.launch_func @gpumodule::@kernel_cluster clusters in (%cDimX, %cDimY, %cDimZ) blocks in (%gDimX, %gDimY, %gDimZ) threads in (%bDimX, %bDimY, %bDimZ) { nonPortableClusterSize = true }
    return
  }
}
@@ -94,7 +94,7 @@ module attributes {gpu.container_module} {
   // CHECK: [[S3:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
   // CHECK: [[S4:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[S3]], ptr @kernel_module_kernel_kernel_name)
   // CHECK: [[S5:%.*]] = call ptr @mgpuStreamCreate()
-  // CHECK: call void @mgpuLaunchClusterKernel(ptr [[S4]], i64 2, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i32 0, ptr [[S5]], ptr [[S2]], ptr null)
+  // CHECK: call void @mgpuLaunchClusterKernel(ptr [[S4]], i64 2, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i32 0, i1 false, ptr [[S5]], ptr [[S2]], ptr null)

Review comment: need a test that's

   %0 = llvm.mlir.constant(1 : index) : i64
   %1 = llvm.mlir.constant(2 : index) : i64
   gpu.launch_func @kernel_module::@kernel clusters in (%1, %0, %0) blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64