Skip to content

[AMDGPU] Enable atomic optimizer for 64 bit divergent values #96473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18467,6 +18467,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_permlane16:
case AMDGPU::BI__builtin_amdgcn_permlanex16:
return emitBuiltinWithOneOverloadedType<6>(
*this, E,
BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
? Intrinsic::amdgcn_permlane16
: Intrinsic::amdgcn_permlanex16);
case AMDGPU::BI__builtin_amdgcn_permlane64:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_permlane64);
case AMDGPU::BI__builtin_amdgcn_readlane:
return emitBuiltinWithOneOverloadedType<2>(*this, E,
Intrinsic::amdgcn_readlane);
case AMDGPU::BI__builtin_amdgcn_readfirstlane:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_readfirstlane);
case AMDGPU::BI__builtin_amdgcn_div_fixup:
case AMDGPU::BI__builtin_amdgcn_div_fixupf:
case AMDGPU::BI__builtin_amdgcn_div_fixuph:
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ typedef unsigned int uint;
typedef unsigned long ulong;

// CHECK-LABEL: @test_permlane16(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
}

// CHECK-LABEL: @test_permlanex16(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
}
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
}

// CHECK-LABEL: @test_permlane64(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64(i32 %a)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
void test_permlane64(global uint* out, uint a) {
*out = __builtin_amdgcn_permlane64(a);
}
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
Expand Up @@ -308,14 +308,14 @@ void test_ds_bpermute(global int* out, int a, int b)
}

// CHECK-LABEL: @test_readfirstlane
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane(i32 %a)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
void test_readfirstlane(global int* out, int a)
{
*out = __builtin_amdgcn_readfirstlane(a);
}

// CHECK-LABEL: @test_readlane
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
void test_readlane(global int* out, int a, int b)
{
*out = __builtin_amdgcn_readlane(a, b);
Expand Down
20 changes: 20 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,26 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
reduction will be performed using default iterative strategy.
Intrinsic is currently only implemented for i32.

llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs arbitrary gather-style
operation within a row (16 contiguous lanes) of the second input operand.
The third and fourth inputs must be scalar values. these are combined into
a single 64-bit value representing lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>,
<2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the 32-bit vectors.

llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs arbitrary gather-style
operation across two rows of the second input operand (each row is 16 contiguous
lanes). The third and fourth inputs must be scalar values. these are combined
into a single 64-bit value representing lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
<2 x bfloat>, i64, double, pointers, multiples of the 32-bit vectors.

llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across
lanes of the input operand where the high half and low half of a wave64 are swapped.
Performs no operation in wave32 mode. Currently implemented for i16, i32, float, half,
bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the
32-bit vectors.

llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
support such instructions. This performs unsigned dot product
with two v2i16 operands, summed with the third i32 operand. The
Expand Down
30 changes: 13 additions & 17 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2042,26 +2042,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

def int_amdgcn_readfirstlane :
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
ClangBuiltin<"__builtin_amdgcn_readlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
ClangBuiltin<"__builtin_amdgcn_writelane">,
Intrinsic<[llvm_i32_ty], [
llvm_i32_ty, // uniform value to write: returned by the selected lane
llvm_i32_ty, // uniform lane select
llvm_i32_ty // returned by all lanes other than the selected one
Intrinsic<[llvm_any_ty], [
LLVMMatchType<0>, // uniform value to write: returned by the selected lane
llvm_i32_ty, // uniform lane select
LLVMMatchType<0> // returned by all lanes other than the selected one
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
Expand Down Expand Up @@ -2358,16 +2355,16 @@ def int_amdgcn_pops_exiting_wave_id :
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
def int_amdgcn_permlane16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
def int_amdgcn_permlanex16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

Expand Down Expand Up @@ -2410,8 +2407,7 @@ def int_amdgcn_image_bvh_intersect_ray :

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
ClangBuiltin<"__builtin_amdgcn_permlane64">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ds_add_gs_reg_rtn :
Expand Down
Loading