-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[Clang][AMDGPU] Expose buffer load lds as a clang builtin #132048
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-llvm-ir Author: Juan Manuel Martinez Caamaño (jmmartinez) ChangesCK is using either inline assembly or inline LLVM-IR builtins to generate buffer_load_dword lds instructions. This patch exposes this instruction as a Clang builtin. Limitations in this patch: these instructions are not available for gfx11/gfx12. I wasn't able to identify a target feature that we could use to predicate the builtin availability, in the same way as it is done for global_load_lds. Related to SWDEV-519702 and SWDEV-518861 Full diff: https://github.com/llvm/llvm-project/pull/132048.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 44ef404aee72f..4117e8d67330b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -162,6 +162,8 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b64, "V2UiQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
+BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t")
+
//===----------------------------------------------------------------------===//
// Ballot builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 1536a3b8c920a..e17d6b530141d 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13132,6 +13132,6 @@ def err_acc_duplicate_bind
"permitted to refer to the same function">;
// AMDGCN builtins diagnostics
-def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
-def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
+def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
+def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
} // end of sema component.
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index a4d075dfd0768..7fec099374152 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -35,6 +35,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
Builtin::evaluateRequiredTargetFeatures("gfx950-insts", CallerFeatureMap);
switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
constexpr const int SizeIdx = 2;
llvm::APSInt Size;
@@ -54,11 +55,9 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
[[fallthrough]];
}
default:
- Diag(ArgExpr->getExprLoc(),
- diag::err_amdgcn_global_load_lds_size_invalid_value)
+ Diag(ArgExpr->getExprLoc(), diag::err_amdgcn_load_lds_size_invalid_value)
<< ArgExpr->getSourceRange();
- Diag(ArgExpr->getExprLoc(),
- diag::note_amdgcn_global_load_lds_size_valid_value)
+ Diag(ArgExpr->getExprLoc(), diag::note_amdgcn_load_lds_size_valid_value)
<< HasGFX950Insts << ArgExpr->getSourceRange();
return true;
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl
index 3403b69e07e4b..5e3ed9027c17a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl
@@ -170,3 +170,12 @@ v3u32 test_amdgcn_raw_ptr_buffer_load_b96_non_const_soffset(__amdgpu_buffer_rsrc
v4u32 test_amdgcn_raw_ptr_buffer_load_b128_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) {
return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, soffset, /*aux=*/0);
}
+
+// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_lds(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 1, i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3)
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int offset, int soffset) {
+ __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 1, offset, soffset, 2, 3);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl
new file mode 100644
index 0000000000000..b7065ea2b6655
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify -o - %s -DGFX950
+// REQUIRES: amdgpu-registered-target
+
+void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) {
+ __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, x, offset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}}
+ __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}}
+ __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}}
+
+#ifdef GFX950
+ __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} expected-note{{size must be 1, 2, 4, 12 or 16}}
+#else
+ __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} expected-note{{size must be 1, 2, or 4}}
+#endif
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 307e4b8a01e5c..efec2ef8325c9 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1863,7 +1863,9 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
-class AMDGPURawPtrBufferLoadLDS : Intrinsic <
+class AMDGPURawPtrBufferLoadLDS :
+ ClangBuiltin<"__builtin_amdgcn_raw_ptr_buffer_load_lds">,
+ Intrinsic <
[],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
LLVMQualPointerType<3>, // LDS base offset
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can't speak to the Clang side of this change, but I don't see any issues here
c89ffa4
to
a9ff844
Compare
I've also seen that gfx11 seem to have some kind of BUFFER_LOAD_LDS_(SIZE) instruction (different from the BUFFER_LOAD_(SIZE)_LDS instructions associated with this builtin). However, I'm not sure they actually exists: we do not have any test where we generate them and the gfx11 shader manual explicitly says "Remove "LOAD_LDS" instructions: buffer_load_lds, scratch_load_lds, global_load_lds" |
Sounds like we need to get information from HW folks. :-) |
a9ff844
to
cb63d67
Compare
cb63d67
to
2b8923f
Compare
I've asked and there is no buffer_load_lds instruction on gfx11 nor gfx12. I guess we can remove them in a separate patch. |
// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } | ||
// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } | ||
// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-gfx9-insts" } | ||
// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+lds-buffer-load-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ReadOnlyFeatures seems to not have helped with this...
2b8923f
to
6aa2a17
Compare
Squashed and split in 2: #133055 |
This are used to restrict the availability of buffer_load_lds intrinsics to targets that actually have this instructions.
…lds on gfx9 and gfx10
CK is using either inline assembly or inline llvm-ir builtins to generate buffer_load_dword lds instructions. This patch exposes this instruction as a builtin. Related to SWDEV-519702 and SWDEV-518861
6aa2a17
to
b625c81
Compare
Hold on, isn't there an |
No there is not. Neither gfx11 nor 12 have LDS DMA. |
CK is using either inline assembly or inline LLVM-IR builtins to generate buffer_load_dword lds instructions. This patch exposes this instruction as a Clang builtin available on gfx9 and gfx10. Related to SWDEV-519702 and SWDEV-518861
CK is using either inline assembly or inline LLVM-IR builtins to generate buffer_load_dword lds instructions.
This patch exposes this instruction as a Clang builtin.
Limitations in this patch: these instructions are not available for gfx11/gfx12. I wasn't able to identify a target feature that we could use to predicate the builtin availability, in the same way as it is done for global_load_lds.
Related to SWDEV-519702 and SWDEV-518861