[AMDGPU] Extend llvm.amdgcn.update.dpp intrinsic to support f64 #91190
Conversation
Added new tests for v2i32, v2f32 and ptr
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Vikram Hegde (vikramRH)

Changes: Follow-up patch to #89217, before we make changes to the atomic optimizer.

Patch is 22.12 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/91190.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca8d96f29c0fc..9923880ed49dd4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3338,13 +3338,15 @@ def : GCNPat <
(as_i1timm $bound_ctrl))
>;
+foreach vt = Reg64Types.types in {
def : GCNPat <
- (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
+ (vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
(V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl),
(as_i32timm $row_mask), (as_i32timm $bank_mask),
(as_i1timm $bound_ctrl))
>;
+}
//===----------------------------------------------------------------------===//
// Fract Patterns
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 01ed565bb756dd..caac7126068ef3 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,6 +586,7 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
let HasVGPR = 1 in {
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index f7adfe47b64f25..b937d07f09a20e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -43,8 +43,8 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
store i32 %tmp0, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
-; GFX8-LABEL: update_dpp64_test:
+define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
+; GFX8-LABEL: update_dppi64_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -62,7 +62,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
-; GFX10-LABEL: update_dpp64_test:
+; GFX10-LABEL: update_dppi64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
@@ -76,7 +76,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: update_dpp64_test:
+; GFX11-LABEL: update_dppi64_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
@@ -98,6 +98,226 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
ret void
}
+define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
+; GFX8-LABEL: update_dppf64_test:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: update_dppf64_test:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: update_dppf64_test:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
+ %load = load double, ptr addrspace(1) %gep
+ %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #1
+ store double %tmp0, ptr addrspace(1) %gep
+ ret void
+}
+
+define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
+; GFX8-LABEL: update_dppv2i32_test:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: update_dppv2i32_test:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: update_dppv2i32_test:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
+ %load = load <2 x i32>, ptr addrspace(1) %gep
+ %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #1
+ store <2 x i32> %tmp0, ptr addrspace(1) %gep
+ ret void
+}
+
+define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
+; GFX8-LABEL: update_dppv2f32_test:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: update_dppv2f32_test:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: update_dppv2f32_test:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
+ %load = load <2 x float>, ptr addrspace(1) %gep
+ %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #1
+ store <2 x float> %tmp0, ptr addrspace(1) %gep
+ ret void
+}
+
+define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
+; GFX8-LABEL: update_dpp_p0_test:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: update_dpp_p0_test:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: update_dpp_p0_test:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
+ %load = load ptr, ptr addrspace(1) %gep
+ %tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #1
+ store ptr %tmp0, ptr addrspace(1) %gep
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index e43daf46e1e060..df10625ebfe36a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -55,11 +55,11 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}update_dpp64_test:
+; GCN-LABEL: {{^}}update_dppi64_test:
; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
+define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
%load = load i64, ptr addrspace(1) %gep
@@ -68,7 +68,59 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
ret void
}
-; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
+; GCN-LABEL: {{^}}update_dppf64_test:
+; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
+ %load = load double, ptr addrspace(1) %gep
+ %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #0
+ store double %tmp0, ptr addrspace(1) %gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}update_dppv2i32_test:
+; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
+ %load = load <2 x i32>, ptr addrspace(1) %gep
+ %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #0
+ store <2 x i32> %tmp0, ptr addrspace(1) %gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}update_dppv2f32_test:
+; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
+ %load = load <2 x float>, ptr addrspace(1) %gep
+ %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #0
+ store <2 x float> %tmp0, ptr addrspace(1) %gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}update_dpp_p0_test:
+; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
+ %load = load ptr, ptr addrspace(1) %gep
+ %tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #0
+ store ptr %tmp0, ptr addrspace(1) %gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}update_dppi64_imm_old_test:
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
@@ -79,7 +131,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) {
+define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
%load = load i64, ptr addrspace(1) %gep
@@ -88,7 +140,27 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
ret void
}
-; GCN-LABEL: {{^}}update_dpp64_imm_src_test:
+; GCN-LABEL: {{^}}update_dppf64_imm_old_test:
+; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x6b8564a
+; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1
+; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1
+; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x6b8564a
+; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x405edce1
+; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, double %in2) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
+ %load = load double, ptr addrspace(1) %gep
+ %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double 123.4512345123450, double %load, i32 1, i32 1, i32 1, i1 false) #0
+ store double %tmp0, ptr addrspac...
[truncated]
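
For reference, a sketch of the 64-bit overloads the new tests exercise, with declarations inferred from the calls in the diff above (the i64 form was already supported; this patch enables selection for the rest):

declare double @llvm.amdgcn.update.dpp.f64(double, double, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
declare <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32>, <2 x i32>, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
declare <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float>, <2 x float>, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
declare ptr @llvm.amdgcn.update.dpp.p0(ptr, ptr, i32 immarg, i32 immarg, i32 immarg, i1 immarg)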
store ptr %tmp0, ptr addrspace(1) %gep
ret void
}
Should have some 32-bit pointer tests too, don't think we have those
Added new tests
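
(For illustration, a hypothetical shape such a 32-bit-pointer test could take; p3 LDS pointers are 32 bits wide, so they select through the existing 32-bit pattern. The function name and address space here are assumptions, not the exact tests added.)

define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(1) %arg, ptr addrspace(3) %in1) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(1) %arg, i32 %id
  %load = load ptr addrspace(3), ptr addrspace(1) %gep
  ; a p3 pointer mangles as .p3 and is a 32-bit value, so no 64-bit pattern is involved
  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3) %in1, ptr addrspace(3) %load, i32 1, i32 1, i32 1, i1 false)
  store ptr addrspace(3) %tmp0, ptr addrspace(1) %gep
  ret void
}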
@@ -586,6 +586,7 @@ class RegisterTypes<list<ValueType> reg_types> {

def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
We'll need to do something about other address spaces for globalisel, eventually
As a follow-up, can you make AMDGPUInstCombineIntrinsic fold update.dpp.i64 (bitcast (x to i64)) to the differently typed intrinsic?
Will do. Should this be a separate patch?
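
For context, a sketch of the fold being requested, with hypothetical value names; this is not an implemented transform:

; before: an i64 update.dpp wrapped in bitcasts
%o = bitcast double %old to i64
%s = bitcast double %src to i64
%d = call i64 @llvm.amdgcn.update.dpp.i64(i64 %o, i64 %s, i32 1, i32 1, i32 1, i1 false)
%r = bitcast i64 %d to double
; after: rewritten to the directly typed overload
%r = call double @llvm.amdgcn.update.dpp.f64(double %old, double %src, i32 1, i32 1, i32 1, i1 false)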
LGTM, though I wish there were a better way to implement it than duplicating the selection pattern for every 64-bit type. Is there any reason the sdag and globalisel tests can't be combined into a single file with two RUN lines?
Yes.
I've had an open issue for ages to add a type matcher that only cares about the size, but nobody ever gets anywhere with it.
Not really, I just extended the existing setup. If it makes more sense, I could combine the RUN lines and autogenerate the checks in the current sdag test file.
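
(For illustration, a combined file might look like this; a sketch assuming gfx11 and the usual check prefixes, not the actual RUN lines:)

; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11-GISEL %s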
…ring for generic types (llvm#92725) These are incremental changes over llvm#89217, with the core logic being the same. This patch, along with llvm#89217 and llvm#91190, should get us ready to enable 64-bit optimizations in the atomic optimizer.
…#91190) Follow-up patch to llvm#89217, before we make changes to the atomic optimizer.
@arsenm, @jayfoad, do you see any concerns with the changes here?