diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 12716bdde1a9f..88b694862d376 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1537,17 +1537,13 @@ multiclass ScratchFLATLoadPats_D16_t16; -def : FlatLoadPat ; def : FlatLoadPat ; -def : FlatLoadPat ; def : FlatLoadPat ; -def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; -def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; @@ -1560,8 +1556,14 @@ let True16Predicate = p in { def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; + def : FlatLoadPat ; + def : FlatLoadPat ; + def : FlatLoadPat ; + def : FlatLoadPat ; def : FlatStorePat ; def : FlatStorePat ; + def : FlatStorePat ; + def : FlatStorePat ; } let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { @@ -1569,8 +1571,14 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi def : FlatLoadPat_D16_t16; def : FlatLoadPat_D16_t16; def : FlatLoadPat_D16_t16; + def : FlatLoadPat_D16_t16; + def : FlatLoadPat_D16_t16; + def : FlatLoadPat_D16_t16; + def : FlatLoadPat_D16_t16; def : FlatStorePat ; def : FlatStorePat ; + def : FlatStorePat ; + def : FlatStorePat ; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts def : FlatLoadPat ; @@ -1599,9 +1607,7 @@ def : FlatStorePat ; def : FlatStorePat ; def : FlatStorePat ; def : FlatStorePat ; -def : FlatStorePat ; def : FlatStorePat ; -def : FlatStorePat ; foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1680,9 +1686,7 @@ let OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; -defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; -defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -1702,6 +1706,8 @@ defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; } let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { @@ -1712,8 +1718,12 @@ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>; +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>; +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>; defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>; defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>; +defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>; +defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>; } // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts foreach vt = Reg32Types.types in { @@ -1747,6 +1757,8 @@ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in { defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; } let OtherPredicates = [HasD16LoadStore] in { @@ -1772,9 +1784,7 @@ defm : GlobalFLATLoadPats_D16 } defm : GlobalFLATStorePats ; -defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; -defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index e674b57aae3ef..3304dbf3eaa3d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -2,6 +2,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_offset: @@ -46,6 +48,21 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw add ptr %gep, i32 %in syncscope("agent") seq_cst @@ -95,6 +112,21 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_max_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 offset:4092 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 1023 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst @@ -146,6 +178,23 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_max_offset_p1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 1024 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst @@ -204,6 +253,22 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_add_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst @@ -266,6 +331,26 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -337,6 +422,27 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_add_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -384,6 +490,21 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst ret void @@ -437,6 +558,22 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_add_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 @@ -494,6 +631,26 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -560,6 +717,27 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_add_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -610,6 +788,21 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_and_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst @@ -668,6 +861,22 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_and_b32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst @@ -730,6 +939,26 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_and_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -801,6 +1030,27 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_and_b32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -848,6 +1098,21 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_and_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst ret void @@ -901,6 +1166,22 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_and_b32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 @@ -958,6 +1239,26 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_and_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -1024,6 +1325,27 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_and_b32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -1074,6 +1396,21 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_sub_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst @@ -1132,6 +1469,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_sub_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst @@ -1194,6 +1547,26 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_sub_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -1265,6 +1638,27 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_sub_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -1312,6 +1706,21 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_sub_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst ret void @@ -1365,6 +1774,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_sub_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 @@ -1422,6 +1847,26 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_sub_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -1488,6 +1933,27 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_sub_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -1535,6 +2001,20 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_max_i32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst @@ -1593,6 +2073,21 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_max_i32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst @@ -1652,6 +2147,25 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_max_i32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -1723,6 +2237,26 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_max_i32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -1767,6 +2301,20 @@ define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_max_i32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst ret void @@ -1820,6 +2368,21 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_max_i32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst store i32 %val, ptr %out2 @@ -1874,6 +2437,25 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_max_i32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst @@ -1940,6 +2522,26 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_max_i32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst @@ -1987,6 +2589,20 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_max_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst @@ -2045,6 +2661,21 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_max_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst @@ -2104,6 +2735,25 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_max_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -2175,6 +2825,26 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_max_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -2219,6 +2889,20 @@ define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_max_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst ret void @@ -2272,6 +2956,21 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_max_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst store i32 %val, ptr %out2 @@ -2326,6 +3025,25 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_max_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst @@ -2392,6 +3110,26 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_max_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst @@ -2439,6 +3177,20 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_min_i32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst @@ -2497,6 +3249,21 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_min_i32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst @@ -2556,6 +3323,25 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_min_i32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -2627,6 +3413,26 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_min_i32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -2671,6 +3477,20 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_min_i32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst ret void @@ -2724,6 +3544,21 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_min_i32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst store i32 %val, ptr %out2 @@ -2778,6 +3613,25 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_min_i32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst @@ -2844,6 +3698,26 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_min_i32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst @@ -2891,6 +3765,20 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_min_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst @@ -2949,6 +3837,21 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_min_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst @@ -3008,6 +3911,25 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_min_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -3079,6 +4001,26 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_min_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -3123,6 +4065,20 @@ define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_min_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst ret void @@ -3176,6 +4132,21 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_min_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst store i32 %val, ptr %out2 @@ -3230,6 +4201,25 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_min_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst @@ -3296,6 +4286,26 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_min_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst @@ -3346,6 +4356,21 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_or_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst @@ -3404,6 +4429,22 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_or_b32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst @@ -3466,6 +4507,26 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_or_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -3537,6 +4598,27 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_or_b32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -3584,6 +4666,21 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_or_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst ret void @@ -3637,6 +4734,22 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_or_b32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 @@ -3694,6 +4807,26 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_or_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -3760,6 +4893,27 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_or_b32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -3810,6 +4964,21 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst @@ -3859,6 +5028,21 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_f32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr float, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst @@ -3917,6 +5101,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst @@ -3979,6 +5179,26 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -4050,6 +5270,27 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -4097,6 +5338,21 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst ret void @@ -4150,6 +5406,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 @@ -4207,6 +5479,26 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -4273,6 +5565,27 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -4325,6 +5638,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4386,6 +5712,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4455,6 +5797,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x3c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v[2:3], v[0:1] offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -4532,6 +5895,28 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x44 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[2:3], v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -4580,6 +5965,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void @@ -4636,6 +6034,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 @@ -4700,6 +6114,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x3c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v[2:3], v[0:1] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4772,6 +6207,28 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x44 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[2:3], v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4823,6 +6280,21 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_xor_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst @@ -4881,6 +6353,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_xor_b32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst @@ -4943,6 +6431,26 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_xor_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -5014,6 +6522,27 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_xor_b32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -5061,6 +6590,21 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_xor_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst ret void @@ -5114,6 +6658,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_xor_b32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 @@ -5171,6 +6731,26 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_xor_b32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -5237,6 +6817,27 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_xor_b32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -5290,6 +6891,19 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %in, i32 4 %val = load atomic i32, ptr %gep seq_cst, align 4 @@ -5339,6 +6953,19 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = load atomic i32, ptr %in seq_cst, align 4 store i32 %val, ptr %out @@ -5403,6 +7030,25 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %in, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -5465,6 +7111,25 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %in, i64 %index %val = load atomic i32, ptr %ptr seq_cst, align 4 @@ -5509,6 +7174,17 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 store atomic i32 %in, ptr %gep seq_cst, align 4 @@ -5548,6 +7224,17 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: store atomic i32 %in, ptr %out seq_cst, align 4 ret void @@ -5599,6 +7286,21 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -5648,6 +7350,21 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index store atomic i32 %in, ptr %ptr seq_cst, align 4 @@ -5700,6 +7417,19 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr float, ptr %in, i32 4 %val = load atomic float, ptr %gep seq_cst, align 4 @@ -5749,6 +7479,19 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = load atomic float, ptr %in seq_cst, align 4 store float %val, ptr %out @@ -5813,6 +7556,25 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr %in, i64 %index %gep = getelementptr float, ptr %ptr, i32 4 @@ -5875,6 +7637,25 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr %in, i64 %index %val = load atomic float, ptr %ptr seq_cst, align 4 @@ -5919,6 +7700,17 @@ define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr float, ptr %out, i32 4 store atomic float %in, ptr %gep seq_cst, align 4 @@ -5958,6 +7750,17 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: store atomic float %in, ptr %out seq_cst, align 4 ret void @@ -6009,6 +7812,21 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr %out, i64 %index %gep = getelementptr float, ptr %ptr, i32 4 @@ -6058,6 +7876,21 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr %out, i64 %index store atomic float %in, ptr %ptr seq_cst, align 4 @@ -6110,6 +7943,33 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i8_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i8_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr %in, i64 16 %val = load atomic i8, ptr %gep seq_cst, align 1 @@ -6159,6 +8019,33 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i8: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i8: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: %val = load atomic i8, ptr %in seq_cst, align 1 store i8 %val, ptr %out @@ -6220,6 +8107,43 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i8_addr64_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i8_addr64_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: %ptr = getelementptr i8, ptr %in, i64 %index %gep = getelementptr i8, ptr %ptr, i64 16 @@ -6265,6 +8189,28 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_i8_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i8_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 offset:16 +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr %out, i64 16 store atomic i8 %in, ptr %gep seq_cst, align 1 @@ -6304,6 +8250,28 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_i8: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i8: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: store atomic i8 %in, ptr %out seq_cst, align 1 ret void @@ -6352,6 +8320,33 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_i8_addr64_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i8_addr64_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 offset:16 +; GFX11-FAKE16-NEXT: s_endpgm entry: %ptr = getelementptr i8, ptr %out, i64 %index %gep = getelementptr i8, ptr %ptr, i64 16 @@ -6405,6 +8400,33 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i16_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i16_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr %in, i64 8 %val = load atomic i16, ptr %gep seq_cst, align 2 @@ -6454,6 +8476,33 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: %val = load atomic i16, ptr %in seq_cst, align 2 store i16 %val, ptr %out @@ -6518,6 +8567,45 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i16_addr64_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i16_addr64_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: %ptr = getelementptr i16, ptr %in, i64 %index %gep = getelementptr i16, ptr %ptr, i64 8 @@ -6563,6 +8651,28 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_i16_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i16_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr %out, i64 8 store atomic i16 %in, ptr %gep seq_cst, align 2 @@ -6602,6 +8712,28 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: store atomic i16 %in, ptr %out seq_cst, align 2 ret void @@ -6653,6 +8785,36 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_i16_addr64_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i16_addr64_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 +; GFX11-FAKE16-NEXT: s_endpgm entry: %ptr = getelementptr i16, ptr %out, i64 %index %gep = getelementptr i16, ptr %ptr, i64 8 @@ -6697,6 +8859,28 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_f16_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_f16_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr half, ptr %out, i64 8 store atomic half %in, ptr %gep seq_cst, align 2 @@ -6736,6 +8920,28 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm entry: store atomic half %in, ptr %out seq_cst, align 2 ret void @@ -6774,6 +8980,28 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_bf16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_bf16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm %gep = getelementptr bfloat, ptr %out, i64 8 store atomic bfloat %in, ptr %out seq_cst, align 2 ret void @@ -6812,6 +9040,28 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_store_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm store atomic bfloat %in, ptr %out seq_cst, align 2 ret void } @@ -6859,6 +9109,21 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst @@ -6908,6 +9173,21 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_max_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:4092 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 1023 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst @@ -6959,6 +9239,23 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_max_offset_p1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 1024 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst @@ -7017,6 +9314,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst @@ -7079,6 +9392,26 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_incr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -7150,6 +9483,27 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_ret_incr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -7197,6 +9551,21 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst ret void @@ -7250,6 +9619,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 @@ -7307,6 +9692,26 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_incr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -7373,6 +9778,27 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_ret_incr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -7423,6 +9849,21 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst @@ -7472,6 +9913,21 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_max_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:4092 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 1023 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst @@ -7523,6 +9979,23 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_max_offset_p1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 1024 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst @@ -7581,6 +10054,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst @@ -7643,6 +10132,26 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_decr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -7714,6 +10223,27 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_ret_decr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 @@ -7761,6 +10291,21 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst ret void @@ -7814,6 +10359,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 @@ -7871,6 +10432,26 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_decr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -7937,6 +10518,27 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_ret_decr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst @@ -7990,6 +10592,33 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_f16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_f16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm %gep = getelementptr half, ptr %in, i64 8 %val = load atomic half, ptr %gep seq_cst, align 2 store half %val, ptr %out @@ -8038,6 +10667,33 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm %val = load atomic half, ptr %in seq_cst, align 2 store half %val, ptr %out ret void @@ -8089,6 +10745,33 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_bf16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_bf16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm %gep = getelementptr bfloat, ptr %in, i64 8 %val = load atomic bfloat, ptr %gep seq_cst, align 2 store bfloat %val, ptr %out @@ -8137,6 +10820,33 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-FAKE16-NEXT: s_endpgm %val = load atomic bfloat, ptr %in seq_cst, align 2 store bfloat %val, ptr %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 60ef493b7cf6d..0512b9bc2b54a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -7589,15 +7589,26 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: atomic_store_i8_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] offset:16 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: atomic_store_i8_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1] offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i8_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1] offset:16 +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %out, i64 16 store atomic i8 %in, ptr addrspace(1) %gep seq_cst, align 1 @@ -7637,15 +7648,26 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: atomic_store_i8: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: atomic_store_i8: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i8: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm entry: store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1 ret void @@ -7700,7 +7722,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7778,7 +7800,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7838,15 +7860,26 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: atomic_store_i16_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: atomic_store_i16_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i16_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %out, i64 8 store atomic i16 %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -7886,15 +7919,26 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: atomic_store_i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: atomic_store_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm entry: store atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -7935,15 +7979,26 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: atomic_store_f16_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: atomic_store_f16_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_f16_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr half, ptr addrspace(1) %out, i64 8 store atomic half %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -7983,15 +8038,26 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: atomic_store_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: atomic_store_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm entry: store atomic half %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -8032,15 +8098,26 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: atomic_store_bf16_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: atomic_store_bf16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_bf16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 +; GFX11-FAKE16-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8 store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2 ret void @@ -8079,15 +8156,26 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: atomic_store_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: atomic_store_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_store_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2 ret void } @@ -9099,7 +9187,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9176,7 +9264,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9249,7 +9337,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9326,7 +9414,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv