diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index b8109db821bcc..86af897943dae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/Local.h" @@ -75,6 +76,7 @@ class LiveRegOptimizer { Module &Mod; const DataLayout &DL; const GCNSubtarget &ST; + /// The scalar type to convert to Type *const ConvertToScalar; /// The set of visited Instructions @@ -125,6 +127,131 @@ class LiveRegOptimizer { return LK.first != TargetLoweringBase::TypeLegal; } + /// Check if intrinsic natively operates on 8-bit or 16-bit + bool isNativeIntrinsic(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::amdgcn_dot4_f32_fp8_bf8: + case Intrinsic::amdgcn_dot4_f32_bf8_fp8: + case Intrinsic::amdgcn_dot4_f32_fp8_fp8: + case Intrinsic::amdgcn_dot4_f32_bf8_bf8: + case Intrinsic::amdgcn_mfma_i32_4x4x4i8: + case Intrinsic::amdgcn_mfma_i32_16x16x4i8: + case Intrinsic::amdgcn_mfma_i32_32x32x4i8: + case Intrinsic::amdgcn_mfma_i32_16x16x16i8: + case Intrinsic::amdgcn_mfma_i32_32x32x8i8: + case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: + case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: + case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: + case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: + case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: + case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: + case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: + case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: + case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: + case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8: + case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8: + case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8: + case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: + case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: + case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: + case 
Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
+    case Intrinsic::amdgcn_raw_buffer_store_format:
+    case Intrinsic::amdgcn_raw_buffer_store:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store:
+    case Intrinsic::amdgcn_struct_buffer_store_format:
+    case Intrinsic::amdgcn_struct_buffer_store:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store:
+    case Intrinsic::amdgcn_raw_tbuffer_store:
+    case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
+    case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
+    case Intrinsic::amdgcn_struct_tbuffer_store:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  bool isOpLegal(Instruction *I) {
+    if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
+      Intrinsic::ID ID = Intr->getIntrinsicID();
+      if (isNativeIntrinsic(ID))
+        return true;
+    }
+    // Stores
+    if (isa<StoreInst>(I))
+      return true;
+    return false;
+  }
+
+  bool isCoercionProfitable(Instruction *II) {
+    SmallPtrSet<Instruction *, 4> CVisited;
+    SmallVector<Instruction *, 4> UserList;
+
+    // Check users for profitable conditions (across-block users which can
+    // natively handle the illegal vector).
+    for (User *V : II->users())
+      if (auto *UseInst = dyn_cast<Instruction>(V))
+        UserList.push_back(UseInst);
+
+    auto IsLookThru = [](Instruction *II) {
+      if (const auto *Intr = dyn_cast<IntrinsicInst>(II))
+        return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;
+      return isa<PHINode>(II) || isa<ShuffleVectorInst>(II) ||
+             isa<InsertElementInst>(II) || isa<InsertValueInst>(II) ||
+             isa<ExtractElementInst>(II);
+    };
+
+    while (!UserList.empty()) {
+      auto CII = UserList.pop_back_val();
+      if (!CVisited.insert(CII).second)
+        continue;
+
+      if (CII->getParent() == II->getParent() && !IsLookThru(II))
+        continue;
+
+      if (isOpLegal(CII))
+        return true;
+
+      if (IsLookThru(CII))
+        for (User *V : CII->users())
+          if (auto *UseInst = dyn_cast<Instruction>(V))
+            UserList.push_back(UseInst);
+    }
+    return false;
+  }
+
   LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
       : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
         ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
@@ -259,6 +386,9 @@ bool LiveRegOptimizer::optimizeLiveType(
     if (!shouldReplace(II->getType()))
       continue;
 
+    if (!isCoercionProfitable(II))
+      continue;
+
     if (PHINode *Phi = dyn_cast<PHINode>(II)) {
       PhiNodes.insert(Phi);
       // Collect all the incoming values of problematic PHI nodes.
@@ -478,7 +608,6 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
 
 PreservedAnalyses
 AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
   UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
 
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index efbbe2b27f10f..6dabd8c0b83ea 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -13,9 +13,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -30,25 +30,27 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v6, v2
-; SI-NEXT:    v_or_b32_e32 v2, v4, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v6, v2
+; SI-NEXT:    v_or_b32_e32 v3, v5, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB0_3
 ; SI-NEXT:    s_branch .LBB0_4
 ; SI-NEXT:  .LBB0_2:
-; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB0_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -61,29 +63,29 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v4, v0
-; SI-NEXT:    v_or_b32_e32 v2, v2, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v2, v0
+; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:  .LBB0_4: ; %exit
-; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
-; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; SI-NEXT:    v_bfrev_b32_e32 v4, 1
-; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
+; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT:    v_bfe_i32 v1, v4, 0,
16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, 0x8000 +; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v6, 1 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 +; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16: @@ -178,23 +180,26 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: s_branch .LBB1_4 ; SI-NEXT: .LBB1_2: -; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB1_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -209,39 +214,39 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v5, v4, v0 -; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v5, v5, v1 ; SI-NEXT: .LBB1_4: ; %exit -; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4 -; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48 +; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 -; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v5, 1 -; SI-NEXT: v_mov_b32_e32 v6, 0xffff -; SI-NEXT: v_mov_b32_e32 v7, 0x8000 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_mov_b32_e32 v4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, 0x8000 +; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v7, 1 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; SI-NEXT: v_or_b32_e32 v0, v1, v8 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16_2: @@ -494,9 +499,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -527,25 +532,27 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v6, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB3_3 ; SI-NEXT: s_branch .LBB3_4 ; SI-NEXT: .LBB3_2: -; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB3_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 
offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -574,29 +581,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v0 -; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB3_4: ; %exit -; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2 -; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v3, 0, 16 -; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v4, 1 -; SI-NEXT: v_mov_b32_e32 v5, 0xffff -; SI-NEXT: v_mov_b32_e32 v6, 0x8000 +; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, 0x8000 +; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v6, 1 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 +; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xi16_extract_4xi16: @@ -703,13 +710,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -727,15 +734,18 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 
offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB4_3 ; SI-NEXT: s_branch .LBB4_4 ; SI-NEXT: .LBB4_2: -; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB4_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -750,11 +760,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -775,29 +785,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v5, v4, v0 -; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB4_4: ; %exit -; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4 -; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48 +; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 -; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v5, 1 -; SI-NEXT: v_mov_b32_e32 v6, 0xffff -; SI-NEXT: v_mov_b32_e32 v7, 0x8000 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; SI-NEXT: v_mov_b32_e32 v4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, 0x8000 +; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v7, 1 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; SI-NEXT: v_or_b32_e32 v0, v1, v8 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1195,21 +1205,21 @@ define amdgpu_gfx <8 x i16> 
@vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1227,39 +1237,46 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v10, v2 -; SI-NEXT: v_or_b32_e32 v4, v8, v3 -; SI-NEXT: v_or_b32_e32 v3, v7, v9 -; SI-NEXT: v_or_b32_e32 v2, v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v11, v2 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: v_or_b32_e32 v2, v10, v13 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB7_3 ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_2: -; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB7_3: ; %T ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], 
s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1277,52 +1294,52 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_or_b32_e32 v5, v8, v0 -; SI-NEXT: v_or_b32_e32 v4, v7, v1 -; SI-NEXT: v_or_b32_e32 v3, v6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v8, v8, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: .LBB7_4: ; %exit -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_movk_i32 s34, 0x3800 -; SI-NEXT: v_mov_b32_e32 v8, 0x3d000000 -; SI-NEXT: v_mov_b32_e32 v9, 0x39000000 -; SI-NEXT: v_mov_b32_e32 v10, 0x3d00 -; SI-NEXT: v_mov_b32_e32 v11, 0x3900 +; SI-NEXT: v_mov_b32_e32 v8, 0x3d00 +; SI-NEXT: v_mov_b32_e32 v9, 0x3900 +; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000 +; SI-NEXT: v_mov_b32_e32 v11, 0x39000000 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0 -; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2 -; SI-NEXT: v_cndmask_b32_e32 v13, v8, v9, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4 ; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6 -; SI-NEXT: v_cndmask_b32_e32 v14, v8, v9, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5 -; SI-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7 ; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6 +; SI-NEXT: v_cndmask_b32_e32 
v12, v10, v11, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_or_b32_e32 v4, v1, v13 -; SI-NEXT: v_or_b32_e32 v6, v2, v14 -; SI-NEXT: v_or_b32_e32 v2, v3, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v12, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v13, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7 +; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4 +; SI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v5, v12 +; SI-NEXT: v_or_b32_e32 v6, v3, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xi16_extract_8xi16_0: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 36a93bd2511ce..1e86842be4e5e 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -8,7 +8,8 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] ; GCN-NEXT: s_cbranch_execz .LBB0_2 @@ -19,22 +20,22 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc +; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:2 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc +; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:4 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 glc +; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:6 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc +; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:8 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc +; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:10 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc +; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:12 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc +; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:14 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v4, v0, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GCN-NEXT: v_or_b32_e32 v5, v0, v1 ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB0_2: 
; %Flow ; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -46,36 +47,36 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc +; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:2 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:4 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc +; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:6 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:8 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc +; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:10 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc +; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:12 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v2, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_or_b32_e32 v5, v2, v0 ; GCN-NEXT: .LBB0_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_ashrrev_i32_e32 v0, 16, v4 +; GCN-NEXT: v_bfe_i32 v0, v5, 0, 16 ; GCN-NEXT: v_bfe_i32 v1, v4, 0, 16 ; GCN-NEXT: v_mov_b32_e32 v2, 0xffff ; GCN-NEXT: v_mov_b32_e32 v3, 0x8000 ; GCN-NEXT: v_mov_b32_e32 v4, 0xffff8000 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll index 8f984bfd4d7f7..883a6b70a5a6d 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll @@ -13,34 +13,27 @@ define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace ; CHECK-NEXT: s_clause 0x1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 -; CHECK-NEXT: v_mov_b32_e32 v2, 8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_ushort v1, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2 +; CHECK-NEXT: global_load_sbyte v0, v0, s[0:1] offset:2 ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 ; CHECK-NEXT: s_cselect_b32 s0, -1, 0 ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; CHECK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; CHECK-NEXT: s_cbranch_vccz .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb19 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: ds_write_b32 v1, v1 ; CHECK-NEXT: .LBB0_2: ; %bb20 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_cmp_ne_u16_sdwa s0, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s1, s0 -; CHECK-NEXT: s_xor_b32 s1, exec_lo, s1 +; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0 +; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 +; CHECK-NEXT: s_cbranch_execz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; %bb11 -; CHECK-NEXT: v_mov_b32_e32 v2, 2 -; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: ds_write_b32 v0, v1 offset:84 -; CHECK-NEXT: ; %bb.4: ; %bb14 +; CHECK-NEXT: .LBB0_4: ; %bb14 ; CHECK-NEXT: s_endpgm bb: %call = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll index a7f9a4c51aa75..cf2976261d3d2 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll @@ -397,17 +397,13 @@ define amdgpu_kernel void @reuseOp() { ; GFX906-SAME: ) #[[ATTR0]] { ; GFX906-NEXT: entry: ; GFX906-NEXT: [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0 -; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32> ; GFX906-NEXT: br label [[BB_1:%.*]] ; GFX906: bb.1: -; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8> ; GFX906-NEXT: [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer -; GFX906-NEXT: [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32> -; GFX906-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]] +; GFX906-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1]], <16 x i8> [[SEL0]] ; GFX906-NEXT: br label [[BB_2:%.*]] ; GFX906: bb.2: -; GFX906-NEXT: [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8> -; GFX906-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0 +; GFX906-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0]], i64 0 ; GFX906-NEXT: ret void ; entry: