From a23c2216a698c2456e5fc6b6dfe68e57e604be08 Mon Sep 17 00:00:00 2001
From: Icohedron
Date: Thu, 8 May 2025 16:13:32 -0700
Subject: [PATCH 1/4] Legalize memcpy

Expand non-volatile llvm.memcpy calls with a constant length into
per-element load/store pairs. The source and destination must each be an
alloca or a global variable of the same array type, and the length must be
a multiple of the element store size.

---
 llvm/lib/Target/DirectX/DXILLegalizePass.cpp |  78 +++++++++
 llvm/test/CodeGen/DirectX/legalize-memcpy.ll | 174 +++++++++++++++++++
 2 files changed, 252 insertions(+)
 create mode 100644 llvm/test/CodeGen/DirectX/legalize-memcpy.ll

diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index be77a70fa46ba..d9e560a9f8859 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -15,6 +15,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <functional>
 
@@ -246,6 +247,58 @@ downcastI64toI32InsertExtractElements(Instruction &I,
   }
 }
 
+static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
+                                ConstantInt *Length) {
+
+  uint64_t ByteLength = Length->getZExtValue();
+  if (ByteLength == 0)
+    return;
+
+  LLVMContext &Ctx = Builder.getContext();
+  const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
+
+  auto GetArrTyFromVal = [](Value *Val) {
+    if (auto *Alloca = dyn_cast<AllocaInst>(Val))
+      return dyn_cast<ArrayType>(Alloca->getAllocatedType());
+    if (auto *GlobalVar = dyn_cast<GlobalVariable>(Val))
+      return dyn_cast<ArrayType>(GlobalVar->getValueType());
+    llvm_unreachable(
+        "Expected an Alloca or GlobalVariable in memcpy Src and Dst");
+  };
+
+  ArrayType *ArrTy = GetArrTyFromVal(Dst);
+  assert(ArrTy && "Expected Dst of memcpy to be a Pointer to an Array Type");
+  if (auto *DstGlobalVar = dyn_cast<GlobalVariable>(Dst))
+    assert(!DstGlobalVar->isConstant() &&
+           "The Dst of memcpy must not be a constant Global Variable");
+
+  [[maybe_unused]] ArrayType *SrcArrTy = GetArrTyFromVal(Src);
+  assert(SrcArrTy && "Expected Src of memcpy to be a Pointer to an Array Type");
+
+  // This assumption simplifies implementation and covers currently-known
+  // use-cases for DXIL. It may be relaxed in the future if required.
+  assert(ArrTy == SrcArrTy && "Array Types of Src and Dst in memcpy must match");
+
+  Type *ElemTy = ArrTy->getElementType();
+  uint64_t ElemSize = DL.getTypeStoreSize(ElemTy);
+  assert(ElemSize > 0 && "Size must be set");
+
+  [[maybe_unused]] uint64_t Size = ArrTy->getArrayNumElements();
+  assert(ElemSize * Size >= ByteLength &&
+         "Array size must be at least as large as the memcpy length");
+
+  uint64_t NumElemsToCopy = ByteLength / ElemSize;
+  assert(ByteLength % ElemSize == 0 &&
+         "memcpy length must be divisible by array element type");
+  for (uint64_t I = 0; I < NumElemsToCopy; ++I) {
+    Value *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), I);
+    Value *SrcPtr = Builder.CreateGEP(ElemTy, Src, Offset, "gep");
+    Value *SrcVal = Builder.CreateLoad(ElemTy, SrcPtr);
+    Value *DstPtr = Builder.CreateGEP(ElemTy, Dst, Offset, "gep");
+    Builder.CreateStore(SrcVal, DstPtr);
+  }
+}
+
 static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val,
                                 ConstantInt *SizeCI,
                                 DenseMap<Value *, Value *> &ReplacedValues) {
@@ -296,6 +349,30 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val,
   }
 }
 
+static void removeMemCpy(Instruction &I,
+                         SmallVectorImpl<Instruction *> &ToRemove,
+                         DenseMap<Value *, Value *> &ReplacedValues) {
+
+  CallInst *CI = dyn_cast<CallInst>(&I);
+  if (!CI)
+    return;
+
+  Intrinsic::ID ID = CI->getIntrinsicID();
+  if (ID != Intrinsic::memcpy)
+    return;
+
+  IRBuilder<> Builder(&I);
+  Value *Dst = CI->getArgOperand(0);
+  Value *Src = CI->getArgOperand(1);
+  ConstantInt *Length = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  assert(Length && "Expected Length to be a ConstantInt");
+  ConstantInt *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+  assert(IsVolatile && "Expected IsVolatile to be a ConstantInt");
+  assert(IsVolatile->getZExtValue() == 0 && "Expected IsVolatile to be false");
+  emitMemcpyExpansion(Builder, Dst, Src, Length);
+  ToRemove.push_back(CI);
+}
+
 static void removeMemSet(Instruction &I,
                          SmallVectorImpl<Instruction *> &ToRemove,
                          DenseMap<Value *, Value *> &ReplacedValues) {
@@ -348,6 +425,7 @@ class DXILLegalizationPipeline {
     LegalizationPipeline.push_back(fixI8UseChain);
     LegalizationPipeline.push_back(downcastI64toI32InsertExtractElements);
     LegalizationPipeline.push_back(legalizeFreeze);
+    LegalizationPipeline.push_back(removeMemCpy);
     LegalizationPipeline.push_back(removeMemSet);
   }
 };
diff --git a/llvm/test/CodeGen/DirectX/legalize-memcpy.ll b/llvm/test/CodeGen/DirectX/legalize-memcpy.ll
new file mode 100644
index 0000000000000..09a4d051a2822
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/legalize-memcpy.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -dxil-legalize -dxil-finalize-linkage -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+@outputStrides = external local_unnamed_addr addrspace(2) global [2 x <4 x i32>], align 4
+
+define void @replace_2x4xint_global_memcpy_test() #0 {
+; CHECK-LABEL: define void @replace_2x4xint_global_memcpy_test(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x <4 x i32>], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(2) @outputStrides, align 16
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <4 x i32>, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], ptr [[GEP]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr addrspace(2) getelementptr (<4 x i32>, ptr addrspace(2) @outputStrides, i32 1), align 16
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i32>, ptr [[TMP1]], i32 1
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[GEP1]], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %1 = alloca [2 x <4 x i32>], align 16
+  call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %1)
+  call void @llvm.memcpy.p0.p2.i32(ptr nonnull align 16 dereferenceable(32) %1, ptr addrspace(2) align 16 dereferenceable(32) @outputStrides, i32 32, i1 false)
+  call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %1)
+  ret void
+}
+
+define void @replace_int_memcpy_test() #0 {
+; CHECK-LABEL: define void @replace_int_memcpy_test(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [1 x i32], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [1 x i32], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[GEP1]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %1 = alloca [1 x i32], align 4
+  %2 = alloca [1 x i32], align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %1)
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2)
+  call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 4 dereferenceable(4) %2, ptr align 4 dereferenceable(4) %1, i32 4, i1 false)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %1)
+  ret void
+}
+
+define void @replace_int16_memcpy_test() #0 {
+; CHECK-LABEL: define void @replace_int16_memcpy_test(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x i16], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x i16], align 2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i16, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store i16 [[TMP3]], ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i16, ptr [[TMP2]], i32 1
+; CHECK-NEXT:    store i16 [[TMP4]], ptr [[GEP3]], align 2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %1 = alloca [2 x i16], align 2
+  %2 = alloca [2 x i16], align 2
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %1)
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2)
+  call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 2 dereferenceable(4) %2, ptr align 2 dereferenceable(4) %1, i32 4, i1 false)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %1)
+  ret void
+}
+
+define void @replace_float_memcpy_test() #0 {
+; CHECK-LABEL: define void @replace_float_memcpy_test(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x float], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x float], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr float, ptr [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr float, ptr [[TMP2]], i32 1
+; CHECK-NEXT:    store float [[TMP4]], ptr [[GEP3]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %1 = alloca [2 x float], align 4
+  %2 = alloca [2 x float], align 4
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2)
+  call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 4 dereferenceable(8) %2, ptr align 4 dereferenceable(8) %1, i32 8, i1 false)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %1)
+  ret void
+}
+
+define void @replace_double_memcpy_test() #0 {
+; CHECK-LABEL: define void @replace_double_memcpy_test(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x double], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x double], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr double, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store double [[TMP3]], ptr [[GEP1]], align 8
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[GEP2]], align 8
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr double, ptr [[TMP2]], i32 1
+; CHECK-NEXT:    store double [[TMP4]], ptr [[GEP3]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %1 = alloca [2 x double], align 4
+  %2 = alloca [2 x double], align 4
+  call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %1)
+  call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %2)
+  call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 4 dereferenceable(8) %2, ptr align 4 dereferenceable(8) %1, i32 16, i1 false)
+  call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %2)
+  call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %1)
+  ret void
+}
+
+define void @replace_half_memcpy_test() #0 {
+; CHECK-LABEL: define void @replace_half_memcpy_test(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x half], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x half], align 2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr half, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr half, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store half [[TMP3]], ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr half, ptr [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr half, ptr [[TMP2]], i32 1
+; CHECK-NEXT:    store half [[TMP4]], ptr [[GEP3]], align 2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[TMP2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %1 = alloca [2 x half], align 2
+  %2 = alloca [2 x half], align 2
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %1)
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2)
+  call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 2 dereferenceable(4) %2, ptr align 2 dereferenceable(4) %1, i32 4, i1 false)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %1)
+  ret void
+}
+
+attributes #0 = {"hlsl.export"}
+
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr captures(none))
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr captures(none))
+declare void @llvm.memcpy.p0.p2.i32(ptr noalias, ptr addrspace(2) noalias readonly, i32, i1)
+declare void @llvm.memcpy.p0.p0.i32(ptr noalias, ptr noalias readonly, i32, i1)

From 6ab5c90e31f31f9d59c32512a390318e5f5b77e1 Mon Sep 17 00:00:00 2001
From: Icohedron
Date: Thu, 8 May 2025 16:47:58 -0700
Subject: [PATCH 2/4] Apply clang-format

---
 llvm/lib/Target/DirectX/DXILLegalizePass.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index d9e560a9f8859..9d5886becb996 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -277,7 +277,8 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
 
   // This assumption simplifies implementation and covers currently-known
   // use-cases for DXIL. It may be relaxed in the future if required.
-  assert(ArrTy == SrcArrTy && "Array Types of Src and Dst in memcpy must match");
+  assert(ArrTy == SrcArrTy &&
+         "Array Types of Src and Dst in memcpy must match");
 
   Type *ElemTy = ArrTy->getElementType();
   uint64_t ElemSize = DL.getTypeStoreSize(ElemTy);

From 8d1223794136379f5ba90885b9f084aa5cdd3031 Mon Sep 17 00:00:00 2001
From: "Deric C."
Date: Fri, 9 May 2025 12:16:55 -0700
Subject: [PATCH 3/4] Specify return type of lambda expression

Co-authored-by: Finn Plummer
---
 llvm/lib/Target/DirectX/DXILLegalizePass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index 9d5886becb996..0b4b59e621773 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -257,7 +257,7 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
   LLVMContext &Ctx = Builder.getContext();
   const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
 
-  auto GetArrTyFromVal = [](Value *Val) {
+  auto GetArrTyFromVal = [](Value *Val) -> ArrayType * {
     if (auto *Alloca = dyn_cast<AllocaInst>(Val))
       return dyn_cast<ArrayType>(Alloca->getAllocatedType());
     if (auto *GlobalVar = dyn_cast<GlobalVariable>(Val))

From b44152e8b595cb328b18e37fc6bdbe0e4c649f47 Mon Sep 17 00:00:00 2001
From: Icohedron
Date: Fri, 9 May 2025 20:34:07 +0000
Subject: [PATCH 4/4] Replace llvm_unreachable with assert

The isa<> disjunction is parenthesized so that the message string is
attached to the whole condition rather than only to its right-hand
operand; `&&` binds tighter than `||`, and the unparenthesized form
triggers -Wparentheses.

---
 llvm/lib/Target/DirectX/DXILLegalizePass.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index 0b4b59e621773..e43c0ffe33175 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -15,7 +15,6 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <functional>
 
@@ -258,12 +257,13 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
   const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
 
   auto GetArrTyFromVal = [](Value *Val) -> ArrayType * {
+    assert((isa<AllocaInst>(Val) || isa<GlobalVariable>(Val)) &&
+           "Expected Val to be an Alloca or Global Variable");
     if (auto *Alloca = dyn_cast<AllocaInst>(Val))
       return dyn_cast<ArrayType>(Alloca->getAllocatedType());
     if (auto *GlobalVar = dyn_cast<GlobalVariable>(Val))
       return dyn_cast<ArrayType>(GlobalVar->getValueType());
-    llvm_unreachable(
-        "Expected an Alloca or GlobalVariable in memcpy Src and Dst");
+    return nullptr;
   };
 
   ArrayType *ArrTy = GetArrTyFromVal(Dst);