Skip to content

Commit 0ee6cf6

Browse files
memset lowering improvements in VC (#8)
Co-authored-by: Dmitry Ryabtsev <[email protected]>
1 parent 4a8e6b3 commit 0ee6cf6

File tree

1 file changed

+103
-19
lines changed

1 file changed

+103
-19
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp

+103-19
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3131
//===----------------------------------------------------------------------===//
3232

3333
#include "GenXLowerAggrCopies.h"
34+
#include "GenX.h"
3435
#include "llvm/Analysis/TargetTransformInfo.h"
3536
#include "llvm/CodeGen/StackProtector.h"
3637
#include "llvm/IR/Constants.h"
@@ -47,10 +48,13 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
4748
#include "llvm/Support/Debug.h"
4849
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
4950
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
50-
#include "Probe/Assertion.h"
5151

52+
#include "Probe/Assertion.h"
5253
#include "llvmWrapper/IR/DerivedTypes.h"
5354

55+
#include <tuple>
56+
#include <vector>
57+
5458
#define DEBUG_TYPE "GENX_LOWERAGGRCOPIES"
5559

5660
using namespace llvm;
@@ -91,6 +95,98 @@ struct GenXLowerAggrCopies : public FunctionPass {
9195

9296
char GenXLowerAggrCopies::ID = 0;
9397

98+
namespace {
// Describes one contiguous piece of a memory region that is being filled.
// Both fields are measured in elements of the stored value type (not
// necessarily bytes).
struct SliceInfo {
  // Index of the first element of the slice relative to the base address.
  int Offset;
  // Number of elements covered by the slice.
  int Width;
};
} // namespace
104+
105+
static std::vector<SliceInfo> getLegalLengths(int TotalLength) {
106+
std::vector<SliceInfo> Slices;
107+
for (int Offset = 0; TotalLength;) {
108+
int Width = PowerOf2Floor(TotalLength);
109+
Slices.push_back({Offset, Width});
110+
Offset += Width;
111+
TotalLength -= Width;
112+
}
113+
return std::move(Slices);
114+
}
115+
116+
// Original memset intrinsic fills memory with 8-bit values.
117+
// This function checks whether bigger type can be used (e.g. storing by 32-bit
118+
// values).
119+
// Desired type size is provided with \p CoalescedTySize parametr and given in
120+
// bytes.
121+
static bool memSetCanBeCoalesced(MemSetInst &MemSet, int CoalescedTySize) {
122+
auto OrigLength = cast<ConstantInt>(MemSet.getLength())->getSExtValue();
123+
IGC_ASSERT_MESSAGE(MemSet.getValue()->getType()->getScalarSizeInBits() ==
124+
genx::ByteBits,
125+
"memset is expected to store by bytes");
126+
IGC_ASSERT_MESSAGE(CoalescedTySize >= 1 && isPowerOf2_32(CoalescedTySize),
127+
"wrong argument: invalid CoalescedTySize");
128+
return OrigLength % CoalescedTySize == 0 &&
129+
static_cast<int>(MemSet.getDestAlignment()) >= CoalescedTySize;
130+
}
131+
132+
// Original memset intrinsic fills memory with 8-bit values.
133+
// This function checks whether bigger type can be used (e.g. storing by 32-bit
134+
// values).
135+
// New coalesced value and corresponding base address and length are returned
136+
// respectively.
137+
// New instructions may be inserted before the \p MemSet to produce these new
138+
// values.
139+
static std::tuple<Value &, Value &, int>
140+
defineOptimalValueAndLength(MemSetInst &MemSet) {
141+
auto OrigLength = cast<ConstantInt>(MemSet.getLength())->getSExtValue();
142+
Value &OrigSetVal = *MemSet.getValue();
143+
Value &OrigBaseAddr = *MemSet.getRawDest();
144+
145+
// Because DWord is better than Byte and causes minimal problems.
146+
// OWord can be better but but it requires more code.
147+
constexpr int CoalescedTySize = genx::DWordBytes;
148+
if (!memSetCanBeCoalesced(MemSet, CoalescedTySize))
149+
return {OrigSetVal, OrigBaseAddr, OrigLength};
150+
151+
IRBuilder<> IRB{&MemSet};
152+
auto *PreNewSetVal = IRB.CreateVectorSplat(
153+
CoalescedTySize, &OrigSetVal, OrigSetVal.getName() + ".pre.coalesce");
154+
auto *NewSetVal = IRB.CreateBitCast(PreNewSetVal, IRB.getInt32Ty(),
155+
OrigSetVal.getName() + ".coalesce");
156+
auto DstAS = cast<PointerType>(OrigBaseAddr.getType())->getAddressSpace();
157+
auto *NewBaseAddr =
158+
IRB.CreateBitCast(&OrigBaseAddr, IRB.getInt32Ty()->getPointerTo(DstAS),
159+
OrigBaseAddr.getName() + ".align");
160+
return {*NewSetVal, *NewBaseAddr, OrigLength / CoalescedTySize};
161+
}
162+
163+
// Fills memory section/slice defined by \p Slice and \p BaseAddr parameters
164+
// with \p SetVal values. Memory is filled by a vector store instruction.
165+
static void setMemorySliceWithVecStore(SliceInfo Slice, Value &SetVal,
166+
Value &BaseAddr,
167+
Instruction *InsertionPt) {
168+
IGC_ASSERT_MESSAGE(
169+
InsertionPt,
170+
"wrong argument: insertion point must be a valid instruction");
171+
IGC_ASSERT_MESSAGE(Slice.Offset >= 0 && isPowerOf2_32(Slice.Width),
172+
"illegal slice is provided");
173+
IGC_ASSERT_MESSAGE(SetVal.getType() ==
174+
BaseAddr.getType()->getPointerElementType(),
175+
"value and pointer types must correspond");
176+
177+
auto *VecTy = IGCLLVM::FixedVectorType::get(SetVal.getType(), Slice.Width);
178+
IRBuilder<> IRB(InsertionPt);
179+
Value *WriteOut = IRB.CreateVectorSplat(Slice.Width, &SetVal);
180+
auto *DstAddr = &BaseAddr;
181+
if (Slice.Offset != 0)
182+
DstAddr = IRB.CreateGEP(BaseAddr.getType()->getPointerElementType(),
183+
&BaseAddr, IRB.getInt32(Slice.Offset),
184+
BaseAddr.getName() + ".addr.offset");
185+
auto DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
186+
auto *StoreVecPtr = IRB.CreateBitCast(DstAddr, VecTy->getPointerTo(DstAS));
187+
IRB.CreateStore(WriteOut, StoreVecPtr);
188+
}
189+
94190
bool GenXLowerAggrCopies::runOnFunction(Function &F) {
95191
SmallVector<MemIntrinsic *, 4> MemCalls;
96192

@@ -135,26 +231,14 @@ bool GenXLowerAggrCopies::runOnFunction(Function &F) {
135231
} else {
136232
expandMemMoveAsLoop(Memmove);
137233
}
138-
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
234+
} else if (MemSetInst *MemSet = dyn_cast<MemSetInst>(MemCall)) {
139235
if (doLinearExpand) {
140-
llvm::Value *SetVal = Memset->getValue();
141-
llvm::Value *LenVal = Memset->getLength();
142-
IGC_ASSERT(isa<Constant>(LenVal));
143-
IGC_ASSERT(SetVal->getType()->getScalarSizeInBits() == 8);
144-
auto Len = (unsigned)cast<ConstantInt>(LenVal)->getZExtValue();
145-
auto VecTy = IGCLLVM::FixedVectorType::get(SetVal->getType(), Len);
146-
Value *WriteOut = UndefValue::get(VecTy);
147-
IRBuilder<> IRB(Memset);
148-
for (unsigned i = 0; i < Len; ++i) {
149-
WriteOut = IRB.CreateInsertElement(WriteOut, SetVal, IRB.getInt32(i));
150-
}
151-
auto DstAddr = Memset->getRawDest();
152-
unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
153-
auto StorePtrV =
154-
IRB.CreateBitCast(DstAddr, VecTy->getPointerTo(dstAS));
155-
IRB.CreateStore(WriteOut, StorePtrV);
236+
auto &&[SetVal, BaseAddr, Len] = defineOptimalValueAndLength(*MemSet);
237+
std::vector<SliceInfo> LegalLengths = getLegalLengths(Len);
238+
for (SliceInfo Slice : LegalLengths)
239+
setMemorySliceWithVecStore(Slice, SetVal, BaseAddr, MemSet);
156240
} else {
157-
expandMemSetAsLoop(Memset);
241+
expandMemSetAsLoop(MemSet);
158242
}
159243
}
160244
MemCall->eraseFromParent();

0 commit comments

Comments
 (0)