Skip to content

Commit 4bf42a7

Browse files
committed (author and date not captured in this extraction)
Revert "Revert "Add SIMD to LowerCallMemcmp (dotnet#84530)" (dotnet#84595)"
1 parent f8435f4 commit 4bf42a7

File tree

2 files changed

+59
-15
lines changed

2 files changed

+59
-15
lines changed

src/coreclr/jit/gentree.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7384,9 +7384,9 @@ GenTree* Compiler::gtNewZeroConNode(var_types type)
73847384
#ifdef FEATURE_SIMD
73857385
if (varTypeIsSIMD(type))
73867386
{
7387-
GenTreeVecCon* allBitsSet = gtNewVconNode(type);
7388-
allBitsSet->gtSimdVal = simd_t::Zero();
7389-
return allBitsSet;
7387+
GenTreeVecCon* vecCon = gtNewVconNode(type);
7388+
vecCon->gtSimdVal = simd_t::Zero();
7389+
return vecCon;
73907390
}
73917391
#endif // FEATURE_SIMD
73927392

src/coreclr/jit/lower.cpp

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,8 +1902,20 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
19021902
{
19031903
GenTree* lArg = call->gtArgs.GetUserArgByIndex(0)->GetNode();
19041904
GenTree* rArg = call->gtArgs.GetUserArgByIndex(1)->GetNode();
1905-
// TODO: Add SIMD path for [16..128] via GT_HWINTRINSIC nodes
1906-
if (cnsSize <= 16)
1905+
1906+
ssize_t MaxUnrollSize = 16;
1907+
#ifdef FEATURE_SIMD
1908+
MaxUnrollSize = 32;
1909+
#ifdef TARGET_XARCH
1910+
if (comp->compOpportunisticallyDependsOn(InstructionSet_Vector256))
1911+
{
1912+
MaxUnrollSize = 64;
1913+
}
1914+
// TODO-XARCH-AVX512: Consider enabling this for AVX512
1915+
#endif
1916+
#endif
1917+
1918+
if (cnsSize <= MaxUnrollSize)
19071919
{
19081920
unsigned loadWidth = 1 << BitOperations::Log2((unsigned)cnsSize);
19091921
var_types loadType;
@@ -1919,11 +1931,25 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
19191931
{
19201932
loadType = TYP_INT;
19211933
}
1922-
else if ((loadWidth == 8) || (loadWidth == 16))
1934+
else if ((loadWidth == 8) || (MaxUnrollSize == 16))
19231935
{
19241936
loadWidth = 8;
19251937
loadType = TYP_LONG;
19261938
}
1939+
#ifdef FEATURE_SIMD
1940+
else if ((loadWidth == 16) || (MaxUnrollSize == 32))
1941+
{
1942+
loadWidth = 16;
1943+
loadType = TYP_SIMD16;
1944+
}
1945+
#ifdef TARGET_XARCH
1946+
else if ((loadWidth == 32) || (MaxUnrollSize == 64))
1947+
{
1948+
loadWidth = 32;
1949+
loadType = TYP_SIMD32;
1950+
}
1951+
#endif // TARGET_XARCH
1952+
#endif // FEATURE_SIMD
19271953
else
19281954
{
19291955
unreached();
@@ -1932,8 +1958,26 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
19321958

19331959
GenTree* result = nullptr;
19341960

1961+
auto newBinaryOp = [](Compiler* comp, genTreeOps oper, var_types type, GenTree* op1,
1962+
GenTree* op2) -> GenTree* {
1963+
#ifdef FEATURE_SIMD
1964+
if (varTypeIsSIMD(op1))
1965+
{
1966+
if (GenTree::OperIsCmpCompare(oper))
1967+
{
1968+
assert(type == TYP_INT);
1969+
return comp->gtNewSimdCmpOpAllNode(oper, TYP_BOOL, op1, op2, CORINFO_TYPE_NATIVEUINT,
1970+
genTypeSize(op1));
1971+
}
1972+
return comp->gtNewSimdBinOpNode(oper, op1->TypeGet(), op1, op2, CORINFO_TYPE_NATIVEUINT,
1973+
genTypeSize(op1));
1974+
}
1975+
#endif
1976+
return comp->gtNewOperNode(oper, type, op1, op2);
1977+
};
1978+
19351979
// loadWidth == cnsSize means a single load is enough for both args
1936-
if ((loadWidth == (unsigned)cnsSize) && (loadWidth <= 8))
1980+
if (loadWidth == (unsigned)cnsSize)
19371981
{
19381982
// We're going to emit something like the following:
19391983
//
@@ -1943,7 +1987,7 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
19431987
//
19441988
GenTree* lIndir = comp->gtNewIndir(loadType, lArg);
19451989
GenTree* rIndir = comp->gtNewIndir(loadType, rArg);
1946-
result = comp->gtNewOperNode(GT_EQ, TYP_INT, lIndir, rIndir);
1990+
result = newBinaryOp(comp, GT_EQ, TYP_INT, lIndir, rIndir);
19471991

19481992
BlockRange().InsertAfter(lArg, lIndir);
19491993
BlockRange().InsertAfter(rArg, rIndir);
@@ -1990,17 +2034,17 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
19902034
//
19912035
GenTree* l1Indir = comp->gtNewIndir(loadType, lArgUse.Def());
19922036
GenTree* r1Indir = comp->gtNewIndir(loadType, rArgUse.Def());
1993-
GenTree* lXor = comp->gtNewOperNode(GT_XOR, actualLoadType, l1Indir, r1Indir);
2037+
GenTree* lXor = newBinaryOp(comp, GT_XOR, actualLoadType, l1Indir, r1Indir);
19942038
GenTree* l2Offs = comp->gtNewIconNode(cnsSize - loadWidth, TYP_I_IMPL);
1995-
GenTree* l2AddOffs = comp->gtNewOperNode(GT_ADD, lArg->TypeGet(), lArgClone, l2Offs);
2039+
GenTree* l2AddOffs = newBinaryOp(comp, GT_ADD, lArg->TypeGet(), lArgClone, l2Offs);
19962040
GenTree* l2Indir = comp->gtNewIndir(loadType, l2AddOffs);
19972041
GenTree* r2Offs = comp->gtCloneExpr(l2Offs); // offset is the same
1998-
GenTree* r2AddOffs = comp->gtNewOperNode(GT_ADD, rArg->TypeGet(), rArgClone, r2Offs);
2042+
GenTree* r2AddOffs = newBinaryOp(comp, GT_ADD, rArg->TypeGet(), rArgClone, r2Offs);
19992043
GenTree* r2Indir = comp->gtNewIndir(loadType, r2AddOffs);
2000-
GenTree* rXor = comp->gtNewOperNode(GT_XOR, actualLoadType, l2Indir, r2Indir);
2001-
GenTree* resultOr = comp->gtNewOperNode(GT_OR, actualLoadType, lXor, rXor);
2002-
GenTree* zeroCns = comp->gtNewIconNode(0, actualLoadType);
2003-
result = comp->gtNewOperNode(GT_EQ, TYP_INT, resultOr, zeroCns);
2044+
GenTree* rXor = newBinaryOp(comp, GT_XOR, actualLoadType, l2Indir, r2Indir);
2045+
GenTree* resultOr = newBinaryOp(comp, GT_OR, actualLoadType, lXor, rXor);
2046+
GenTree* zeroCns = comp->gtNewZeroConNode(actualLoadType);
2047+
result = newBinaryOp(comp, GT_EQ, TYP_INT, resultOr, zeroCns);
20042048

20052049
BlockRange().InsertAfter(rArgClone, l1Indir, r1Indir, l2Offs, l2AddOffs);
20062050
BlockRange().InsertAfter(l2AddOffs, l2Indir, r2Offs, r2AddOffs, r2Indir);

0 commit comments

Comments (0)