Skip to content

Commit c0d4efe

Browse files
Carry ExtractMostSignificantBits through to LIR and add constant folding support (#117673)
* Carry ExtractMostSignificantBits through to LIR and add constant folding support
* Ensure 64-bit masks create 64-bit constants when folded
* Handle the fact that V512.EMSB always returns TYP_LONG
* Expose a GetRawBits and GetBitMask helper on simdmask_t to ensure we get valid data
1 parent 755ddd7 commit c0d4efe

File tree

9 files changed

+590
-243
lines changed

9 files changed

+590
-243
lines changed

src/coreclr/jit/gentree.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32618,6 +32618,78 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
3261832618
{
3261932619
switch (ni)
3262032620
{
32621+
// Constant folding for ExtractMostSignificantBits / MoveMask when the operand is a
// vector constant: cnsNode is read through AsVecCon() and the folded 32-bit integer
// constant is stored in resultNode. Vector64 is only handled on ARM64; Vector256 and
// the X86Base/AVX/AVX2 MoveMask intrinsics are only handled on XARCH.
#if defined(TARGET_ARM64)
32622+
case NI_Vector64_ExtractMostSignificantBits:
32623+
#elif defined(TARGET_XARCH)
32624+
case NI_Vector256_ExtractMostSignificantBits:
32625+
case NI_X86Base_MoveMask:
32626+
case NI_AVX_MoveMask:
32627+
case NI_AVX2_MoveMask:
32628+
#endif
32629+
case NI_Vector128_ExtractMostSignificantBits:
32630+
{
32631+
// Receives the result of EvaluateExtractMSB below (passed by address).
simdmask_t simdMaskVal;
32632+
32633+
// Dispatch on the vector width in bytes so the matching simdN_t constant payload is read.
switch (simdSize)
32634+
{
32635+
case 8:
32636+
{
32637+
EvaluateExtractMSB<simd8_t>(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd8Val);
32638+
break;
32639+
}
32640+
32641+
case 16:
32642+
{
32643+
EvaluateExtractMSB<simd16_t>(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd16Val);
32644+
break;
32645+
}
32646+
32647+
// 32-byte vectors are only compiled in for XARCH.
#if defined(TARGET_XARCH)
32648+
case 32:
32649+
{
32650+
EvaluateExtractMSB<simd32_t>(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd32Val);
32651+
break;
32652+
}
32653+
#endif // TARGET_XARCH
32654+
32655+
// Any other simdSize is not expected for the intrinsics handled by this case group.
default:
32656+
{
32657+
unreached();
32658+
}
32659+
}
32660+
32661+
// Number of elements in the vector: total byte size divided by the element byte size.
uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
32662+
// Combine the raw mask bits with the bit mask for elemCount elements.
// NOTE(review): presumably this discards bits beyond the element count so only valid
// bits reach the constant - confirm against simdmask_t::GetBitMask.
uint64_t mask = simdMaskVal.GetRawBits() & simdmask_t::GetBitMask(elemCount);
32663+
32664+
// This path only produces a 32-bit integer constant, so the return type must be int
// and there must be at most 32 elements for the narrowing cast below.
assert(varTypeIsInt(retType));
32665+
assert(elemCount <= 32);
32666+
32667+
resultNode = gtNewIconNode(static_cast<int32_t>(mask));
32668+
break;
32669+
}
32670+
32671+
// Constant folding for AVX512 MoveMask (XARCH only). Unlike the vector-constant case,
// the operand here is a mask constant read through cnsNode->AsMskCon(); the folded
// integer constant is stored in resultNode as either a 32-bit or a 64-bit node,
// depending on retType.
#ifdef TARGET_XARCH
32672+
case NI_AVX512_MoveMask:
32673+
{
32674+
GenTreeMskCon* mskCns = cnsNode->AsMskCon();
32675+
32676+
// Number of elements: total byte size divided by the element byte size.
uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
32677+
// Combine the constant's raw mask bits with the bit mask for elemCount elements.
// NOTE(review): presumably this discards bits beyond the element count - confirm
// against simdmask_t::GetBitMask.
uint64_t mask = mskCns->gtSimdMaskVal.GetRawBits() & simdmask_t::GetBitMask(elemCount);
32678+
32679+
if (varTypeIsInt(retType))
32680+
{
32681+
// An int result can only carry 32 mask bits, so more elements would be truncated.
assert(elemCount <= 32);
32682+
resultNode = gtNewIconNode(static_cast<int32_t>(mask));
32683+
}
32684+
else
32685+
{
32686+
// Otherwise the return type must be long, and the constant is created as a
// 64-bit node holding the full 64-bit mask value.
assert(varTypeIsLong(retType));
32687+
resultNode = gtNewLconNode(static_cast<int64_t>(mask));
32688+
}
32689+
break;
32690+
}
32691+
#endif // TARGET_XARCH
32692+
3262132693
#ifdef TARGET_ARM64
3262232694
case NI_ArmBase_LeadingZeroCount:
3262332695
#else

src/coreclr/jit/hwintrinsicarm64.cpp

Lines changed: 2 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,166 +1346,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
13461346
case NI_Vector128_ExtractMostSignificantBits:
13471347
{
13481348
assert(sig->numArgs == 1);
1349-
1350-
// ARM64 doesn't have a single instruction that performs the behavior so we'll emulate it instead.
1351-
// To do this, we effectively perform the following steps:
1352-
// 1. tmp = input & 0x80 ; and the input to clear all but the most significant bit
1353-
// 2. tmp = tmp >> index ; right shift each element by its index
1354-
// 3. tmp = sum(tmp) ; sum the elements together
1355-
1356-
// For byte/sbyte, we also need to handle the fact that we can only shift by up to 8
1357-
// but for Vector128, we have 16 elements to handle. In that scenario, we will simply
1358-
// extract both scalars, and combine them via: (upper << 8) | lower
1359-
1360-
var_types simdType = getSIMDTypeForSize(simdSize);
1361-
1362-
op1 = impSIMDPopStack();
1363-
1364-
GenTreeVecCon* vecCon2 = gtNewVconNode(simdType);
1365-
GenTreeVecCon* vecCon3 = gtNewVconNode(simdType);
1366-
1367-
switch (simdBaseType)
1368-
{
1369-
case TYP_BYTE:
1370-
case TYP_UBYTE:
1371-
{
1372-
simdBaseType = TYP_UBYTE;
1373-
simdBaseJitType = CORINFO_TYPE_UBYTE;
1374-
1375-
vecCon2->gtSimdVal.u64[0] = 0x8080808080808080;
1376-
vecCon3->gtSimdVal.u64[0] = 0x00FFFEFDFCFBFAF9;
1377-
1378-
if (simdSize == 16)
1379-
{
1380-
vecCon2->gtSimdVal.u64[1] = 0x8080808080808080;
1381-
vecCon3->gtSimdVal.u64[1] = 0x00FFFEFDFCFBFAF9;
1382-
}
1383-
break;
1384-
}
1385-
1386-
case TYP_SHORT:
1387-
case TYP_USHORT:
1388-
{
1389-
simdBaseType = TYP_USHORT;
1390-
simdBaseJitType = CORINFO_TYPE_USHORT;
1391-
1392-
vecCon2->gtSimdVal.u64[0] = 0x8000800080008000;
1393-
vecCon3->gtSimdVal.u64[0] = 0xFFF4FFF3FFF2FFF1;
1394-
1395-
if (simdSize == 16)
1396-
{
1397-
vecCon2->gtSimdVal.u64[1] = 0x8000800080008000;
1398-
vecCon3->gtSimdVal.u64[1] = 0xFFF8FFF7FFF6FFF5;
1399-
}
1400-
break;
1401-
}
1402-
1403-
case TYP_INT:
1404-
case TYP_UINT:
1405-
case TYP_FLOAT:
1406-
{
1407-
simdBaseType = TYP_INT;
1408-
simdBaseJitType = CORINFO_TYPE_INT;
1409-
1410-
vecCon2->gtSimdVal.u64[0] = 0x8000000080000000;
1411-
vecCon3->gtSimdVal.u64[0] = 0xFFFFFFE2FFFFFFE1;
1412-
1413-
if (simdSize == 16)
1414-
{
1415-
vecCon2->gtSimdVal.u64[1] = 0x8000000080000000;
1416-
vecCon3->gtSimdVal.u64[1] = 0xFFFFFFE4FFFFFFE3;
1417-
}
1418-
break;
1419-
}
1420-
1421-
case TYP_LONG:
1422-
case TYP_ULONG:
1423-
case TYP_DOUBLE:
1424-
{
1425-
simdBaseType = TYP_LONG;
1426-
simdBaseJitType = CORINFO_TYPE_LONG;
1427-
1428-
vecCon2->gtSimdVal.u64[0] = 0x8000000000000000;
1429-
vecCon3->gtSimdVal.u64[0] = 0xFFFFFFFFFFFFFFC1;
1430-
1431-
if (simdSize == 16)
1432-
{
1433-
vecCon2->gtSimdVal.u64[1] = 0x8000000000000000;
1434-
vecCon3->gtSimdVal.u64[1] = 0xFFFFFFFFFFFFFFC2;
1435-
}
1436-
break;
1437-
}
1438-
1439-
default:
1440-
{
1441-
unreached();
1442-
}
1443-
}
1444-
1445-
op3 = vecCon3;
1446-
op2 = vecCon2;
1447-
op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op2, NI_AdvSimd_And, simdBaseJitType, simdSize);
1448-
1449-
NamedIntrinsic shiftIntrinsic = NI_AdvSimd_ShiftLogical;
1450-
1451-
if ((simdSize == 8) && varTypeIsLong(simdBaseType))
1452-
{
1453-
shiftIntrinsic = NI_AdvSimd_ShiftLogicalScalar;
1454-
}
1455-
1456-
op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op3, shiftIntrinsic, simdBaseJitType, simdSize);
1457-
1458-
if (varTypeIsByte(simdBaseType) && (simdSize == 16))
1459-
{
1460-
op1 = impCloneExpr(op1, &op2, CHECK_SPILL_ALL,
1461-
nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits"));
1462-
1463-
op1 = gtNewSimdGetLowerNode(TYP_SIMD8, op1, simdBaseJitType, simdSize);
1464-
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 8);
1465-
op1 = gtNewSimdToScalarNode(genActualType(simdBaseType), op1, simdBaseJitType, 8);
1466-
op1 = gtNewCastNode(TYP_INT, op1, /* isUnsigned */ true, TYP_INT);
1467-
1468-
GenTree* zero = gtNewZeroConNode(TYP_SIMD16);
1469-
ssize_t index = 8 / genTypeSize(simdBaseType);
1470-
1471-
op2 = gtNewSimdGetUpperNode(TYP_SIMD8, op2, simdBaseJitType, simdSize);
1472-
op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op2, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 8);
1473-
op2 = gtNewSimdToScalarNode(genActualType(simdBaseType), op2, simdBaseJitType, 8);
1474-
op2 = gtNewCastNode(TYP_INT, op2, /* isUnsigned */ true, TYP_INT);
1475-
1476-
op2 = gtNewOperNode(GT_LSH, TYP_INT, op2, gtNewIconNode(8));
1477-
retNode = gtNewOperNode(GT_OR, TYP_INT, op1, op2);
1478-
}
1479-
else
1480-
{
1481-
if (!varTypeIsLong(simdBaseType))
1482-
{
1483-
if ((simdSize == 8) && ((simdBaseType == TYP_INT) || (simdBaseType == TYP_UINT)))
1484-
{
1485-
op1 = impCloneExpr(op1, &op2, CHECK_SPILL_ALL,
1486-
nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits"));
1487-
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_AddPairwise, simdBaseJitType,
1488-
simdSize);
1489-
}
1490-
else
1491-
{
1492-
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType,
1493-
simdSize);
1494-
}
1495-
}
1496-
else if (simdSize == 16)
1497-
{
1498-
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddPairwiseScalar, simdBaseJitType,
1499-
simdSize);
1500-
}
1501-
1502-
retNode = gtNewSimdToScalarNode(genActualType(simdBaseType), op1, simdBaseJitType, 8);
1503-
1504-
if ((simdBaseType != TYP_INT) && (simdBaseType != TYP_UINT))
1505-
{
1506-
retNode = gtNewCastNode(TYP_INT, retNode, /* isUnsigned */ true, TYP_INT);
1507-
}
1508-
}
1349+
op1 = impSIMDPopStack();
1350+
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize);
15091351
break;
15101352
}
15111353

src/coreclr/jit/hwintrinsiclistarm64.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ HARDWARE_INTRINSIC(Vector64, CreateSequence,
5151
HARDWARE_INTRINSIC(Vector64, Dot, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
5252
HARDWARE_INTRINSIC(Vector64, Equals, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
5353
HARDWARE_INTRINSIC(Vector64, EqualsAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
54-
HARDWARE_INTRINSIC(Vector64, ExtractMostSignificantBits, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
54+
HARDWARE_INTRINSIC(Vector64, ExtractMostSignificantBits, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
5555
HARDWARE_INTRINSIC(Vector64, Floor, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
5656
HARDWARE_INTRINSIC(Vector64, FusedMultiplyAdd, 8, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
5757
HARDWARE_INTRINSIC(Vector64, GetElement, 8, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SupportsContainment)
@@ -182,7 +182,7 @@ HARDWARE_INTRINSIC(Vector128, CreateSequence,
182182
HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
183183
HARDWARE_INTRINSIC(Vector128, Equals, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
184184
HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
185-
HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
185+
HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
186186
HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
187187
HARDWARE_INTRINSIC(Vector128, FusedMultiplyAdd, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
188188
HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SupportsContainment)

0 commit comments

Comments
 (0)