Skip to content

Commit ede0118

Browse files
JIT: Emit mulx for GT_MULHI and GT_MUL_LONG if BMI2 is available (#116198)
* WIP: Emit mulx for GT_MULHI * * Handle containment for GT_MUL_LONG on x86 * Fix register for mulx * Cleanup: use GenTree::IsUnsigned helper * update comments * update after merge * * remove move instruction since it is handled by lsra * don't force op1 to implicit register if op2 is already in it * minor formatting fixes * cleanup * Ensure magic number for GT_MULHI for division with constant, is put in rdx * only swap operands for GT_MULHI and GT_MUL_LONG * fix formatting * Fix operand order * Fixes after merge: * use OperIs() * replace InstructionSet_BMI2 => InstructionSet_AVX2 * fix review comment * kill rdx register for mulx instead of specifying as fixed register for use * fix format * remove register preference for mul, it only makes sense for extended 1 op mul - some cleanup of BuildMul, reorder and remove dead code * fix formatting * remove swap in lowering * update fixed reg in lowering for division by constant * change from isUsedFromMemory to isContained() * Fix review comments
1 parent 536bb84 commit ede0118

File tree

6 files changed

+163
-61
lines changed

6 files changed

+163
-61
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -822,38 +822,75 @@ void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
822822
// to get the high bits of the multiply, we are constrained to using the
823823
// 1-op form: RDX:RAX = RAX * rm
824824
// The 3-op form (Rx=Ry*Rz) does not support it.
825-
825+
// When BMI2 is available, we can use the MULX instruction to get the high bits
826826
genConsumeOperands(treeNode->AsOp());
827827

828828
GenTree* regOp = op1;
829829
GenTree* rmOp = op2;
830830

831-
// Set rmOp to the memory operand (if any)
832-
if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->GetRegNum() == REG_RAX)))
831+
if (op1->isUsedFromMemory())
833832
{
834833
regOp = op2;
835834
rmOp = op1;
836835
}
837836
assert(regOp->isUsedFromReg());
838837

839-
// Setup targetReg when neither of the source operands was a matching register
840-
inst_Mov(targetType, REG_RAX, regOp->GetRegNum(), /* canSkip */ true);
841-
842-
instruction ins;
843-
if ((treeNode->gtFlags & GTF_UNSIGNED) == 0)
838+
if (treeNode->IsUnsigned() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
844839
{
845-
ins = INS_imulEAX;
840+
if (rmOp->isUsedFromReg() && (rmOp->GetRegNum() == REG_RDX))
841+
{
842+
std::swap(regOp, rmOp);
843+
}
844+
845+
// Setup targetReg when neither of the source operands was a matching register
846+
inst_Mov(targetType, REG_RDX, regOp->GetRegNum(), /* canSkip */ true);
847+
848+
if (treeNode->OperIs(GT_MULHI))
849+
{
850+
// emit MULX instruction, use targetReg twice to only store high result
851+
inst_RV_RV_TT(INS_mulx, size, targetReg, targetReg, rmOp, /* isRMW */ false, INS_OPTS_NONE);
852+
}
853+
else
854+
{
855+
#if TARGET_64BIT
856+
assert(false);
857+
#else
858+
assert(treeNode->OperIs(GT_MUL_LONG));
859+
860+
// emit MULX instruction
861+
regNumber hiReg = treeNode->AsMultiRegOp()->GetRegByIndex(1);
862+
inst_RV_RV_TT(INS_mulx, size, hiReg, targetReg, rmOp, /* isRMW */ false, INS_OPTS_NONE);
863+
#endif
864+
}
846865
}
847-
else
866+
else // Generate MUL or IMUL instruction
848867
{
849-
ins = INS_mulEAX;
850-
}
851-
emit->emitInsBinary(ins, size, treeNode, rmOp);
868+
// If op2 is already present in RAX use that as implicit operand
869+
if (rmOp->isUsedFromReg() && (rmOp->GetRegNum() == REG_RAX))
870+
{
871+
std::swap(regOp, rmOp);
872+
}
852873

853-
// Move the result to the desired register, if necessary
854-
if (treeNode->OperIs(GT_MULHI))
855-
{
856-
inst_Mov(targetType, targetReg, REG_RDX, /* canSkip */ true);
874+
// Setup targetReg when neither of the source operands was a matching register
875+
inst_Mov(targetType, REG_RAX, regOp->GetRegNum(), /* canSkip */ true);
876+
877+
instruction ins;
878+
if (!treeNode->IsUnsigned())
879+
{
880+
ins = INS_imulEAX;
881+
}
882+
else
883+
{
884+
ins = INS_mulEAX;
885+
}
886+
emit->emitInsBinary(ins, size, treeNode, rmOp);
887+
888+
// Move the result to the desired register, if necessary
889+
if (treeNode->OperIs(GT_MULHI))
890+
{
891+
assert(targetReg == REG_RDX);
892+
inst_Mov(targetType, targetReg, REG_RDX, /* canSkip */ true);
893+
}
857894
}
858895

859896
genProduceReg(treeNode);

src/coreclr/jit/lower.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7921,11 +7921,12 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
79217921
}
79227922

79237923
#ifdef TARGET_XARCH
7924-
// force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes
7925-
// redundant copies otherwise
7924+
// force input transformation to RAX/RDX because the following MULHI will kill RDX:RAX (RDX if mulx is
7925+
// available) anyway and LSRA often causes redundant copies otherwise
79267926
if (firstNode && !simpleMul)
79277927
{
7928-
adjustedDividend->SetRegNum(REG_RAX);
7928+
regNumber implicitReg = comp->compOpportunisticallyDependsOn(InstructionSet_AVX2) ? REG_RDX : REG_RAX;
7929+
adjustedDividend->SetRegNum(implicitReg);
79297930
}
79307931
#endif
79317932

src/coreclr/jit/lower.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,13 @@ class Lowering final : public Phase
496496

497497
#endif // TARGET_XARCH
498498

499+
#if TARGET_X86
500+
if (parentNode->OperIs(GT_MUL_LONG))
501+
{
502+
return genTypeSize(childNode->TypeGet()) == operatorSize / 2;
503+
}
504+
#endif // TARGET_X86
505+
499506
return genTypeSize(childNode->TypeGet()) == operatorSize;
500507
}
501508

src/coreclr/jit/lowerxarch.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7851,14 +7851,15 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
78517851
bool isSafeToContainOp1 = true;
78527852
bool isSafeToContainOp2 = true;
78537853

7854-
bool isUnsignedMultiply = ((node->gtFlags & GTF_UNSIGNED) != 0);
7854+
bool isUnsignedMultiply = node->IsUnsigned();
78557855
bool requiresOverflowCheck = node->gtOverflowEx();
78567856
bool useLeaEncoding = false;
78577857
GenTree* memOp = nullptr;
78587858

78597859
bool hasImpliedFirstOperand = false;
78607860
GenTreeIntConCommon* imm = nullptr;
78617861
GenTree* other = nullptr;
7862+
var_types nodeType = node->TypeGet();
78627863

78637864
// Multiply should never be using small types
78647865
assert(!varTypeIsSmall(node->TypeGet()));
@@ -7878,6 +7879,8 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
78787879
else if (node->OperIs(GT_MUL_LONG))
78797880
{
78807881
hasImpliedFirstOperand = true;
7882+
// GT_MUL_LONG has node type LONG but operates on INT operands
7883+
nodeType = TYP_INT;
78817884
}
78827885
#endif
78837886
else if (IsContainableImmed(node, op2) || IsContainableImmed(node, op1))
@@ -7914,7 +7917,7 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
79147917
//
79157918
if (memOp == nullptr)
79167919
{
7917-
if ((op2->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op2))
7920+
if ((op2->TypeGet() == nodeType) && IsContainableMemoryOp(op2))
79187921
{
79197922
isSafeToContainOp2 = IsSafeToContainMem(node, op2);
79207923
if (isSafeToContainOp2)
@@ -7923,7 +7926,7 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
79237926
}
79247927
}
79257928

7926-
if ((memOp == nullptr) && (op1->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op1))
7929+
if ((memOp == nullptr) && (op1->TypeGet() == nodeType) && IsContainableMemoryOp(op1))
79277930
{
79287931
isSafeToContainOp1 = IsSafeToContainMem(node, op1);
79297932
if (isSafeToContainOp1)
@@ -7934,7 +7937,7 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
79347937
}
79357938
else
79367939
{
7937-
if ((memOp->TypeGet() != node->TypeGet()))
7940+
if ((memOp->TypeGet() != nodeType))
79387941
{
79397942
memOp = nullptr;
79407943
}

src/coreclr/jit/lsrabuild.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -782,7 +782,27 @@ regMaskTP LinearScan::getKillSetForMul(GenTreeOp* mulNode)
782782
regMaskTP killMask = RBM_NONE;
783783
#ifdef TARGET_XARCH
784784
assert(mulNode->OperIsMul());
785-
if (!mulNode->OperIs(GT_MUL) || (((mulNode->gtFlags & GTF_UNSIGNED) != 0) && mulNode->gtOverflowEx()))
785+
if (!mulNode->OperIs(GT_MUL))
786+
{
787+
// If we can use the mulx instruction, we don't need to kill RAX
788+
if (mulNode->IsUnsigned() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
789+
{
790+
// If one operand is contained, we define a fixed RDX register for the use, so we don't need to kill the register.
791+
if (mulNode->gtGetOp1()->isContained() || mulNode->gtGetOp2()->isContained())
792+
{
793+
killMask = RBM_NONE;
794+
}
795+
else
796+
{
797+
killMask = RBM_RDX;
798+
}
799+
}
800+
else
801+
{
802+
killMask = RBM_RAX | RBM_RDX;
803+
}
804+
}
805+
else if (mulNode->IsUnsigned() && mulNode->gtOverflowEx())
786806
{
787807
killMask = RBM_RAX | RBM_RDX;
788808
}

src/coreclr/jit/lsraxarch.cpp

Lines changed: 70 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,14 @@ bool LinearScan::isRMWRegOper(GenTree* tree)
795795
}
796796
return (!tree->gtGetOp2()->isContainedIntOrIImmed() && !tree->gtGetOp1()->isContainedIntOrIImmed());
797797
}
798+
#ifdef TARGET_X86
799+
case GT_MUL_LONG:
800+
#endif
801+
case GT_MULHI:
802+
{
803+
// MUL and IMUL are RMW, but mulx (used for unsigned operands when BMI2 is available) is not
804+
return !(tree->IsUnsigned() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2));
805+
}
798806

799807
#ifdef FEATURE_HW_INTRINSICS
800808
case GT_HWINTRINSIC:
@@ -3223,18 +3231,22 @@ int LinearScan::BuildMul(GenTree* tree)
32233231
return BuildSimple(tree);
32243232
}
32253233

3226-
// ToDo-APX : imul currently doesn't have rex2 support. So, cannot use R16-R31.
3227-
int srcCount = BuildBinaryUses(tree->AsOp());
3234+
bool isUnsignedMultiply = tree->IsUnsigned();
3235+
bool requiresOverflowCheck = tree->gtOverflowEx();
3236+
bool useMulx =
3237+
!tree->OperIs(GT_MUL) && isUnsignedMultiply && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2);
3238+
3239+
// ToDo-APX : imul currently doesn't have rex2 support. So, cannot use R16-R31.
3240+
int srcCount = 0;
32283241
int dstCount = 1;
32293242
SingleTypeRegSet dstCandidates = RBM_NONE;
32303243

3231-
bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0);
3232-
bool requiresOverflowCheck = tree->gtOverflowEx();
3233-
3234-
// There are three forms of x86 multiply:
3244+
// There are three forms of x86 multiply in base instruction set
32353245
// one-op form: RDX:RAX = RAX * r/m
32363246
// two-op form: reg *= r/m
32373247
// three-op form: reg = r/m * imm
3248+
// If the BMI2 instruction set is supported there is an additional unsigned multiply
3249+
// mulx reg1:reg2 = RDX * reg3/m
32383250

32393251
// This special widening 32x32->64 MUL is not used on x64
32403252
#if defined(TARGET_X86)
@@ -3244,42 +3256,64 @@ int LinearScan::BuildMul(GenTree* tree)
32443256
assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
32453257
}
32463258

3247-
// We do use the widening multiply to implement
3248-
// the overflow checking for unsigned multiply
3249-
//
3250-
if (isUnsignedMultiply && requiresOverflowCheck)
3259+
if (useMulx)
32513260
{
3252-
// The only encoding provided is RDX:RAX = RAX * rm
3253-
//
3254-
// Here we set RAX as the only destination candidate
3255-
// In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
3256-
//
3257-
dstCandidates = SRBM_RAX;
3258-
}
3259-
else if (tree->OperIs(GT_MULHI))
3260-
{
3261-
// Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the
3262-
// upper 32 bits of the result set the destination candidate to REG_RDX.
3263-
dstCandidates = SRBM_RDX;
3264-
}
3261+
// If one of the operands is contained, specify RDX for the other operand
3262+
SingleTypeRegSet srcCandidates1 = RBM_NONE;
3263+
SingleTypeRegSet srcCandidates2 = RBM_NONE;
3264+
if (op1->isContained())
3265+
{
3266+
assert(!op2->isContained());
3267+
srcCandidates2 = SRBM_RDX;
3268+
}
3269+
else if (op2->isContained())
3270+
{
3271+
srcCandidates1 = SRBM_RDX;
3272+
}
3273+
3274+
srcCount = BuildOperandUses(op1, srcCandidates1);
3275+
srcCount += BuildOperandUses(op2, srcCandidates2);
3276+
32653277
#if defined(TARGET_X86)
3266-
else if (tree->OperIs(GT_MUL_LONG))
3267-
{
3268-
// have to use the encoding:RDX:RAX = RAX * rm
3269-
dstCandidates = SRBM_RAX | SRBM_RDX;
3270-
dstCount = 2;
3271-
}
3278+
if (tree->OperIs(GT_MUL_LONG))
3279+
{
3280+
dstCount = 2;
3281+
}
32723282
#endif
3273-
GenTree* containedMemOp = nullptr;
3274-
if (op1->isContained() && !op1->IsCnsIntOrI())
3275-
{
3276-
assert(!op2->isContained() || op2->IsCnsIntOrI());
3277-
containedMemOp = op1;
32783283
}
3279-
else if (op2->isContained() && !op2->IsCnsIntOrI())
3284+
else
32803285
{
3281-
containedMemOp = op2;
3286+
assert(!(op1->isContained() && !op1->IsCnsIntOrI()) || !(op2->isContained() && !op2->IsCnsIntOrI()));
3287+
srcCount = BuildBinaryUses(tree->AsOp());
3288+
3289+
// We do use the widening multiply to implement
3290+
// the overflow checking for unsigned multiply
3291+
//
3292+
if (isUnsignedMultiply && requiresOverflowCheck)
3293+
{
3294+
// The only encoding provided is RDX:RAX = RAX * rm
3295+
//
3296+
// Here we set RAX as the only destination candidate
3297+
// In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
3298+
//
3299+
dstCandidates = SRBM_RAX;
3300+
}
3301+
else if (tree->OperIs(GT_MULHI))
3302+
{
3303+
// Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the
3304+
// upper 32 bits of the result set the destination candidate to REG_RDX.
3305+
dstCandidates = SRBM_RDX;
3306+
}
3307+
#if defined(TARGET_X86)
3308+
else if (tree->OperIs(GT_MUL_LONG))
3309+
{
3310+
// We have to use the encoding:RDX:RAX = RAX * rm
3311+
dstCandidates = SRBM_RAX | SRBM_RDX;
3312+
dstCount = 2;
3313+
}
3314+
#endif
32823315
}
3316+
32833317
regMaskTP killMask = getKillSetForMul(tree->AsOp());
32843318
BuildDefWithKills(tree, dstCount, dstCandidates, killMask);
32853319
return srcCount;

0 commit comments

Comments
 (0)