Skip to content

Commit d34d0af

Browse files
fangliu2020igcbot
authored and committed
Enable madw use in i64 mul emulator. #2 try.
Fixed the regression caused by a regioning issue in moving the madw low/high results to other variables, and enabled madw use in the i64 mul emulator again.
Before fix:
madw (M1_NM, 1) int64Tmp(0,0)<1> localSize_0(0,0)<0;1,0> localSize_0(0,1)<0;1,0> 0x0:ud
mov (M1_NM, 1) V0101(0,0)<1> int64Tmp(0,0)<1;1,0>
mov (M1_NM, 1) V0103(0,0)<1> int64Tmp(2,0)<1;1,0>
After fix:
madw (M1_NM, 1) int64Tmp(0,0)<1> localSize_0(0,0)<0;1,0> localSize_0(0,1)<0;1,0> 0x0:ud
mov (M1_NM, 1) V0101(0,0)<1> int64Tmp(0,0)<1;1,0>
mov (M1_NM, 1) V0103(0,0)<1> int64Tmp(1,0)<1;1,0>
1 parent 7c8a4d0 commit d34d0af

File tree

1 file changed

+8
-8
lines changed

1 file changed

+8
-8
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2530,17 +2530,17 @@ void EmitPass::EmitMulPair(GenIntrinsicInst* GII, const SSource Sources[4], cons
25302530
else
25312531
{
25322532
// For those platforms natively not support DW-DW multiply, use vISA madw instruction instead of mul/mulh to get better performance.
2533-
if (false && m_currShader->m_Platform->noNativeDwordMulSupport())
2533+
if (m_currShader->m_Platform->noNativeDwordMulSupport())
25342534
{
25352535
// (Cr, E) = A * B
25362536
// dst size should be GRF-aligned and doubled as it has both low and high results.
25372537
// We must make the dst element number is numDWPerGRF aligned. For example, if the madw is SIMD1,
25382538
// the dst has only 1 DW as low result in 1 GRF and only 1 DW as high result in another GRF. We should
25392539
// set the dst as (numDWPerGRF * 2) element but not 2 DW elements. This is required by madw.
25402540
auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
2541-
auto numElements = iSTD::Align(Lo->GetNumberElement(), numDWPerGRF) * 2;
2541+
auto numElements = iSTD::Align(Lo->GetNumberElement(), numDWPerGRF);
25422542
CVariable* DstTmp = m_currShader->GetNewVariable(
2543-
numElements, ISA_TYPE_UD, EALIGN_GRF, Lo->IsUniform(),
2543+
numElements * 2, ISA_TYPE_UD, EALIGN_GRF, Lo->IsUniform(),
25442544
CName(Lo->getName(), "int64Tmp"));
25452545
CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
25462546
m_encoder->Madw(DstTmp, L0, L1, zero);
@@ -2551,7 +2551,7 @@ void EmitPass::EmitMulPair(GenIntrinsicInst* GII, const SSource Sources[4], cons
25512551
m_encoder->Push();
25522552

25532553
// dstHigh = Cr
2554-
uint regOffset = (uint)std::ceil((float)(numLanes(m_currShader->m_SIMDSize) * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
2554+
uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
25552555
m_encoder->SetSrcSubVar(0, regOffset);
25562556
m_encoder->SetSrcRegion(0, 1, 1, 0);
25572557
m_encoder->Copy(dstHiTmp, DstTmp);
@@ -3662,7 +3662,7 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
36623662
// dstHigh = F + G + carry
36633663

36643664
// For those platforms natively not support DW-DW multiply, use vISA madw instruction instead of mul/mulh to get better performance.
3665-
if (false && m_currShader->m_Platform->noNativeDwordMulSupport())
3665+
if (m_currShader->m_Platform->noNativeDwordMulSupport())
36663666
{
36673667
// (Cr, E) = A * B
36683668
EncoderInit();
@@ -3671,9 +3671,9 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
36713671
// the dst has only 1 DW as low result in 1 GRF and only 1 DW as high result in another GRF. We should
36723672
// set the dst as (numDWPerGRF * 2) element but not 2 DW elements. This is required by madw.
36733673
auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
3674-
auto numElements = iSTD::Align(dst->GetNumberElement(), numDWPerGRF) * 2;
3674+
auto numElements = iSTD::Align(dst->GetNumberElement(), numDWPerGRF);
36753675
CVariable* dstTmp = m_currShader->GetNewVariable(
3676-
numElements, ISA_TYPE_UD, EALIGN_GRF, dst->IsUniform(),
3676+
numElements * 2, ISA_TYPE_UD, EALIGN_GRF, dst->IsUniform(),
36773677
CName(m_destination->getName(), "int64Tmp"));
36783678
CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
36793679
m_encoder->Madw(dstTmp, srcLo[0], srcLo[1], zero);
@@ -3686,7 +3686,7 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
36863686

36873687
// copy high of A*B to dstHi
36883688
EncoderInit();
3689-
uint regOffset = (uint)std::ceil((float)(numLanes(simdMode) * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
3689+
uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
36903690
m_encoder->SetSrcSubVar(0, regOffset);
36913691
m_encoder->SetSrcRegion(0, 1, 1, 0);
36923692
m_encoder->Copy(dstHi, dstTmp);

0 commit comments

Comments
 (0)