Enable madw use in i64 mul emulator (second attempt).

fangliu2020 · igcbot · commit d34d0af96792 · 2021-08-27T20:57:20.000+02:00
Fixed the regression caused by a regioning issue when moving the madw low/high results
to other variables, and re-enabled madw use in the i64 mul emulator.
Before fix:
madw (M1_NM, 1) int64Tmp(0,0)<1> localSize_0(0,0)<0;1,0> localSize_0(0,1)<0;1,0> 0x0:ud
mov (M1_NM, 1) V0101(0,0)<1> int64Tmp(0,0)<1;1,0>
mov (M1_NM, 1) V0103(0,0)<1> int64Tmp(2,0)<1;1,0>
After fix:
madw (M1_NM, 1) int64Tmp(0,0)<1> localSize_0(0,0)<0;1,0> localSize_0(0,1)<0;1,0> 0x0:ud
mov (M1_NM, 1) V0101(0,0)<1> int64Tmp(0,0)<1;1,0>
mov (M1_NM, 1) V0103(0,0)<1> int64Tmp(1,0)<1;1,0>
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -2530,17 +2530,17 @@ void EmitPass::EmitMulPair(GenIntrinsicInst* GII, const SSource Sources[4], cons
     else
     {
         // For those platforms natively not support DW-DW multiply, use vISA madw instruction instead of mul/mulh to get better performance.
-        if (false && m_currShader->m_Platform->noNativeDwordMulSupport())
+        if (m_currShader->m_Platform->noNativeDwordMulSupport())
         {
             // (Cr, E) = A * B
             // dst size should be GRF-aligned and doubled as it has both low and high results.
             // We must make the dst element number is numDWPerGRF aligned. For example, if the madw is SIMD1,
             // the dst has only 1 DW as low result in 1 GRF and only 1 DW as high result in another GRF. We should
             // set the dst as (numDWPerGRF * 2) element but not 2 DW elements. This is required by madw.
             auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
-            auto numElements = iSTD::Align(Lo->GetNumberElement(), numDWPerGRF) * 2;
+            auto numElements = iSTD::Align(Lo->GetNumberElement(), numDWPerGRF);
             CVariable* DstTmp = m_currShader->GetNewVariable(
-                numElements, ISA_TYPE_UD, EALIGN_GRF, Lo->IsUniform(),
+                numElements * 2, ISA_TYPE_UD, EALIGN_GRF, Lo->IsUniform(),
                 CName(Lo->getName(), "int64Tmp"));
             CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
             m_encoder->Madw(DstTmp, L0, L1, zero);
@@ -2551,7 +2551,7 @@ void EmitPass::EmitMulPair(GenIntrinsicInst* GII, const SSource Sources[4], cons
             m_encoder->Push();
 
             // dstHigh = Cr
-            uint regOffset = (uint)std::ceil((float)(numLanes(m_currShader->m_SIMDSize) * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
+            uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
             m_encoder->SetSrcSubVar(0, regOffset);
             m_encoder->SetSrcRegion(0, 1, 1, 0);
             m_encoder->Copy(dstHiTmp, DstTmp);
@@ -3662,7 +3662,7 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
     // dstHigh = F + G + carry
 
     // For those platforms natively not support DW-DW multiply, use vISA madw instruction instead of mul/mulh to get better performance.
-    if (false && m_currShader->m_Platform->noNativeDwordMulSupport())
+    if (m_currShader->m_Platform->noNativeDwordMulSupport())
     {
         // (Cr, E) = A * B
         EncoderInit();
@@ -3671,9 +3671,9 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
         // the dst has only 1 DW as low result in 1 GRF and only 1 DW as high result in another GRF. We should
         // set the dst as (numDWPerGRF * 2) element but not 2 DW elements. This is required by madw.
         auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
-        auto numElements = iSTD::Align(dst->GetNumberElement(), numDWPerGRF) * 2;
+        auto numElements = iSTD::Align(dst->GetNumberElement(), numDWPerGRF);
         CVariable* dstTmp = m_currShader->GetNewVariable(
-            numElements, ISA_TYPE_UD, EALIGN_GRF, dst->IsUniform(),
+            numElements * 2, ISA_TYPE_UD, EALIGN_GRF, dst->IsUniform(),
             CName(m_destination->getName(), "int64Tmp"));
         CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
         m_encoder->Madw(dstTmp, srcLo[0], srcLo[1], zero);
@@ -3686,7 +3686,7 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
 
         // copy high of A*B to dstHi
         EncoderInit();
-        uint regOffset = (uint)std::ceil((float)(numLanes(simdMode) * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
+        uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
         m_encoder->SetSrcSubVar(0, regOffset);
         m_encoder->SetSrcRegion(0, 1, 1, 0);
         m_encoder->Copy(dstHi, dstTmp);