@@ -2530,17 +2530,17 @@ void EmitPass::EmitMulPair(GenIntrinsicInst* GII, const SSource Sources[4], cons
2530
2530
else
2531
2531
{
2532
2532
// For platforms that do not natively support DW-DW multiply, use the vISA madw instruction instead of mul/mulh to get better performance.
2533
- if (false && m_currShader->m_Platform->noNativeDwordMulSupport())
2533
+ if (m_currShader->m_Platform->noNativeDwordMulSupport())
2534
2534
{
2535
2535
// (Cr, E) = A * B
2536
2536
// dst size should be GRF-aligned and doubled as it has both low and high results.
2537
2537
// We must make sure the dst element count is numDWPerGRF-aligned. For example, if the madw is SIMD1,
2538
2538
// the dst has only 1 DW as low result in 1 GRF and only 1 DW as high result in another GRF. We should
2539
2539
// set the dst to (numDWPerGRF * 2) elements rather than 2 DW elements. This is required by madw.
2540
2540
auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
2541
- auto numElements = iSTD::Align(Lo->GetNumberElement(), numDWPerGRF) * 2 ;
2541
+ auto numElements = iSTD::Align(Lo->GetNumberElement(), numDWPerGRF);
2542
2542
CVariable* DstTmp = m_currShader->GetNewVariable(
2543
- numElements, ISA_TYPE_UD, EALIGN_GRF, Lo->IsUniform(),
2543
+ numElements * 2 , ISA_TYPE_UD, EALIGN_GRF, Lo->IsUniform(),
2544
2544
CName(Lo->getName(), "int64Tmp"));
2545
2545
CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
2546
2546
m_encoder->Madw(DstTmp, L0, L1, zero);
@@ -2551,7 +2551,7 @@ void EmitPass::EmitMulPair(GenIntrinsicInst* GII, const SSource Sources[4], cons
2551
2551
m_encoder->Push();
2552
2552
2553
2553
// dstHigh = Cr
2554
- uint regOffset = (uint)std::ceil((float)(numLanes(m_currShader->m_SIMDSize) * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
2554
+ uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
2555
2555
m_encoder->SetSrcSubVar(0, regOffset);
2556
2556
m_encoder->SetSrcRegion(0, 1, 1, 0);
2557
2557
m_encoder->Copy(dstHiTmp, DstTmp);
@@ -3662,7 +3662,7 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
3662
3662
// dstHigh = F + G + carry
3663
3663
3664
3664
// For platforms that do not natively support DW-DW multiply, use the vISA madw instruction instead of mul/mulh to get better performance.
3665
- if (false && m_currShader->m_Platform->noNativeDwordMulSupport())
3665
+ if (m_currShader->m_Platform->noNativeDwordMulSupport())
3666
3666
{
3667
3667
// (Cr, E) = A * B
3668
3668
EncoderInit();
@@ -3671,9 +3671,9 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
3671
3671
// the dst has only 1 DW as low result in 1 GRF and only 1 DW as high result in another GRF. We should
3672
3672
// set the dst to (numDWPerGRF * 2) elements rather than 2 DW elements. This is required by madw.
3673
3673
auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
3674
- auto numElements = iSTD::Align(dst->GetNumberElement(), numDWPerGRF) * 2 ;
3674
+ auto numElements = iSTD::Align(dst->GetNumberElement(), numDWPerGRF);
3675
3675
CVariable* dstTmp = m_currShader->GetNewVariable(
3676
- numElements, ISA_TYPE_UD, EALIGN_GRF, dst->IsUniform(),
3676
+ numElements * 2 , ISA_TYPE_UD, EALIGN_GRF, dst->IsUniform(),
3677
3677
CName(m_destination->getName(), "int64Tmp"));
3678
3678
CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
3679
3679
m_encoder->Madw(dstTmp, srcLo[0], srcLo[1], zero);
@@ -3686,7 +3686,7 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool
3686
3686
3687
3687
// copy high of A*B to dstHi
3688
3688
EncoderInit();
3689
- uint regOffset = (uint)std::ceil((float)(numLanes(simdMode) * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
3689
+ uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
3690
3690
m_encoder->SetSrcSubVar(0, regOffset);
3691
3691
m_encoder->SetSrcRegion(0, 1, 1, 0);
3692
3692
m_encoder->Copy(dstHi, dstTmp);
0 commit comments