@@ -2650,9 +2650,12 @@ void GlobalRA::updateSubRegAlignment(G4_SubReg_Align subAlign) {
2650
2650
}
2651
2651
}
2652
2652
2653
- bool GlobalRA::evenAlignNeeded (G4_Declare *dcl) {
2653
+ int GlobalRA::getAlignFromAugBucket (G4_Declare *dcl) {
2654
2654
if (GlobalRA::useGenericAugAlign (builder.getPlatformGeneration ())) {
2655
- // Return true if even alignment is needed
2655
+ // Return 0 if no special alignment is needed
2656
+ // Return 2 if even alignment is needed
2657
+ // Return 4 if quad alignment is needed
2658
+
2656
2659
// Even align needed if for given SIMD size and elem type,
2657
2660
// a complete def uses between 1-2 GRFs.
2658
2661
auto kernelSimdSizeToUse = kernel.getSimdSizeWithSlicing ();
@@ -2670,14 +2673,41 @@ bool GlobalRA::evenAlignNeeded(G4_Declare *dcl) {
2670
2673
topdclAugMask == AugmentationMasks::Default64Bit)
2671
2674
elemSizeToUse = 8 ;
2672
2675
2673
- if ( // Even align if size is between 1-2 GRFs, for >2GRF sizes use weak
2674
- // edges
2675
- (elemSizeToUse * kernelSimdSizeToUse) >
2676
- (unsigned )kernel.numEltPerGRF <Type_UB>() &&
2677
- (elemSizeToUse * kernelSimdSizeToUse) <=
2678
- (unsigned )(2 * kernel.numEltPerGRF <Type_UB>()) &&
2679
- !(!builder.canReadR0 () && dcl == kernel.fg .builder ->getBuiltinR0 ())) {
2680
- return true ;
2676
+ auto totalByteSize = elemSizeToUse * kernelSimdSizeToUse;
2677
+ auto bucketSpans2GRFs = [&]() {
2678
+ return totalByteSize > (unsigned )kernel.numEltPerGRF <Type_UB>() &&
2679
+ totalByteSize <= (unsigned )(2 * kernel.numEltPerGRF <Type_UB>());
2680
+ };
2681
+
2682
+ if (!(!builder.canReadR0 () && dcl == kernel.fg .builder ->getBuiltinR0 ())) {
2683
+ if (use4GRFAlign) {
2684
+ // The only time it's safe to do 2GRF align is when augmentation
2685
+ // bucket is known to be Default32Bit, otherwise we need to align
2686
+ // 4GRF. It isn't enough to simply check elemSize * GRF size to
2687
+ // decide alignment.
2688
+ if (topdclAugMask == AugmentationMasks::Default32Bit) {
2689
+ if (bucketSpans2GRFs ())
2690
+ return 2 ;
2691
+ } else if (topdclAugMask == AugmentationMasks::Default64Bit) {
2692
+ if (bucketSpans2GRFs ())
2693
+ // :df SIMD16
2694
+ return 2 ;
2695
+
2696
+ // :df SIMD32
2697
+ return 4 ;
2698
+ } else {
2699
+ // Local RA will take this path as augmentation buckets are set
2700
+ // to Undetermined. Although this is conservative, hybrid RA
2701
+ // will run augmentation and compute buckets to fill in "holes".
2702
+ // E.g., mov (32|M0) V10<2>:f should use 4GRF alignment as
2703
+ // it's Default64Bit variable, although elem size is :f.
2704
+ return 4 ;
2705
+ }
2706
+ } else {
2707
+ // Even align if size is between 1-2 GRFs; for >2GRF sizes use weak edges.
2708
+ if (bucketSpans2GRFs ())
2709
+ return 2 ;
2710
+ }
2681
2711
}
2682
2712
}
2683
2713
} else {
@@ -2693,21 +2723,28 @@ bool GlobalRA::evenAlignNeeded(G4_Declare *dcl) {
2693
2723
topdcl->getByteSize () >= kernel.numEltPerGRF <Type_UB>() &&
2694
2724
!(!builder.canReadR0 () &&
2695
2725
dcl == kernel.fg .builder ->getBuiltinR0 ())) {
2696
- return true ;
2726
+ return 2 ;
2697
2727
}
2698
2728
}
2699
2729
}
2700
2730
}
2701
2731
2702
- return false ;
2732
+ return 0 ;
2703
2733
}
2704
2734
2705
- // This function can be invoked before local RA or after augmentation.
2706
- void GlobalRA::evenAlign () {
2707
- // Update alignment of all GRF declares to align
2735
+ void GlobalRA::augAlign () {
2736
+ // Update alignment of all GRF declares based on
2737
+ // augmentation bucket and platform.
2708
2738
for (auto dcl : kernel.Declares ) {
2709
2739
if (dcl->getRegFile () & G4_GRF) {
2710
- if (evenAlignNeeded (dcl)) {
2740
+ unsigned int align = getAlignFromAugBucket (dcl);
2741
+ if (align == 4 ) {
2742
+ if (!isQuadAligned (dcl)) {
2743
+ incRA.evenAlignUpdate (dcl);
2744
+ }
2745
+ forceQuadAlign (dcl);
2746
+ }
2747
+ else if (align == 2 ) {
2711
2748
if (!isEvenAligned (dcl)) {
2712
2749
incRA.evenAlignUpdate (dcl);
2713
2750
}
@@ -3471,8 +3508,8 @@ bool Augmentation::markNonDefaultMaskDef() {
3471
3508
3472
3509
bool checkLRAAlign = false ;
3473
3510
if (liveAnalysis.livenessClass (G4_GRF)) {
3474
- if (( GlobalRA::useGenericAugAlign (kernel.getPlatformGeneration ()) &&
3475
- gra.evenAlignNeeded (dcl)) )
3511
+ if (GlobalRA::useGenericAugAlign (kernel.getPlatformGeneration ()) &&
3512
+ gra.getAlignFromAugBucket (dcl) > 0 )
3476
3513
checkLRAAlign = true ;
3477
3514
else if (gra.getAugmentationMask (dcl) ==
3478
3515
AugmentationMasks::Default32Bit &&
@@ -3485,10 +3522,22 @@ bool Augmentation::markNonDefaultMaskDef() {
3485
3522
if (dclLR) {
3486
3523
int s;
3487
3524
auto phyReg = dclLR->getPhyReg (s);
3488
- if (phyReg && phyReg->asGreg ()->getRegNum () % 2 != 0 ) {
3489
- // If LRA assignment is not 2GRF aligned for then
3525
+ unsigned int maxAlign = 2 ;
3526
+ if (gra.use4GRFAlign && gra.getAugmentationMask (dcl) == AugmentationMasks::Default64Bit) {
3527
+ maxAlign = 4 ;
3528
+ }
3529
+ if (phyReg && phyReg->asGreg ()->getRegNum () % maxAlign != 0 ) {
3530
+ // If LRA assignment is not aligned as expected then
3490
3531
// mark it as non-default. GRA candidates cannot fully
3491
3532
// overlap with such ranges. Partial overlap is illegal.
3533
+
3534
+ // TODO: There's a bug here. This branch should execute only if
3535
+ // dclLR->getAssigned() == true. If this is false, then
3536
+ // dclLR->getPhyReg() is invalid. Once this is fixed, we can
3537
+ // re-enable following assert.
3538
+ //
3539
+ // vISA_ASSERT(!gra.use4GRFAlign,
3540
+ // "expecting LRA allocation to be aligned");
3492
3541
gra.setAugmentationMask (dcl, AugmentationMasks::NonDefault);
3493
3542
nonDefaultMaskDefFound = true ;
3494
3543
}
@@ -4195,6 +4244,8 @@ bool Interference::isStrongEdgeBetween(const G4_Declare *dcl1,
4195
4244
4196
4245
bool Augmentation::weakEdgeNeeded (AugmentationMasks defaultDclMask,
4197
4246
AugmentationMasks newDclMask) {
4247
+ if (gra.use4GRFAlign )
4248
+ return false ;
4198
4249
if (useGenericAugAlign) {
4199
4250
// Weak edge needed in case #GRF exceeds 2
4200
4251
if (newDclMask == AugmentationMasks::Default64Bit)
@@ -4746,9 +4797,9 @@ void Augmentation::augmentIntfGraph() {
4746
4797
// to 2GRF except for NoMask variables
4747
4798
VISA_DEBUG_VERBOSE (std::cout
4748
4799
<< " Kernel size is SIMD" << kernel.getSimdSize ()
4749
- << " so updating all GRFs to be 2GRF aligned "
4800
+ << " so updating all GRFs to aug align "
4750
4801
<< " \n " );
4751
- gra.evenAlign ();
4802
+ gra.augAlign ();
4752
4803
}
4753
4804
gra.updateSubRegAlignment (kernel.getGRFAlign ());
4754
4805
}
@@ -5054,6 +5105,7 @@ void GraphColor::computeDegreeForGRF() {
5054
5105
// consider weak edges in degree computation
5055
5106
auto *weakEdges = intf.getCompatibleSparseIntf (lrs[i]->getDcl ());
5056
5107
if (weakEdges) {
5108
+ vISA_ASSERT (!gra.use4GRFAlign , " not expecting weak edges" );
5057
5109
for (auto weakNeighbor : *weakEdges) {
5058
5110
if (!weakNeighbor->getRegVar ()->isRegAllocPartaker ())
5059
5111
continue ;
@@ -5367,16 +5419,22 @@ void GraphColor::relaxNeighborDegreeGRF(LiveRange *lr) {
5367
5419
if (!(lr->getIsPseudoNode ()) && !(lr->getIsPartialDcl ())) {
5368
5420
unsigned lr_id = lr->getVar ()->getId ();
5369
5421
bool lr2EvenAlign = gra.isEvenAligned (lr->getDcl ());
5422
+ unsigned int lr2AugAlign = gra.getAugAlign (lr->getDcl ());
5370
5423
unsigned lr2_nreg = lr->getNumRegNeeded ();
5371
5424
5372
5425
// relax degree between 2 nodes
5373
5426
auto relaxDegree = [&](LiveRange *lr1) {
5374
5427
if (lr1->getActive () && !lr1->getIsPseudoNode () &&
5375
5428
!(lr1->getIsPartialDcl ())) {
5376
- bool lr1EvenAlign = gra.isEvenAligned (lr1->getDcl ());
5377
5429
unsigned lr1_nreg = lr1->getNumRegNeeded ();
5378
- unsigned w =
5379
- edgeWeightGRF (lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
5430
+ unsigned w = 0 ;
5431
+ if (gra.use4GRFAlign ) {
5432
+ unsigned int lr1AugAlign = gra.getAugAlign (lr1->getDcl ());
5433
+ w = edgeWeightWith4GRF (lr1AugAlign, lr2AugAlign, lr1_nreg, lr2_nreg);
5434
+ } else {
5435
+ bool lr1EvenAlign = gra.isEvenAligned (lr1->getDcl ());
5436
+ w = edgeWeightGRF (lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
5437
+ }
5380
5438
VISA_DEBUG_VERBOSE ({
5381
5439
std::cout << " \t relax " ;
5382
5440
lr1->dump ();
@@ -5782,9 +5840,15 @@ bool GraphColor::assignColors(ColorHeuristic colorHeuristicGRF,
5782
5840
if (!failed_alloc) {
5783
5841
// When evenAlignNeeded is true, it is binding for correctness
5784
5842
bool evenAlignNeeded = gra.isEvenAligned (lrVar->getDeclare ());
5785
- BankAlign align = evenAlignNeeded ? BankAlign::Even : BankAlign::Either;
5786
- if (allocFromBanks) {
5843
+ bool quadAlignNeeded = gra.isQuadAligned (lrVar->getDeclare ());
5844
+ BankAlign align = BankAlign::Either;
5845
+ if (quadAlignNeeded)
5846
+ align = BankAlign::QuadGRF;
5847
+ else if (evenAlignNeeded)
5848
+ align = BankAlign::Even;
5787
5849
5850
+ if (allocFromBanks) {
5851
+ vISA_ASSERT (align != BankAlign::QuadGRF, " unexpected value" );
5788
5852
if (!isHybrid && oneGRFBankDivision &&
5789
5853
(!evenAlignNeeded ||
5790
5854
builder.getPlatformGeneration () == PlatformGen::GEN9)) {
@@ -10876,12 +10940,20 @@ void GlobalRA::insertRestoreAddr(G4_BB *bb) {
10876
10940
// correctness.
10877
10941
//
10878
10942
unsigned GraphColor::edgeWeightGRF (const LiveRange *lr1, const LiveRange *lr2) {
10879
- bool lr1EvenAlign = gra.isEvenAligned (lr1->getDcl ());
10880
- bool lr2EvenAlign = gra.isEvenAligned (lr2->getDcl ());
10881
10943
unsigned lr1_nreg = lr1->getNumRegNeeded ();
10882
10944
unsigned lr2_nreg = lr2->getNumRegNeeded ();
10883
10945
10884
- return edgeWeightGRF (lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
10946
+ if (gra.use4GRFAlign ) {
10947
+ auto lr1Align = gra.getAugAlign (lr1->getDcl ());
10948
+ auto lr2Align = gra.getAugAlign (lr2->getDcl ());
10949
+
10950
+ return edgeWeightWith4GRF (lr1Align, lr2Align, lr1_nreg, lr2_nreg);
10951
+ } else {
10952
+ bool lr1EvenAlign = gra.isEvenAligned (lr1->getDcl ());
10953
+ bool lr2EvenAlign = gra.isEvenAligned (lr2->getDcl ());
10954
+
10955
+ return edgeWeightGRF (lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
10956
+ }
10885
10957
}
10886
10958
10887
10959
unsigned GraphColor::edgeWeightARF (const LiveRange *lr1, const LiveRange *lr2) {
0 commit comments