@@ -440,7 +440,8 @@ void CodeGenFunction::EmitNoLoopXteamScanInit(const OMPLoopDirective &LD,
440
440
const FunctionArgList *Args,
441
441
llvm::Value *&GpuThreadId,
442
442
llvm::Value *&GlobalGpuThreadId,
443
- llvm::Value *&WorkGroupId) {
443
+ llvm::Value *&WorkGroupId,
444
+ llvm::Value *&TotalNumThreads) {
444
445
auto IVPair = EmitNoLoopIV (LD, Args);
445
446
Address OMPIterationVarAddr = IVPair.second ;
446
447
@@ -468,6 +469,8 @@ void CodeGenFunction::EmitNoLoopXteamScanInit(const OMPLoopDirective &LD,
468
469
CGM.updateXteamRedKernel (
469
470
CapturedForStmt, Builder.CreateIntCast (OMPIterationVar, Int64Ty, false ),
470
471
NumTeams);
472
+ TotalNumThreads =
473
+ Builder.CreateMul (NumTeams, WorkGroupSize, " total_num_threads" );
471
474
Builder.CreateStore (OMPIterationVar, OMPIterationVarAddr);
472
475
473
476
// Emit updates of the original loop indices
@@ -488,8 +491,9 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseOneCode(
488
491
llvm::Value *GpuThreadId = nullptr ;
489
492
llvm::Value *GlobalGpuThreadId = nullptr ;
490
493
llvm::Value *WorkGroupId = nullptr ;
494
+ llvm::Value *TotalNumThreads = nullptr ;
491
495
EmitNoLoopXteamScanInit (LD, CapturedForStmt, Args, GpuThreadId,
492
- GlobalGpuThreadId, WorkGroupId);
496
+ GlobalGpuThreadId, WorkGroupId, TotalNumThreads );
493
497
494
498
// Branch to end if original loop condition not satisfied
495
499
llvm::Value *IvCmp = EvaluateExprAsBool (LD.getCond ());
@@ -539,8 +543,9 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode(
539
543
llvm::Value *GpuThreadId = nullptr ;
540
544
llvm::Value *GlobalGpuThreadId = nullptr ;
541
545
llvm::Value *WorkGroupId = nullptr ;
546
+ llvm::Value *TotalNumThreads = nullptr ;
542
547
EmitNoLoopXteamScanInit (LD, CapturedForStmt, Args, GpuThreadId,
543
- GlobalGpuThreadId, WorkGroupId);
548
+ GlobalGpuThreadId, WorkGroupId, TotalNumThreads );
544
549
545
550
const CodeGenModule::XteamRedVarMap &RedVarMap =
546
551
CGM.getXteamRedVarMap (CapturedForStmt);
@@ -559,137 +564,21 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode(
559
564
Address XteamRedSumArg3 = GetAddrOfLocalVar ((*Args)[RVI.ArgPos + 2 ]);
560
565
llvm::Value *DScanStorage = Builder.CreateLoad (XteamRedSumArg3);
561
566
562
- // TODO: Extract a DeviceRTL function out of the PhaseTwo of Xteam Scan
563
- // codegen.
564
- if (CGM.OMPPresentScanDirective ->hasClausesOfKind <OMPInclusiveClause>()) {
565
- // Handle the redistribution of cross-team scan result inside every
566
- // constituent team member by emitting this -
567
- // RedVar = Storage[GlobalTID]
568
- // if(TeamID >= 1)
569
- // {
570
- // RedVar += TeamVals[TeamID - 1]
571
- // }
572
- Address ScanStorageValGEP = Address (
573
- Builder.CreateGEP (RedVarType, DScanStorage, GlobalGpuThreadId),
574
- RedVarType,
575
- getContext ().getTypeAlignInChars (
576
- XteamVD->getType ())); // Storage[GlobalTID]
577
- Builder.CreateStore (Builder.CreateLoad (ScanStorageValGEP),
578
- RVI.RedVarAddr ); // RedVar = Storage[GlobalTID]
579
- llvm::Value *IsAfterFirstTeam = Builder.CreateICmpUGE (
580
- WorkGroupId, llvm::ConstantInt::get (Int32Ty, 1 )); // TeamID >= 1
581
- llvm::BasicBlock *IsAfterFirstTeamThenBlock =
582
- createBasicBlock (" omp.is.after.first.team.then" );
583
- llvm::BasicBlock *InclusiveScanEndBlock =
584
- createBasicBlock (" omp.xteam.inclusive.scan.end" );
585
- Builder.CreateCondBr (IsAfterFirstTeam, IsAfterFirstTeamThenBlock,
586
- InclusiveScanEndBlock);
587
- EmitBlock (IsAfterFirstTeamThenBlock);
588
- Address PrevTeamValGEP =
589
- Address (Builder.CreateGEP (
590
- RedVarType, DTeamVals,
591
- Builder.CreateSub (WorkGroupId,
592
- llvm::ConstantInt::get (Int32Ty, 1 ))),
593
- RedVarType,
594
- getContext ().getTypeAlignInChars (
595
- XteamVD->getType ())); // TeamVals[TeamID - 1]
596
- Builder.CreateStore (Builder.CreateAdd (Builder.CreateLoad (RVI.RedVarAddr ),
597
- Builder.CreateLoad (PrevTeamValGEP)),
598
- RVI.RedVarAddr ); // RedVar += TeamVals[TeamID - 1]
599
- EmitBranch (InclusiveScanEndBlock);
600
- EmitBlock (InclusiveScanEndBlock);
601
- } else {
602
- // Redistribution for the 'exclusive' scan is handled differently because
603
- // each work-item accesses the temporary output 'Storage' at the index
604
- // before it's own global thread id(GlobalTID). Emits the following -
605
- // RedVar = 0
606
- // if(GlobalTID >= 1)
607
- // {
608
- // RedVar = Storage[GlobalTID - 1]
609
- // if(TeamID >= 1)
610
- // {
611
- // if(localTID >= 1)
612
- // RedVar += TeamVals[TeamID - 1];
613
- // else if(TeamID >= 2)
614
- // RedVar += TeamVals[TeamID - 2];
615
- // }
616
- // }
617
-
618
- Builder.CreateStore (llvm::ConstantInt::get (RedVarType, 0 ),
619
- RVI.RedVarAddr ); // RedVar = 0
620
- llvm::Value *IsNotFirstThread = Builder.CreateICmpUGE (
621
- GlobalGpuThreadId,
622
- llvm::ConstantInt::get (Int32Ty, 1 )); // GlobalTID >= 1
623
- llvm::BasicBlock *IsNotFirstThreadThenBlock =
624
- createBasicBlock (" omp.is.not.first.thread.then" );
625
- llvm::BasicBlock *ExclusiveScanEndBlock =
626
- createBasicBlock (" omp.xteam.exclusive.scan.end" );
627
- Builder.CreateCondBr (IsNotFirstThread, IsNotFirstThreadThenBlock,
628
- ExclusiveScanEndBlock);
629
- EmitBlock (IsNotFirstThreadThenBlock);
630
- llvm::Value *PrevGlobalGpuThreadId = Builder.CreateSub (
631
- GlobalGpuThreadId,
632
- llvm::ConstantInt::get (Int32Ty, 1 )); // GlobalTID - 1
633
- Address ScanStoragePrevValGEP = Address (
634
- Builder.CreateGEP (RedVarType, DScanStorage, PrevGlobalGpuThreadId),
635
- RedVarType,
636
- getContext ().getTypeAlignInChars (
637
- XteamVD->getType ())); // Storage[GlobalTID - 1]
638
- Builder.CreateStore (Builder.CreateLoad (ScanStoragePrevValGEP),
639
- RVI.RedVarAddr ); // RedVar = Storage[GlobalTID - 1]
640
-
641
- llvm::Value *IsAfterFirstTeam = Builder.CreateICmpUGE (
642
- WorkGroupId, llvm::ConstantInt::get (Int32Ty, 1 )); // TeamID >= 1
643
- llvm::BasicBlock *IsAfterFirstTeamThenBlock =
644
- createBasicBlock (" omp.is.after.first.team.then" );
645
- Builder.CreateCondBr (IsAfterFirstTeam, IsAfterFirstTeamThenBlock,
646
- ExclusiveScanEndBlock);
647
- EmitBlock (IsAfterFirstTeamThenBlock);
648
- llvm::Value *IsNotFirstThreadInTeam = Builder.CreateICmpUGE (
649
- GpuThreadId, llvm::ConstantInt::get (Int32Ty, 1 )); // LocalTID >= 1
650
- llvm::BasicBlock *IsNotFirstThreadInTeamThenBlock =
651
- createBasicBlock (" omp.is.not.first.thread.in.team.then" );
652
- llvm::BasicBlock *IsNotFirstThreadInTeamElseBlock =
653
- createBasicBlock (" omp.is.not.first.thread.in.team.else" );
654
- Builder.CreateCondBr (IsNotFirstThreadInTeam,
655
- IsNotFirstThreadInTeamThenBlock,
656
- IsNotFirstThreadInTeamElseBlock);
657
- EmitBlock (IsNotFirstThreadInTeamThenBlock);
658
- Address PrevTeamValGEP =
659
- Address (Builder.CreateGEP (
660
- RedVarType, DTeamVals,
661
- Builder.CreateSub (WorkGroupId,
662
- llvm::ConstantInt::get (Int32Ty, 1 ))),
663
- RedVarType,
664
- getContext ().getTypeAlignInChars (
665
- XteamVD->getType ())); // TeamVals[TeamID - 1]
666
- Builder.CreateStore (Builder.CreateAdd (Builder.CreateLoad (RVI.RedVarAddr ),
667
- Builder.CreateLoad (PrevTeamValGEP)),
668
- RVI.RedVarAddr ); // RedVar += TeamVals[TeamID - 1]
669
- EmitBranch (ExclusiveScanEndBlock);
670
- EmitBlock (IsNotFirstThreadInTeamElseBlock);
671
- llvm::Value *IsAfterSecondTeam = Builder.CreateICmpUGE (
672
- WorkGroupId, llvm::ConstantInt::get (Int32Ty, 2 )); // TeamID >= 2
673
- llvm::BasicBlock *IsAfterSecondTeamThenBlock =
674
- createBasicBlock (" omp.is.after.second.team.then" );
675
- Builder.CreateCondBr (IsAfterSecondTeam, IsAfterSecondTeamThenBlock,
676
- ExclusiveScanEndBlock);
677
- EmitBlock (IsAfterSecondTeamThenBlock);
678
- Address PrevPrevTeamValGEP =
679
- Address (Builder.CreateGEP (
680
- RedVarType, DTeamVals,
681
- Builder.CreateSub (WorkGroupId,
682
- llvm::ConstantInt::get (Int32Ty, 2 ))),
683
- RedVarType,
684
- getContext ().getTypeAlignInChars (
685
- XteamVD->getType ())); // TeamVals[TeamID - 2]
686
- Builder.CreateStore (
687
- Builder.CreateAdd (Builder.CreateLoad (RVI.RedVarAddr ),
688
- Builder.CreateLoad (PrevPrevTeamValGEP)),
689
- RVI.RedVarAddr ); // RedVar += TeamVals[TeamID - 2]
690
- EmitBranch (ExclusiveScanEndBlock);
691
- EmitBlock (ExclusiveScanEndBlock);
692
- }
567
+ EmitXteamScanPhaseTwo (
568
+ CapturedForStmt, /* SegmentSize=*/ Builder.getInt32 (1 ), *Args,
569
+ CGM.getXteamRedBlockSize (D),
570
+ CGM.OMPPresentScanDirective ->hasClausesOfKind <OMPInclusiveClause>());
571
+
572
+ // Emit: RedVar = Storage[TotalNumThreads + GlobalTID]
573
+ // Offsetting by TotalNumThreads indexes into the second half of the
574
+ // Storage[] data structure.
575
+ llvm::Value *StorageOffset =
576
+ Builder.CreateAdd (GlobalGpuThreadId, TotalNumThreads);
577
+ Address ScanStorageValGEP = Address (
578
+ Builder.CreateGEP (RedVarType, DScanStorage, StorageOffset), RedVarType,
579
+ getContext ().getTypeAlignInChars (
580
+ XteamVD->getType ())); // Storage[Offset + GlobalTID]
581
+ Builder.CreateStore (Builder.CreateLoad (ScanStorageValGEP), RVI.RedVarAddr );
693
582
}
694
583
695
584
// After the 'scanned' results are put in the respective private copies, the
@@ -949,8 +838,17 @@ void CodeGenFunction::EmitXteamScanPhaseTwo(const ForStmt *FStmt,
949
838
Address XteamRedSumArg2 = GetAddrOfLocalVar (Args[RVI.ArgPos + 2 ]);
950
839
llvm::Value *DScanStorage = Builder.CreateLoad (XteamRedSumArg2);
951
840
952
- Address XteamRedSumArg3 = GetAddrOfLocalVar (Args[RVI.ArgPos + 3 ]);
953
- llvm::Value *DSegmentVals = Builder.CreateLoad (XteamRedSumArg3);
841
+ llvm::Value *DSegmentVals = nullptr ;
842
+ if (CGM.isXteamSegmentedScanKernel ()) {
843
+ Address XteamRedSumArg3 = GetAddrOfLocalVar (Args[RVI.ArgPos + 3 ]);
844
+ DSegmentVals = Builder.CreateLoad (XteamRedSumArg3);
845
+ } else {
846
+ // For No-Loop Scan, SegmentVals[] is not required and therefore was never
847
+ // created. To reuse the same kmpc_xteams_phase2* API for Phase 2 of the
848
+ // scan, the Storage[] pointer is passed in its place as a dummy
849
+ // argument.
850
+ DSegmentVals = DScanStorage;
851
+ }
954
852
955
853
const Expr *OrigRedVarExpr = RVI.RedVarExpr ;
956
854
const DeclRefExpr *DRE = cast<DeclRefExpr>(OrigRedVarExpr);
0 commit comments