Skip to content

Commit 9c965bd

Browse files
authored
Merge pull request llvm#122 from AMD-Lightning-Internal/amd/dev/animkuma/xteam-noloop-types-rebase
[OpenMP][Clang] Extend No-Loop Scan for generic datatypes
2 parents 871a8a3 + ff10679 commit 9c965bd

File tree

6 files changed

+2722
-500
lines changed

6 files changed

+2722
-500
lines changed

clang/lib/CodeGen/CGStmt.cpp

Lines changed: 34 additions & 136 deletions
Original file line number | Diff line number | Diff line change
@@ -440,7 +440,8 @@ void CodeGenFunction::EmitNoLoopXteamScanInit(const OMPLoopDirective &LD,
440440
const FunctionArgList *Args,
441441
llvm::Value *&GpuThreadId,
442442
llvm::Value *&GlobalGpuThreadId,
443-
llvm::Value *&WorkGroupId) {
443+
llvm::Value *&WorkGroupId,
444+
llvm::Value *&TotalNumThreads) {
444445
auto IVPair = EmitNoLoopIV(LD, Args);
445446
Address OMPIterationVarAddr = IVPair.second;
446447

@@ -468,6 +469,8 @@ void CodeGenFunction::EmitNoLoopXteamScanInit(const OMPLoopDirective &LD,
468469
CGM.updateXteamRedKernel(
469470
CapturedForStmt, Builder.CreateIntCast(OMPIterationVar, Int64Ty, false),
470471
NumTeams);
472+
TotalNumThreads =
473+
Builder.CreateMul(NumTeams, WorkGroupSize, "total_num_threads");
471474
Builder.CreateStore(OMPIterationVar, OMPIterationVarAddr);
472475

473476
// Emit updates of the original loop indices
@@ -488,8 +491,9 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseOneCode(
488491
llvm::Value *GpuThreadId = nullptr;
489492
llvm::Value *GlobalGpuThreadId = nullptr;
490493
llvm::Value *WorkGroupId = nullptr;
494+
llvm::Value *TotalNumThreads = nullptr;
491495
EmitNoLoopXteamScanInit(LD, CapturedForStmt, Args, GpuThreadId,
492-
GlobalGpuThreadId, WorkGroupId);
496+
GlobalGpuThreadId, WorkGroupId, TotalNumThreads);
493497

494498
// Branch to end if original loop condition not satisfied
495499
llvm::Value *IvCmp = EvaluateExprAsBool(LD.getCond());
@@ -539,8 +543,9 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode(
539543
llvm::Value *GpuThreadId = nullptr;
540544
llvm::Value *GlobalGpuThreadId = nullptr;
541545
llvm::Value *WorkGroupId = nullptr;
546+
llvm::Value *TotalNumThreads = nullptr;
542547
EmitNoLoopXteamScanInit(LD, CapturedForStmt, Args, GpuThreadId,
543-
GlobalGpuThreadId, WorkGroupId);
548+
GlobalGpuThreadId, WorkGroupId, TotalNumThreads);
544549

545550
const CodeGenModule::XteamRedVarMap &RedVarMap =
546551
CGM.getXteamRedVarMap(CapturedForStmt);
@@ -559,137 +564,21 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode(
559564
Address XteamRedSumArg3 = GetAddrOfLocalVar((*Args)[RVI.ArgPos + 2]);
560565
llvm::Value *DScanStorage = Builder.CreateLoad(XteamRedSumArg3);
561566

562-
// TODO: Extract a DeviceRTL function out of the PhaseTwo of Xteam Scan
563-
// codegen.
564-
if (CGM.OMPPresentScanDirective->hasClausesOfKind<OMPInclusiveClause>()) {
565-
// Handle the redistribution of cross-team scan result inside every
566-
// constituent team member by emitting this -
567-
// RedVar = Storage[GlobalTID]
568-
// if(TeamID >= 1)
569-
// {
570-
// RedVar += TeamVals[TeamID - 1]
571-
// }
572-
Address ScanStorageValGEP = Address(
573-
Builder.CreateGEP(RedVarType, DScanStorage, GlobalGpuThreadId),
574-
RedVarType,
575-
getContext().getTypeAlignInChars(
576-
XteamVD->getType())); // Storage[GlobalTID]
577-
Builder.CreateStore(Builder.CreateLoad(ScanStorageValGEP),
578-
RVI.RedVarAddr); // RedVar = Storage[GlobalTID]
579-
llvm::Value *IsAfterFirstTeam = Builder.CreateICmpUGE(
580-
WorkGroupId, llvm::ConstantInt::get(Int32Ty, 1)); // TeamID >= 1
581-
llvm::BasicBlock *IsAfterFirstTeamThenBlock =
582-
createBasicBlock("omp.is.after.first.team.then");
583-
llvm::BasicBlock *InclusiveScanEndBlock =
584-
createBasicBlock("omp.xteam.inclusive.scan.end");
585-
Builder.CreateCondBr(IsAfterFirstTeam, IsAfterFirstTeamThenBlock,
586-
InclusiveScanEndBlock);
587-
EmitBlock(IsAfterFirstTeamThenBlock);
588-
Address PrevTeamValGEP =
589-
Address(Builder.CreateGEP(
590-
RedVarType, DTeamVals,
591-
Builder.CreateSub(WorkGroupId,
592-
llvm::ConstantInt::get(Int32Ty, 1))),
593-
RedVarType,
594-
getContext().getTypeAlignInChars(
595-
XteamVD->getType())); // TeamVals[TeamID - 1]
596-
Builder.CreateStore(Builder.CreateAdd(Builder.CreateLoad(RVI.RedVarAddr),
597-
Builder.CreateLoad(PrevTeamValGEP)),
598-
RVI.RedVarAddr); // RedVar += TeamVals[TeamID - 1]
599-
EmitBranch(InclusiveScanEndBlock);
600-
EmitBlock(InclusiveScanEndBlock);
601-
} else {
602-
// Redistribution for the 'exclusive' scan is handled differently because
603-
// each work-item accesses the temporary output 'Storage' at the index
604-
// before its own global thread id (GlobalTID). Emits the following -
605-
// RedVar = 0
606-
// if(GlobalTID >= 1)
607-
// {
608-
// RedVar = Storage[GlobalTID - 1]
609-
// if(TeamID >= 1)
610-
// {
611-
// if(localTID >= 1)
612-
// RedVar += TeamVals[TeamID - 1];
613-
// else if(TeamID >= 2)
614-
// RedVar += TeamVals[TeamID - 2];
615-
// }
616-
// }
617-
618-
Builder.CreateStore(llvm::ConstantInt::get(RedVarType, 0),
619-
RVI.RedVarAddr); // RedVar = 0
620-
llvm::Value *IsNotFirstThread = Builder.CreateICmpUGE(
621-
GlobalGpuThreadId,
622-
llvm::ConstantInt::get(Int32Ty, 1)); // GlobalTID >= 1
623-
llvm::BasicBlock *IsNotFirstThreadThenBlock =
624-
createBasicBlock("omp.is.not.first.thread.then");
625-
llvm::BasicBlock *ExclusiveScanEndBlock =
626-
createBasicBlock("omp.xteam.exclusive.scan.end");
627-
Builder.CreateCondBr(IsNotFirstThread, IsNotFirstThreadThenBlock,
628-
ExclusiveScanEndBlock);
629-
EmitBlock(IsNotFirstThreadThenBlock);
630-
llvm::Value *PrevGlobalGpuThreadId = Builder.CreateSub(
631-
GlobalGpuThreadId,
632-
llvm::ConstantInt::get(Int32Ty, 1)); // GlobalTID - 1
633-
Address ScanStoragePrevValGEP = Address(
634-
Builder.CreateGEP(RedVarType, DScanStorage, PrevGlobalGpuThreadId),
635-
RedVarType,
636-
getContext().getTypeAlignInChars(
637-
XteamVD->getType())); // Storage[GlobalTID - 1]
638-
Builder.CreateStore(Builder.CreateLoad(ScanStoragePrevValGEP),
639-
RVI.RedVarAddr); // RedVar = Storage[GlobalTID - 1]
640-
641-
llvm::Value *IsAfterFirstTeam = Builder.CreateICmpUGE(
642-
WorkGroupId, llvm::ConstantInt::get(Int32Ty, 1)); // TeamID >= 1
643-
llvm::BasicBlock *IsAfterFirstTeamThenBlock =
644-
createBasicBlock("omp.is.after.first.team.then");
645-
Builder.CreateCondBr(IsAfterFirstTeam, IsAfterFirstTeamThenBlock,
646-
ExclusiveScanEndBlock);
647-
EmitBlock(IsAfterFirstTeamThenBlock);
648-
llvm::Value *IsNotFirstThreadInTeam = Builder.CreateICmpUGE(
649-
GpuThreadId, llvm::ConstantInt::get(Int32Ty, 1)); // LocalTID >= 1
650-
llvm::BasicBlock *IsNotFirstThreadInTeamThenBlock =
651-
createBasicBlock("omp.is.not.first.thread.in.team.then");
652-
llvm::BasicBlock *IsNotFirstThreadInTeamElseBlock =
653-
createBasicBlock("omp.is.not.first.thread.in.team.else");
654-
Builder.CreateCondBr(IsNotFirstThreadInTeam,
655-
IsNotFirstThreadInTeamThenBlock,
656-
IsNotFirstThreadInTeamElseBlock);
657-
EmitBlock(IsNotFirstThreadInTeamThenBlock);
658-
Address PrevTeamValGEP =
659-
Address(Builder.CreateGEP(
660-
RedVarType, DTeamVals,
661-
Builder.CreateSub(WorkGroupId,
662-
llvm::ConstantInt::get(Int32Ty, 1))),
663-
RedVarType,
664-
getContext().getTypeAlignInChars(
665-
XteamVD->getType())); // TeamVals[TeamID - 1]
666-
Builder.CreateStore(Builder.CreateAdd(Builder.CreateLoad(RVI.RedVarAddr),
667-
Builder.CreateLoad(PrevTeamValGEP)),
668-
RVI.RedVarAddr); // RedVar += TeamVals[TeamID - 1]
669-
EmitBranch(ExclusiveScanEndBlock);
670-
EmitBlock(IsNotFirstThreadInTeamElseBlock);
671-
llvm::Value *IsAfterSecondTeam = Builder.CreateICmpUGE(
672-
WorkGroupId, llvm::ConstantInt::get(Int32Ty, 2)); // TeamID >= 2
673-
llvm::BasicBlock *IsAfterSecondTeamThenBlock =
674-
createBasicBlock("omp.is.after.second.team.then");
675-
Builder.CreateCondBr(IsAfterSecondTeam, IsAfterSecondTeamThenBlock,
676-
ExclusiveScanEndBlock);
677-
EmitBlock(IsAfterSecondTeamThenBlock);
678-
Address PrevPrevTeamValGEP =
679-
Address(Builder.CreateGEP(
680-
RedVarType, DTeamVals,
681-
Builder.CreateSub(WorkGroupId,
682-
llvm::ConstantInt::get(Int32Ty, 2))),
683-
RedVarType,
684-
getContext().getTypeAlignInChars(
685-
XteamVD->getType())); // TeamVals[TeamID - 2]
686-
Builder.CreateStore(
687-
Builder.CreateAdd(Builder.CreateLoad(RVI.RedVarAddr),
688-
Builder.CreateLoad(PrevPrevTeamValGEP)),
689-
RVI.RedVarAddr); // RedVar += TeamVals[TeamID - 2]
690-
EmitBranch(ExclusiveScanEndBlock);
691-
EmitBlock(ExclusiveScanEndBlock);
692-
}
567+
EmitXteamScanPhaseTwo(
568+
CapturedForStmt, /*SegmentSize=*/Builder.getInt32(1), *Args,
569+
CGM.getXteamRedBlockSize(D),
570+
CGM.OMPPresentScanDirective->hasClausesOfKind<OMPInclusiveClause>());
571+
572+
// Emit: RedVar = Storage[Offset + GlobalTID]
573+
// The offset is calculated to index into the second half of the Storage[]
574+
// data structure.
575+
llvm::Value *StorageOffset =
576+
Builder.CreateAdd(GlobalGpuThreadId, TotalNumThreads);
577+
Address ScanStorageValGEP = Address(
578+
Builder.CreateGEP(RedVarType, DScanStorage, StorageOffset), RedVarType,
579+
getContext().getTypeAlignInChars(
580+
XteamVD->getType())); // Storage[Offset + GlobalTID]
581+
Builder.CreateStore(Builder.CreateLoad(ScanStorageValGEP), RVI.RedVarAddr);
693582
}
694583

695584
// After the 'scanned' results are put in the respective private copies, the
@@ -949,8 +838,17 @@ void CodeGenFunction::EmitXteamScanPhaseTwo(const ForStmt *FStmt,
949838
Address XteamRedSumArg2 = GetAddrOfLocalVar(Args[RVI.ArgPos + 2]);
950839
llvm::Value *DScanStorage = Builder.CreateLoad(XteamRedSumArg2);
951840

952-
Address XteamRedSumArg3 = GetAddrOfLocalVar(Args[RVI.ArgPos + 3]);
953-
llvm::Value *DSegmentVals = Builder.CreateLoad(XteamRedSumArg3);
841+
llvm::Value *DSegmentVals = nullptr;
842+
if (CGM.isXteamSegmentedScanKernel()) {
843+
Address XteamRedSumArg3 = GetAddrOfLocalVar(Args[RVI.ArgPos + 3]);
844+
DSegmentVals = Builder.CreateLoad(XteamRedSumArg3);
845+
} else {
846+
// For No-Loop Scan, the SegmentVals[] is not required and therefore was
847+
// not created in the first place. Here we want to use the same
848+
// kmpc_xteams_phase2* API to compute Phase 2 of scan, therefore we're
849+
// passing the pointer of Storage[] as a dummy ptr.
850+
DSegmentVals = DScanStorage;
851+
}
954852

955853
const Expr *OrigRedVarExpr = RVI.RedVarExpr;
956854
const DeclRefExpr *DRE = cast<DeclRefExpr>(OrigRedVarExpr);

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3587,7 +3587,8 @@ class CodeGenFunction : public CodeGenTypeCache {
35873587
const FunctionArgList *Args,
35883588
llvm::Value *&GpuThreadId,
35893589
llvm::Value *&GlobalGpuThreadId,
3590-
llvm::Value *&WorkGroupId);
3590+
llvm::Value *&WorkGroupId,
3591+
llvm::Value *&TotalNumThreads);
35913592

35923593
void EmitNoLoopXteamScanPhaseOneCode(const OMPExecutableDirective &D,
35933594
const ForStmt *CapturedForStmt,

0 commit comments

Comments
 (0)