Skip to content

Commit d3e583f

Browse files
Synchronize blit enqueue after barrier call
Change-Id: I0349dc5b1581ecb142bdab881877450366bcdb86
Signed-off-by: Dunajski, Bartosz <[email protected]>
Related-To: NEO-3020
1 parent 0f122fb commit d3e583f

12 files changed

+187
-24
lines changed

runtime/command_queue/command_queue_hw.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,7 @@ class CommandQueueHw : public CommandQueue {
344344
size_t surfacesCount,
345345
const MultiDispatchInfo &multiDispatchInfo,
346346
TimestampPacketContainer *previousTimestampPacketNodes,
347+
TimestampPacketContainer &barrierTimestampPacketNode,
347348
std::unique_ptr<KernelOperation> &blockedCommandsData,
348349
const EnqueueProperties &enqueueProperties,
349350
EventsRequest &eventsRequest,
@@ -357,6 +358,7 @@ class CommandQueueHw : public CommandQueue {
357358
bool &blocking,
358359
const EnqueueProperties &enqueueProperties,
359360
TimestampPacketContainer *previousTimestampPacketNodes,
361+
const TimestampPacketContainer &barrierTimestampPacketNodes,
360362
EventsRequest &eventsRequest,
361363
EventBuilder &eventBuilder,
362364
uint32_t taskLevel);
@@ -366,6 +368,7 @@ class CommandQueueHw : public CommandQueue {
366368
CsrDependencies &csrDeps);
367369
BlitProperties processDispatchForBlitEnqueue(const MultiDispatchInfo &multiDispatchInfo,
368370
TimestampPacketContainer &previousTimestampPacketNodes,
371+
TimestampPacketContainer &barrierTimestampPacketNode,
369372
const EventsRequest &eventsRequest,
370373
LinearStream &commandStream,
371374
uint32_t commandType, bool queueBlocked);

runtime/command_queue/enqueue_common.h

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
188188
}
189189

190190
TimestampPacketContainer previousTimestampPacketNodes;
191+
TimestampPacketContainer barrierTimestampPacketNode;
191192
EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
192193
CsrDependencies csrDeps;
193194
BlitProperties blitProperties;
@@ -202,6 +203,11 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
202203
nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
203204
}
204205

206+
if (blitEnqueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) {
207+
auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
208+
barrierTimestampPacketNode.add(allocator->getTag());
209+
}
210+
205211
if (nodesCount > 0) {
206212
obtainNewTimestampPacketNodes(nodesCount, previousTimestampPacketNodes, clearAllDependencies);
207213
csrDeps.push_back(&previousTimestampPacketNodes);
@@ -219,8 +225,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
219225
bool flushDependenciesForNonKernelCommand = false;
220226

221227
if (blitEnqueue) {
222-
blitProperties = processDispatchForBlitEnqueue(multiDispatchInfo, previousTimestampPacketNodes, eventsRequest, commandStream,
223-
commandType, blockQueue);
228+
blitProperties = processDispatchForBlitEnqueue(multiDispatchInfo, previousTimestampPacketNodes, barrierTimestampPacketNode,
229+
eventsRequest, commandStream, commandType, blockQueue);
224230
} else if (multiDispatchInfo.empty() == false) {
225231
processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
226232
hwTimeStamps, blockQueue, devQueueHw, csrDeps, blockedCommandsData.get(),
@@ -289,6 +295,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
289295
blocking,
290296
enqueueProperties,
291297
&previousTimestampPacketNodes,
298+
barrierTimestampPacketNode,
292299
eventsRequest,
293300
eventBuilder,
294301
taskLevel);
@@ -337,6 +344,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
337344
numSurfaceForResidency,
338345
multiDispatchInfo,
339346
&previousTimestampPacketNodes,
347+
barrierTimestampPacketNode,
340348
blockedCommandsData,
341349
enqueueProperties,
342350
eventsRequest,
@@ -431,6 +439,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
431439
template <typename GfxFamily>
432440
BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const MultiDispatchInfo &multiDispatchInfo,
433441
TimestampPacketContainer &previousTimestampPacketNodes,
442+
TimestampPacketContainer &barrierTimestampPacketNode,
434443
const EventsRequest &eventsRequest, LinearStream &commandStream,
435444
uint32_t commandType, bool queueBlocked) {
436445
auto blitDirection = BlitProperties::obtainBlitDirection(commandType);
@@ -444,6 +453,7 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const Mu
444453
CsrDependencies::DependenciesType::All);
445454

446455
blitProperties.csrDependencies.push_back(&previousTimestampPacketNodes);
456+
blitProperties.csrDependencies.push_back(&barrierTimestampPacketNode);
447457
}
448458

449459
blitProperties.outputTimestampPacket = timestampPacketContainer.get();
@@ -662,6 +672,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
662672

663673
DispatchFlags dispatchFlags(
664674
{}, //csrDependencies
675+
nullptr, //barrierTimestampPacketNodes
665676
{}, //pipelineSelectArgs
666677
this->flushStamp->getStampReference(), //flushStampReference
667678
getThrottle(), //throttle
@@ -723,6 +734,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
723734
size_t surfaceCount,
724735
const MultiDispatchInfo &multiDispatchInfo,
725736
TimestampPacketContainer *previousTimestampPacketNodes,
737+
TimestampPacketContainer &barrierTimestampPacketNode,
726738
std::unique_ptr<KernelOperation> &blockedCommandsData,
727739
const EnqueueProperties &enqueueProperties,
728740
EventsRequest &eventsRequest,
@@ -801,7 +813,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
801813
auto event = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
802814
event->incRefInternal();
803815
}
804-
command->setTimestampPacketNode(*timestampPacketContainer, *previousTimestampPacketNodes);
816+
command->setTimestampPacketNode(*timestampPacketContainer, *previousTimestampPacketNodes, barrierTimestampPacketNode);
805817
command->setEventsRequest(eventsRequest);
806818
}
807819
outEvent->setCommand(std::move(command));
@@ -826,6 +838,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
826838
bool &blocking,
827839
const EnqueueProperties &enqueueProperties,
828840
TimestampPacketContainer *previousTimestampPacketNodes,
841+
const TimestampPacketContainer &barrierTimestampPacketNodes,
829842
EventsRequest &eventsRequest,
830843
EventBuilder &eventBuilder,
831844
uint32_t taskLevel) {
@@ -853,6 +866,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
853866

854867
DispatchFlags dispatchFlags(
855868
{}, //csrDependencies
869+
&barrierTimestampPacketNodes, //barrierTimestampPacketNodes
856870
{}, //pipelineSelectArgs
857871
flushStamp->getStampReference(), //flushStampReference
858872
QueueThrottle::MEDIUM, //throttle

runtime/command_stream/command_stream_receiver.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ class CommandStreamReceiver {
125125
GraphicsAllocation *getPreemptionAllocation() const { return preemptionAllocation; }
126126
void requestThreadArbitrationPolicy(uint32_t requiredPolicy) { this->requiredThreadArbitrationPolicy = requiredPolicy; }
127127
void requestStallingPipeControlOnNextFlush() { stallingPipeControlOnNextFlushRequired = true; }
128+
// Returns the current value of stallingPipeControlOnNextFlushRequired, the flag that
// requestStallingPipeControlOnNextFlush() sets to true.
bool isStallingPipeControlOnNextFlushRequired() const { return stallingPipeControlOnNextFlushRequired; }
128129

129130
virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
130131
MOCKABLE_VIRTUAL bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);

runtime/command_stream/command_stream_receiver_hw.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
8686
void programMediaSampler(LinearStream &csr, DispatchFlags &dispatchFlags);
8787
void programStateSip(LinearStream &cmdStream, Device &device);
8888
void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads);
89+
void programStallingPipeControlForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags);
8990
virtual void initPageTableManagerRegisters(LinearStream &csr){};
9091

9192
void addClearSLMWorkAround(typename GfxFamily::PIPE_CONTROL *pCmd);

runtime/command_stream/command_stream_receiver_hw_base.inl

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,11 +255,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
255255
TimestampPacketHelper::programCsrDependencies<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
256256

257257
if (stallingPipeControlOnNextFlushRequired) {
258-
stallingPipeControlOnNextFlushRequired = false;
259-
auto stallingPipeControlCmd = commandStream.getSpaceForCmd<PIPE_CONTROL>();
260-
*stallingPipeControlCmd = GfxFamily::cmdInitPipeControl;
261-
stallingPipeControlCmd->setCommandStreamerStallEnable(true);
258+
programStallingPipeControlForBarrier(commandStreamCSR, dispatchFlags);
262259
}
260+
263261
initPageTableManagerRegisters(commandStreamCSR);
264262
programPreemption(commandStreamCSR, dispatchFlags);
265263
programComputeMode(commandStreamCSR, dispatchFlags);
@@ -498,6 +496,27 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
498496
return completionStamp;
499497
}
500498

499+
template <typename GfxFamily>
500+
inline void CommandStreamReceiverHw<GfxFamily>::programStallingPipeControlForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags) {
501+
stallingPipeControlOnNextFlushRequired = false;
502+
503+
PIPE_CONTROL *stallingPipeControlCmd;
504+
auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes;
505+
506+
if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() != 0) {
507+
auto barrierTimestampPacketGpuAddress = dispatchFlags.barrierTimestampPacketNodes->peekNodes()[0]->getGpuAddress() +
508+
offsetof(TimestampPacketStorage, packets[0].contextEnd);
509+
510+
stallingPipeControlCmd = PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(
511+
cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
512+
barrierTimestampPacketGpuAddress, 0, false, peekHwInfo());
513+
} else {
514+
stallingPipeControlCmd = PipeControlHelper<GfxFamily>::addPipeControl(cmdStream, false);
515+
}
516+
517+
stallingPipeControlCmd->setCommandStreamerStallEnable(true);
518+
}
519+
501520
template <typename GfxFamily>
502521
inline void CommandStreamReceiverHw<GfxFamily>::flushBatchedSubmissions() {
503522
if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
@@ -624,8 +643,14 @@ size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSize(const Dispat
624643
size += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(dispatchFlags.csrDependencies);
625644

626645
if (stallingPipeControlOnNextFlushRequired) {
627-
size += sizeof(typename GfxFamily::PIPE_CONTROL);
646+
auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes;
647+
if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() > 0) {
648+
size += PipeControlHelper<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(peekHwInfo());
649+
} else {
650+
size += sizeof(typename GfxFamily::PIPE_CONTROL);
651+
}
628652
}
653+
629654
if (requiresInstructionCacheFlush) {
630655
size += sizeof(typename GfxFamily::PIPE_CONTROL);
631656
}

runtime/command_stream/csr_definitions.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,13 @@ constexpr uint32_t l3AndL1On = 2u;
4444

4545
struct DispatchFlags {
4646
DispatchFlags() = delete;
47-
DispatchFlags(CsrDependencies csrDependencies, PipelineSelectArgs pipelineSelectArgs, FlushStampTrackingObj *flushStampReference,
48-
QueueThrottle throttle, PreemptionMode preemptionMode, uint32_t numGrfRequired,
47+
DispatchFlags(CsrDependencies csrDependencies, const TimestampPacketContainer *barrierTimestampPacketNodes, PipelineSelectArgs pipelineSelectArgs,
48+
FlushStampTrackingObj *flushStampReference, QueueThrottle throttle, PreemptionMode preemptionMode, uint32_t numGrfRequired,
4949
uint32_t l3CacheSettings, uint64_t sliceCount, bool blocking, bool dcFlush,
5050
bool useSLM, bool guardCommandBufferWithPipeControl, bool gsba32BitRequired,
5151
bool requiresCoherency, bool lowPriority, bool implicitFlush,
5252
bool outOfOrderExecutionAllowed, bool multiEngineQueue, bool epilogueRequired) : csrDependencies(csrDependencies),
53+
barrierTimestampPacketNodes(barrierTimestampPacketNodes),
5354
pipelineSelectArgs(pipelineSelectArgs),
5455
flushStampReference(flushStampReference),
5556
throttle(throttle),
@@ -69,6 +70,7 @@ struct DispatchFlags {
6970
multiEngineQueue(multiEngineQueue),
7071
epilogueRequired(epilogueRequired){};
7172
CsrDependencies csrDependencies;
73+
const TimestampPacketContainer *barrierTimestampPacketNodes = nullptr;
7274
PipelineSelectArgs pipelineSelectArgs;
7375
FlushStampTrackingObj *flushStampReference = nullptr;
7476
QueueThrottle throttle = QueueThrottle::MEDIUM;

runtime/helpers/task_information.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
4646

4747
DispatchFlags dispatchFlags(
4848
{}, //csrDependencies
49+
nullptr, //barrierTimestampPacketNodes
4950
{}, //pipelineSelectArgs
5051
commandQueue.flushStamp->getStampReference(), //flushStampReference
5152
commandQueue.getThrottle(), //throttle
@@ -188,6 +189,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
188189

189190
DispatchFlags dispatchFlags(
190191
{}, //csrDependencies
192+
nullptr, //barrierTimestampPacketNodes
191193
{false, kernel->isVmeKernel()}, //pipelineSelectArgs
192194
commandQueue.flushStamp->getStampReference(), //flushStampReference
193195
commandQueue.getThrottle(), //throttle
@@ -252,6 +254,7 @@ void CommandWithoutKernel::dispatchBlitOperation() {
252254
auto &blitProperties = kernelOperation->blitProperties;
253255
blitProperties.csrDependencies.fillFromEventsRequest(eventsRequest, *bcsCsr, CsrDependencies::DependenciesType::All);
254256
blitProperties.csrDependencies.push_back(previousTimestampPacketNodes.get());
257+
blitProperties.csrDependencies.push_back(barrierTimestampPacketNodes.get());
255258
blitProperties.outputTimestampPacket = currentTimestampPacketNodes.get();
256259

257260
bcsCsr->blitBuffer(blitProperties);
@@ -280,6 +283,7 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
280283

281284
DispatchFlags dispatchFlags(
282285
{}, //csrDependencies
286+
barrierTimestampPacketNodes.get(), //barrierTimestampPacketNodes
283287
{}, //pipelineSelectArgs
284288
commandQueue.flushStamp->getStampReference(), //flushStampReference
285289
commandQueue.getThrottle(), //throttle
@@ -330,12 +334,15 @@ void Command::setEventsRequest(EventsRequest &eventsRequest) {
330334
}
331335
}
332336

333-
void Command::setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &previous) {
337+
// Replaces the command's current, previous and barrier timestamp packet containers with
// freshly created ones, each populated from the matching argument through
// assignAndIncrementNodesRefCounts().
void Command::setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &previous, TimestampPacketContainer &barrier) {
    // Same two steps for every slot: install a new empty container, then fill it from source.
    auto resetFrom = [](std::unique_ptr<TimestampPacketContainer> &slot, TimestampPacketContainer &source) {
        slot = std::make_unique<TimestampPacketContainer>();
        slot->assignAndIncrementNodesRefCounts(source);
    };

    resetFrom(currentTimestampPacketNodes, current);
    resetFrom(previousTimestampPacketNodes, previous);
    resetFrom(barrierTimestampPacketNodes, barrier);
}
340347

341348
Command::~Command() {
@@ -364,6 +371,9 @@ void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamR
364371
if (previousTimestampPacketNodes) {
365372
previousTimestampPacketNodes->makeResident(commandStreamReceiver);
366373
}
374+
if (barrierTimestampPacketNodes) {
375+
barrierTimestampPacketNodes->makeResident(commandStreamReceiver);
376+
}
367377
}
368378

369379
Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {}

runtime/helpers/task_information.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class Command : public IFNode<Command> {
9494
virtual LinearStream *getCommandStream() {
9595
return nullptr;
9696
}
97-
void setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &previous);
97+
void setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &previous, TimestampPacketContainer &barrier);
9898
void setEventsRequest(EventsRequest &eventsRequest);
9999
void makeTimestampPacketsResident(CommandStreamReceiver &commandStreamReceiver);
100100

@@ -106,6 +106,7 @@ class Command : public IFNode<Command> {
106106
std::unique_ptr<KernelOperation> kernelOperation;
107107
std::unique_ptr<TimestampPacketContainer> currentTimestampPacketNodes;
108108
std::unique_ptr<TimestampPacketContainer> previousTimestampPacketNodes;
109+
std::unique_ptr<TimestampPacketContainer> barrierTimestampPacketNodes;
109110
EventsRequest eventsRequest = {0, nullptr, nullptr};
110111
std::vector<cl_event> eventsWaitlist;
111112
};

0 commit comments

Comments
 (0)