Skip to content

Commit 54893bc

Browse files
authored
Only force-load child partitions after successful initialization (#8230)
## What changed? The force-load child partitions mechanism should only happen after successful initialization of the root. ## Why? If the root fails to load, things can get stuck in a loop where the root loads the children and the children cause the root to be loaded again (from userdata polling). ## How did you test it? - [x] built - [ ] run locally and tested manually - [ ] covered by existing tests - [ ] added new unit test(s) - [ ] added new functional test(s)
1 parent 251e20a commit 54893bc

File tree

2 files changed

+15
-13
lines changed

2 files changed

+15
-13
lines changed

service/matching/matching_engine.go

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -383,13 +383,26 @@ func (e *matchingEngineImpl) getTaskQueuePartitionManager(
383383
create bool,
384384
loadCause loadCause,
385385
) (retPM taskQueuePartitionManager, retCreated bool, retErr error) {
386+
var newPM *taskQueuePartitionManagerImpl
387+
386388
defer func() {
387389
if retErr != nil || retPM == nil {
388390
return
389391
}
390392

391393
if retErr = retPM.WaitUntilInitialized(ctx); retErr != nil {
392394
e.unloadTaskQueuePartition(retPM, unloadCauseInitError)
395+
return
396+
}
397+
398+
if retCreated {
399+
// Whenever a root partition is loaded, we need to force all child partitions to load.
400+
// If there is a backlog of tasks on any child partitions, force loading will ensure
401+
// that they can forward their tasks the poller which caused the root partition to be
402+
// loaded. These partitions could be managed by this matchingEngineImpl, but are most
403+
// likely not. We skip checking and just make gRPC requests to force loading them all.
404+
// Note that if retCreated is true, retPM must be newPM, so we can use newPM here.
405+
newPM.ForceLoadAllChildPartitions()
393406
}
394407
}()
395408

@@ -414,7 +427,6 @@ func (e *matchingEngineImpl) getTaskQueuePartitionManager(
414427
tqConfig := newTaskQueueConfig(partition.TaskQueue(), e.config, nsName)
415428
tqConfig.loadCause = loadCause
416429
logger, throttledLogger, metricsHandler := e.loggerAndMetricsForPartition(nsName, partition, tqConfig)
417-
var newPM *taskQueuePartitionManagerImpl
418430
onFatalErr := func(cause unloadCause) { newPM.unloadFromEngine(cause) }
419431
onUserDataChanged := func() { newPM.userDataChanged() }
420432
userDataManager := newUserDataManager(
@@ -453,15 +465,6 @@ func (e *matchingEngineImpl) getTaskQueuePartitionManager(
453465
e.partitionsLock.Unlock()
454466

455467
newPM.Start()
456-
if newPM.Partition().IsRoot() {
457-
// Whenever a root partition is loaded we need to force all other partitions to load.
458-
// If there is a backlog of tasks on any child partitions force loading will ensure that they
459-
// can forward their tasks the poller which caused the root partition to be loaded.
460-
// These partitions could be managed by this matchingEngineImpl, but are most likely not.
461-
// We skip checking and just make gRPC requests to force loading them all.
462-
463-
newPM.ForceLoadAllNonRootPartitions()
464-
}
465468
return newPM, true, nil
466469
}
467470

service/matching/task_queue_partition_manager.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -738,10 +738,9 @@ func (pm *taskQueuePartitionManagerImpl) callerInfoContext(ctx context.Context)
738738
return headers.SetCallerInfo(ctx, headers.NewBackgroundHighCallerInfo(pm.ns.Name().String()))
739739
}
740740

741-
// ForceLoadAllNonRootPartitions spins off go routines which make RPC calls to all the
742-
func (pm *taskQueuePartitionManagerImpl) ForceLoadAllNonRootPartitions() {
741+
// ForceLoadAllChildPartitions spins off go routines which make RPC calls to all the
742+
func (pm *taskQueuePartitionManagerImpl) ForceLoadAllChildPartitions() {
743743
if !pm.partition.IsRoot() {
744-
pm.logger.Info("ForceLoadAllNonRootPartitions called on non-root partition. Prevented circular keep alive (loading) of partitions.")
745744
return
746745
}
747746

0 commit comments

Comments
 (0)