
Commit 669a545: Fix throughput_stress resume (#195)
## What was changed

Fix resume bug.

## Why?

The `OnCompletion` hook wasn't always invoked properly, leading to under-counting of the completed iterations.
1 parent 8f5b185 · commit 669a545
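The fix in `loadgen/generic_executor.go` below buffers the iteration-result channel and moves the completion reporting (timer record, `doneCh` send, `OnCompletion` call) into a `defer`, so it runs on every exit path of the iteration goroutine rather than only after the retry loop. A minimal, runnable sketch of that pattern; the names `runIteration` and `work` are illustrative and not taken from the repo:

```go
package main

import (
	"context"
	"fmt"
)

// runIteration mirrors the defer-based completion reporting used by the fix.
// All names here are illustrative; this is not the omes executor code.
func runIteration(ctx context.Context, doneCh chan error, work func(context.Context) error, onCompletion func()) {
	go func() {
		var err error
		// Deferring the report guarantees the iteration is accounted for
		// (and onCompletion fired on success) no matter how the goroutine exits.
		defer func() {
			select {
			case <-ctx.Done():
			case doneCh <- err:
				if err == nil && onCompletion != nil {
					onCompletion()
				}
			}
		}()
		err = work(ctx) // retry/backoff loop elided
	}()
}

func main() {
	ctx := context.Background()
	doneCh := make(chan error, 1) // buffered, like doneCh in the fixed executor
	runIteration(ctx, doneCh, func(context.Context) error { return nil }, func() {
		fmt.Println("iteration completed")
	})
	if err := <-doneCh; err != nil {
		fmt.Println("iteration failed:", err)
	}
}
```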

File tree: 9 files changed (+105 -63 lines)

loadgen/generic_executor.go

Lines changed: 21 additions & 14 deletions
```diff
@@ -76,7 +76,7 @@ func (g *genericRun) Run(ctx context.Context) error {
 
 	startTime := time.Now()
 	var runErr error
-	doneCh := make(chan error)
+	doneCh := make(chan error, g.config.MaxConcurrent)
 	var currentlyRunning int
 	waitOne := func(contextToWaitOn context.Context) {
 		select {
@@ -125,11 +125,26 @@ func (g *genericRun) Run(ctx context.Context) error {
 		run := g.info.NewRun(i + 1)
 		go func() {
 			var err error
-			startTime := time.Now()
+			iterStart := time.Now()
+
+			defer func() {
+				g.executeTimer.Record(time.Since(iterStart))
+
+				select {
+				case <-ctx.Done():
+				case doneCh <- err:
+					if err == nil && g.config.OnCompletion != nil {
+						g.config.OnCompletion(ctx, run)
+					}
+				}
+			}()
 
 		retryLoop:
 			for {
 				err = g.executor.Execute(ctx, run)
+				if err != nil && g.config.HandleExecuteError != nil {
+					err = g.config.HandleExecuteError(ctx, run, err)
+				}
 				if err == nil {
 					break
 				}
@@ -146,18 +161,9 @@ func (g *genericRun) Run(ctx context.Context) error {
 
 				select {
 				case <-time.After(backoff):
+					// wait for backoff, then try again
 				case <-ctx.Done():
-					break retryLoop // just fall through to next select
-				}
-			}
-
-			select {
-			case <-ctx.Done():
-			case doneCh <- err:
-				g.executeTimer.Record(time.Since(startTime))
-
-				if err == nil && g.config.OnCompletion != nil {
-					g.config.OnCompletion(ctx, run)
+					break retryLoop
 				}
 			}
 		}()
@@ -166,7 +172,8 @@ func (g *genericRun) Run(ctx context.Context) error {
 	// Wait for all to be done or an error to occur. We will wait past the overall duration for
 	// executions to complete. It is expected that whatever is running omes may choose to enforce
 	// a hard timeout if waiting for started executions to complete exceeds a certain threshold.
-	g.logger.Info("Run cooldown: stopped starting new iterations; waiting for running ones to complete")
+	g.logger.Infof("Run cooldown: stopped starting new iterations and waiting for %d iterations to complete",
+		currentlyRunning)
 	for runErr == nil && currentlyRunning > 0 {
 		waitOne(ctx)
 		if ctx.Err() != nil {
```

loadgen/helpers.go

Lines changed: 33 additions & 17 deletions
```diff
@@ -20,8 +20,6 @@ func InitSearchAttribute(
 	info ScenarioInfo,
 	attributeName string,
 ) error {
-	info.Logger.Infof("Initialising Search Attribute %q", attributeName)
-
 	_, err := info.Client.OperatorService().AddSearchAttributes(ctx,
 		&operatorservice.AddSearchAttributesRequest{
 			Namespace: info.Namespace,
@@ -32,9 +30,9 @@ func InitSearchAttribute(
 	var deniedErr *serviceerror.PermissionDenied
 	var alreadyErr *serviceerror.AlreadyExists
 	if errors.As(err, &alreadyErr) {
-		info.Logger.Infof("Search Attribute %q already exists", attributeName)
+		info.Logger.Infof("Search Attribute %q not added: already exists", attributeName)
 	} else if err != nil {
-		info.Logger.Warnf("Failed to add Search Attribute %q: %v", attributeName, err)
+		info.Logger.Warnf("Search Attribute %q not added: %v", attributeName, err)
 		if !errors.As(err, &deniedErr) {
 			return err
 		}
@@ -45,8 +43,6 @@ func InitSearchAttribute(
 	return nil
 }
 
-// MinVisibilityCountEventually checks that the given visibility query returns at least the expected
-// number of workflows. It repeatedly queries until it either finds the expected count or times out.
 func MinVisibilityCountEventually(
 	ctx context.Context,
 	info ScenarioInfo,
@@ -64,24 +60,44 @@ func MinVisibilityCountEventually(
 	defer printTicker.Stop()
 
 	var lastVisibilityCount int64
-	for {
+	done := false
+
+	check := func() error {
+		visibilityCount, err := info.Client.CountWorkflow(timeoutCtx, request)
+		if err != nil {
+			return fmt.Errorf("failed to count workflows in visibility: %w", err)
+		}
+		lastVisibilityCount = visibilityCount.Count
+		if lastVisibilityCount >= int64(minCount) {
+			done = true
+		}
+		return nil
+	}
+
+	// Initial check before entering the loop.
+	if err := check(); err != nil {
+		return err
+	}
+
+	// Loop until we reach the desired count or timeout.
+	for !done {
 		select {
 		case <-timeoutCtx.Done():
-			return fmt.Errorf("expected at least %d workflows in visibility, got %d after waiting %v",
-				minCount, lastVisibilityCount, waitAtMost)
+			return fmt.Errorf(
+				"expected at least %d workflows in visibility, got %d after waiting %v",
+				minCount, lastVisibilityCount, waitAtMost,
+			)
 
 		case <-printTicker.C:
-			info.Logger.Infof("current visibility count: %d (expected at least: %d)\n", lastVisibilityCount, minCount)
+			info.Logger.Infof("current visibility count: %d (expected at least: %d)\n",
+				lastVisibilityCount, minCount)
 
 		case <-countTicker.C:
-			visibilityCount, err := info.Client.CountWorkflow(ctx, request)
-			if err != nil {
-				return fmt.Errorf("failed to count workflows in visibility: %w", err)
-			}
-			lastVisibilityCount = visibilityCount.Count
-			if lastVisibilityCount >= int64(minCount) {
-				return nil
+			if err := check(); err != nil {
+				return err
			}
 		}
 	}
+
+	return nil
 }
```

loadgen/kitchen_sink_executor.go

Lines changed: 0 additions & 6 deletions
```diff
@@ -16,8 +16,6 @@ type KitchenSinkExecutor struct {
 	// Called for each iteration. TestInput is copied entirely into KitchenSinkWorkflowOptions on
 	// each iteration.
 	UpdateWorkflowOptions func(context.Context, *Run, *KitchenSinkWorkflowOptions) error
-
-	DefaultConfiguration RunConfiguration
 }
 
 func (k KitchenSinkExecutor) Run(ctx context.Context, info ScenarioInfo) error {
@@ -46,7 +44,3 @@ func (k KitchenSinkExecutor) Run(ctx context.Context, info ScenarioInfo) error {
 	}
 	return ge.Run(ctx, info)
 }
-
-func (k KitchenSinkExecutor) GetDefaultConfiguration() RunConfiguration {
-	return k.DefaultConfiguration
-}
```

loadgen/kitchen_sink_executor_test.go

Lines changed: 5 additions & 7 deletions
```diff
@@ -784,15 +784,13 @@ func testForSDK(
 
 	executor := &KitchenSinkExecutor{
 		TestInput: tc.testInput,
-		DefaultConfiguration: RunConfiguration{
-			Iterations: 1,
-		},
 	}
-
 	scenarioInfo := ScenarioInfo{
-		ScenarioName:  "kitchenSinkTest",
-		RunID:         fmt.Sprintf("%s-%d", t.Name(), time.Now().Unix()),
-		Configuration: executor.DefaultConfiguration,
+		ScenarioName: "kitchenSinkTest",
+		RunID:        fmt.Sprintf("%s-%d", t.Name(), time.Now().Unix()),
+		Configuration: RunConfiguration{
+			Iterations: 1,
+		},
 	}
 
 	if expectedErr, expectUnsupported := tc.expectedUnsupportedErrs[sdk]; expectUnsupported {
```

loadgen/scenario.go

Lines changed: 2 additions & 0 deletions
```diff
@@ -203,6 +203,8 @@ type RunConfiguration struct {
 	DoNotRegisterSearchAttributes bool
 	// OnCompletion, if set, is invoked after each successful iteration completes.
 	OnCompletion func(context.Context, *Run)
+	// HandleExecuteError, if set, is called when Execute returns an error, allowing transformation of errors.
+	HandleExecuteError func(context.Context, *Run, error) error
 }
 
 func (r *RunConfiguration) ApplyDefaults() {
```
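The new `HandleExecuteError` hook lets a scenario transform (or swallow) an `Execute` error before the executor treats the iteration as failed. A hedged sketch of how a scenario might wire it up, consistent with the throughput_stress usage further down; the helper name `tolerateAlreadyStarted` is illustrative and not from the repo:

```go
package scenarios

import (
	"context"
	"errors"

	"github.com/temporalio/omes/loadgen"
	"go.temporal.io/api/serviceerror"
)

// tolerateAlreadyStarted is an illustrative helper: it configures a scenario so
// that a "workflow execution already started" error is treated as success,
// which lets a resumed run skip iterations whose workflows already exist.
func tolerateAlreadyStarted(info *loadgen.ScenarioInfo) {
	info.Configuration.HandleExecuteError = func(ctx context.Context, run *loadgen.Run, err error) error {
		var alreadyStarted *serviceerror.WorkflowExecutionAlreadyStarted
		if errors.As(err, &alreadyStarted) {
			info.Logger.Warnf("workflow for iteration %d already exists, skipping", run.Iteration)
			return nil // iteration is counted as completed
		}
		return err // any other error still fails the iteration
	}
}
```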

scenarios/fixed_resource_consumption.go

Lines changed: 4 additions & 7 deletions
```diff
@@ -1,13 +1,14 @@
 package scenarios
 
 import (
+	"math"
+	"math/rand"
+	"time"
+
 	"github.com/temporalio/omes/loadgen"
 	"github.com/temporalio/omes/loadgen/kitchensink"
 	"go.temporal.io/api/common/v1"
 	"google.golang.org/protobuf/types/known/durationpb"
-	"math"
-	"math/rand"
-	"time"
 )
 
 // This scenario is meant to be adjusted and run manually to evaluate the performance of different
@@ -62,10 +63,6 @@ func init() {
 	loadgen.MustRegisterScenario(loadgen.Scenario{
 		Description: "Used for testing slot provider performance. Runs activities that consume certain amounts of resources.",
 		Executor: loadgen.KitchenSinkExecutor{
-			DefaultConfiguration: loadgen.RunConfiguration{
-				Iterations:    1,
-				MaxConcurrent: 1,
-			},
 			TestInput: &kitchensink.TestInput{
 				WorkflowInput: &kitchensink.WorkflowInput{
 					InitialActions: []*kitchensink.ActionSet{
```

scenarios/throughput_stress.go

Lines changed: 19 additions & 3 deletions
```diff
@@ -14,6 +14,7 @@ import (
 	. "github.com/temporalio/omes/loadgen/kitchensink"
 	"go.temporal.io/api/common/v1"
 	"go.temporal.io/api/enums/v1"
+	"go.temporal.io/api/serviceerror"
 	"go.temporal.io/api/workflowservice/v1"
 	"go.temporal.io/sdk/temporal"
 	"google.golang.org/protobuf/types/known/emptypb"
@@ -191,13 +192,27 @@ func (t *tpsExecutor) Run(ctx context.Context, info loadgen.ScenarioInfo) error
 	// Listen to iteration completion events to update the state.
 	info.Configuration.OnCompletion = func(ctx context.Context, run *loadgen.Run) {
 		t.updateStateOnIterationCompletion()
+		info.Logger.Debugf("Completed iteration %d", run.Iteration)
+	}
+
+	// When resuming, it can happen that the workflow for the current iteration already exists since the snapshot
+	// was not up-to-date. In that case, we just skip this iteration and move on.
+	info.Configuration.HandleExecuteError = func(ctx context.Context, run *loadgen.Run, err error) error {
+		if isResuming {
+			var alreadyStartedErr *serviceerror.WorkflowExecutionAlreadyStarted
+			if errors.As(err, &alreadyStartedErr) {
+				info.Logger.Warnf("after resume, workflow for iteration %d already exists", run.Iteration)
+				return nil
+			}
+		}
+		return err
 	}
 
 	// Start the scenario run.
 	//
-	// NOTE: When resuming, it can happen that there is no more time left to run more iterations. In that case,
-	// we skip the executor run and go straight to the post-scenario verification.
-	if isResuming && info.Configuration.Duration <= 0 {
+	// NOTE: When resuming, it can happen that there are no more iterations/time left to run more iterations.
+	// In that case, we skip the executor run and go straight to the post-scenario verification.
+	if isResuming && info.Configuration.Duration <= 0 && info.Configuration.Iterations == 0 {
 		info.Logger.Info("Skipping executor run: out of time")
 	} else {
 		ksExec := &loadgen.KitchenSinkExecutor{
@@ -316,6 +331,7 @@ func (t *tpsExecutor) updateStateOnIterationCompletion() {
 	defer t.lock.Unlock()
 	t.state.CompletedIterations += 1
 	t.state.LastCompletedIterationAt = time.Now()
+	fmt.Println("Updating state on iteration completion", t.state.CompletedIterations)
 }
 
 func (t *tpsExecutor) createActions(iteration int) []*ActionSet {
```

scenarios/throughput_stress_test.go

Lines changed: 18 additions & 6 deletions
```diff
@@ -19,14 +19,14 @@ func TestThroughputStress(t *testing.T) {
 	taskQueueName := loadgen.TaskQueueForRun(scenarioName, runID)
 
 	env := workers.SetupTestEnvironment(t,
-		workers.WithExecutorTimeout(2*time.Minute),
+		workers.WithExecutorTimeout(1*time.Minute),
 		workers.WithNexusEndpoint(taskQueueName))
 
 	scenarioInfo := loadgen.ScenarioInfo{
 		ScenarioName: scenarioName,
 		RunID:        runID,
 		Configuration: loadgen.RunConfiguration{
-			Iterations: 1,
+			Iterations: 2,
 		},
 		ScenarioOptions: map[string]string{
 			IterFlag: "2",
@@ -44,17 +44,29 @@ func TestThroughputStress(t *testing.T) {
 	require.NoError(t, err, "Executor should complete successfully")
 
 	state := executor.Snapshot().(tpsState)
-	require.Equal(t, state.CompletedIterations, 1)
+	require.Equal(t, state.CompletedIterations, 2)
 
-	t.Log("Start the executor again, pretending to resume")
+	t.Log("Start the executor again, resuming from middle")
 
 	err = executor.LoadState(func(v any) error {
 		s := v.(*tpsState)
-		s.CompletedIterations = state.CompletedIterations
+		s.CompletedIterations = 0 // execution will start from iteration 1
 		return nil
 	})
 	require.NoError(t, err)
 
 	err = env.RunExecutorTest(t, executor, scenarioInfo, cmdoptions.LangGo)
-	require.NoError(t, err, "Executor should complete successfully again")
+	require.NoError(t, err, "Executor should complete successfully when resuming from middle")
+
+	t.Log("Start the executor again, resuming from end")
+
+	err = executor.LoadState(func(v any) error {
+		s := v.(*tpsState)
+		s.CompletedIterations = s.CompletedIterations
+		return nil
+	})
+	require.NoError(t, err)
+
+	err = env.RunExecutorTest(t, executor, scenarioInfo, cmdoptions.LangGo)
+	require.NoError(t, err, "Executor should complete successfully when resuming from end")
 }
```

workers/test_env.go

Lines changed: 3 additions & 3 deletions
```diff
@@ -187,7 +187,7 @@ func (env *TestEnvironment) RunExecutorTest(
 	testCtx, cancelTestCtx := context.WithTimeout(t.Context(), env.executorTimeout)
 	defer cancelTestCtx()
 
-	workerDone := env.startWorker(t, sdk, taskQueueName, scenarioID)
+	workerDone := env.startWorker(testCtx, sdk, taskQueueName, scenarioID)
 
 	// Update scenario info with test environment details
 	scenarioInfo.Logger = env.logger.Named("executor")
@@ -259,7 +259,7 @@ func (env *TestEnvironment) ensureWorkerBuilt(t *testing.T, sdk cmdoptions.Langu
 }
 
 func (env *TestEnvironment) startWorker(
-	t *testing.T,
+	ctx context.Context,
 	sdk cmdoptions.Language,
 	taskQueueName string,
 	scenarioID cmdoptions.ScenarioID,
@@ -283,7 +283,7 @@ func (env *TestEnvironment) startWorker(
 				Namespace: testNamespace,
 			},
 		}
-		workerDone <- runner.Run(t.Context(), baseDir)
+		workerDone <- runner.Run(ctx, baseDir)
 	}()
 
 	return workerDone
```
