Add option to ignore already started error (#227)

stephanos · web-flow · commit f66402507d4d · 2025-10-14T16:27:13.000-07:00
<!--- Note to EXTERNAL Contributors -->
<!-- Thanks for opening a PR!
If it is a significant code change, please **make sure there is an open
issue** for this.
We work best with you when we have accepted the idea first before you
code. -->

<!--- For ALL Contributors 👇 -->

## What was changed
<!-- Describe what has changed in this PR -->

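WISOTT (what it says on the tin): adds an `--ignore-already-started` CLI flag,
plumbed through as `RunConfiguration.IgnoreAlreadyStarted`, so that the default
start-workflow options no longer error when a workflow with the same ID already
exists. The throughput_stress scenario now builds on the default start options
and always ignores this error when resuming.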

## Why?
<!-- Tell your future self why you have made these changes -->

When running a scenario, transient failures can lead to an incomplete
scenario run; on retry, the scenario then fails because the workflows
were already created. It's not always possible to use a new/clean task
queue; in those cases, this new flag allows skipping over these errors.

## Checklist
<!--- add/delete as needed --->

1. Closes <!-- add issue number here -->

2. How was this tested:
<!--- Please describe how you tested your changes/how we can test them
-->

3. Any docs updates needed?
<!--- update README if applicable
      or point out where to update docs.temporal.io -->
diff --git a/cmd/cli/run_scenario.go b/cmd/cli/run_scenario.go
@@ -56,6 +56,7 @@ type scenarioRunConfig struct {
 	scenarioOptions               []string
 	timeout                       time.Duration
 	doNotRegisterSearchAttributes bool
+	ignoreAlreadyStarted          bool
 }
 
 func (r *scenarioRunner) addCLIFlags(fs *pflag.FlagSet) {
@@ -81,6 +82,8 @@ func (r *scenarioRunConfig) addCLIFlags(fs *pflag.FlagSet) {
 	fs.BoolVar(&r.doNotRegisterSearchAttributes, "do-not-register-search-attributes", false,
 		"Do not register the default search attributes used by scenarios. "+
 			"If the search attributes are not registed by the scenario they must be registered through some other method")
+	fs.BoolVar(&r.ignoreAlreadyStarted, "ignore-already-started", false,
+		"Ignore if a workflow with the same ID already exists. A Scenario may choose to override this behavior.")
 }
 
 func (r *scenarioRunner) preRun() {
@@ -156,6 +159,7 @@ func (r *scenarioRunner) run(ctx context.Context) error {
 			MaxIterationAttempts:          r.maxIterationAttempts,
 			Timeout:                       r.timeout,
 			DoNotRegisterSearchAttributes: r.doNotRegisterSearchAttributes,
+			IgnoreAlreadyStarted:          r.ignoreAlreadyStarted,
 		},
 		ScenarioOptions: scenarioOptions,
 		Namespace:       r.clientOptions.Namespace,
diff --git a/loadgen/scenario.go b/loadgen/scenario.go
@@ -201,6 +201,9 @@ type RunConfiguration struct {
 	// cannot use the SDK to register SAs, instead the SAs must be registered through the control plane.
 	// Default is false.
 	DoNotRegisterSearchAttributes bool
+	// IgnoreAlreadyStarted, if set, will not error when a workflow with the same ID already exists.
+	// Default is false.
+	IgnoreAlreadyStarted bool
 	// OnCompletion, if set, is invoked after each successful iteration completes.
 	OnCompletion func(context.Context, *Run)
 	// HandleExecuteError, if set, is called when Execute returns an error, allowing transformation of errors.
@@ -305,7 +308,7 @@ func (r *Run) DefaultStartWorkflowOptions() client.StartWorkflowOptions {
 	return client.StartWorkflowOptions{
 		TaskQueue:                                TaskQueueForRun(r.RunID),
 		ID:                                       fmt.Sprintf("w-%s-%d", r.RunID, r.Iteration),
-		WorkflowExecutionErrorWhenAlreadyStarted: true,
+		WorkflowExecutionErrorWhenAlreadyStarted: !r.Configuration.IgnoreAlreadyStarted,
 	}
 }
 
diff --git a/scenarios/throughput_stress.go b/scenarios/throughput_stress.go
@@ -14,7 +14,6 @@ import (
 	"github.com/temporalio/omes/loadgen"
 	. "github.com/temporalio/omes/loadgen/kitchensink"
 	"go.temporal.io/api/common/v1"
-	"go.temporal.io/api/serviceerror"
 	"go.temporal.io/api/workflowservice/v1"
 	"go.temporal.io/sdk/temporal"
 	"google.golang.org/protobuf/types/known/emptypb"
@@ -201,19 +200,6 @@ func (t *tpsExecutor) Run(ctx context.Context, info loadgen.ScenarioInfo) error
 		info.Logger.Debugf("Completed iteration %d", run.Iteration)
 	}
 
-	// When resuming, it can happen that the workflow for the current iteration already exists since the snapshot
-	// was not up-to-date. In that case, we just skip this iteration and move on.
-	info.Configuration.HandleExecuteError = func(ctx context.Context, run *loadgen.Run, err error) error {
-		if isResuming {
-			var alreadyStartedErr *serviceerror.WorkflowExecutionAlreadyStarted
-			if errors.As(err, &alreadyStartedErr) {
-				info.Logger.Warnf("after resume, workflow for iteration %d already exists", run.Iteration)
-				return nil
-			}
-		}
-		return err
-	}
-
 	// Start the scenario run.
 	//
 	// NOTE: When resuming, it can happen that there are no more iterations/time left to run more iterations.
@@ -228,7 +214,11 @@ func (t *tpsExecutor) Run(ctx context.Context, info loadgen.ScenarioInfo) error
 				},
 			},
 			UpdateWorkflowOptions: func(ctx context.Context, run *loadgen.Run, options *loadgen.KitchenSinkWorkflowOptions) error {
-				options.StartOptions.ID = workflowID(run.RunID, run.Iteration)
+				options.StartOptions = run.DefaultStartWorkflowOptions()
+				if isResuming {
+					// Enforce to never fail on "workflow already started" when resuming.
+					options.StartOptions.WorkflowExecutionErrorWhenAlreadyStarted = false
+				}
 
 				// Add search attribute to the workflow options so that it can be used in visibility queries.
 				options.StartOptions.TypedSearchAttributes = temporal.NewSearchAttributes(
@@ -255,7 +245,7 @@ func (t *tpsExecutor) Run(ctx context.Context, info loadgen.ScenarioInfo) error
 				//
 				// NOTE: No client actions (e.g. Signal) are defined; however, client action activities are.
 				// That means these client actions are sent from the activity worker instead of Omes.
-				options.Params.WorkflowInput.InitialActions = t.createActions(run.Iteration)
+				options.Params.WorkflowInput.InitialActions = t.createActions(run)
 
 				return nil
 			},
@@ -343,17 +333,17 @@ func (t *tpsExecutor) updateStateOnIterationCompletion() {
 	t.state.LastCompletedIterationAt = time.Now()
 }
 
-func (t *tpsExecutor) createActions(iteration int) []*ActionSet {
+func (t *tpsExecutor) createActions(run *loadgen.Run) []*ActionSet {
 	return []*ActionSet{
 		{
-			Actions:    t.createActionsChunk(iteration, 0, 0, t.config.InternalIterations),
+			Actions:    t.createActionsChunk(run, 0, 0, t.config.InternalIterations),
 			Concurrent: false,
 		},
 	}
 }
 
 func (t *tpsExecutor) createActionsChunk(
-	iteration int,
+	run *loadgen.Run,
 	childCount int,
 	continueAsNewCounter int,
 	remainingInternalIters int,
@@ -367,7 +357,7 @@ func (t *tpsExecutor) createActionsChunk(
 	isLastChunk := remainingInternalIters <= itersPerChunk
 	itersPerChunk = min(itersPerChunk, remainingInternalIters) // cap chunk size to remaining iterations
 
-	rng := rand.New(rand.NewSource(t.config.RngSeed + int64(iteration)))
+	rng := rand.New(rand.NewSource(t.config.RngSeed + int64(run.Iteration)))
 
 	// Create actions for the current chunk
 	for i := 0; i < itersPerChunk; i++ {
@@ -381,7 +371,7 @@ func (t *tpsExecutor) createActionsChunk(
 
 		childCount++
 		asyncActions := []*Action{
-			t.createChildWorkflowAction(iteration, childCount),
+			t.createChildWorkflowAction(run, childCount),
 			PayloadActivity(256, 256, DefaultRemoteActivity),
 			PayloadActivity(256, 256, DefaultRemoteActivity),
 			PayloadActivity(0, 256, DefaultLocalActivity),
@@ -445,7 +435,7 @@ func (t *tpsExecutor) createActionsChunk(
 							InitialActions: []*ActionSet{
 								{
 									Actions: t.createActionsChunk(
-										iteration,
+										run,
 										childCount,
 										continueAsNewCounter+1,
 										remainingInternalIters-itersPerChunk),
@@ -462,7 +452,7 @@ func (t *tpsExecutor) createActionsChunk(
 	return chunkActions
 }
 
-func (t *tpsExecutor) createChildWorkflowAction(iteration int, childID int) *Action {
+func (t *tpsExecutor) createChildWorkflowAction(run *loadgen.Run, childID int) *Action {
 	return &Action{
 		Variant: &Action_ExecChildWorkflow{
 			ExecChildWorkflow: &ExecuteChildWorkflowAction{
@@ -481,7 +471,7 @@ func (t *tpsExecutor) createChildWorkflowAction(iteration int, childID int) *Act
 						},
 					}),
 				},
-				WorkflowId: fmt.Sprintf("%s/child-%d", workflowID(t.runID, iteration), childID),
+				WorkflowId: fmt.Sprintf("%s/child-%d", run.DefaultStartWorkflowOptions().ID, childID),
 				SearchAttributes: map[string]*common.Payload{
 					ThroughputStressScenarioIdSearchAttribute: &common.Payload{
 						Metadata: map[string][]byte{"encoding": []byte("json/plain")},
@@ -623,10 +613,6 @@ func (t *tpsExecutor) createNexusWaitForCancelAction() *Action {
 	}
 }
 
-func workflowID(runID string, iteration int) string {
-	return fmt.Sprintf("throughputStress/%s/iter-%d", runID, iteration)
-}
-
 func (t *tpsExecutor) maybeWithStart(likelihood float64) bool {
 	t.lock.Lock()
 	defer t.lock.Unlock()