-
Notifications
You must be signed in to change notification settings - Fork 1.2k
No need to send sync summary signal after async propagation #8689
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,20 +10,18 @@ import ( | |
| deploymentspb "go.temporal.io/server/api/deployment/v1" | ||
| "go.temporal.io/server/api/matchingservice/v1" | ||
| "go.temporal.io/server/common/namespace" | ||
| "go.temporal.io/server/common/resource" | ||
| ) | ||
|
|
||
| type ( | ||
| Activities struct { | ||
| namespace *namespace.Namespace | ||
| deploymentClient Client | ||
| matchingClient resource.MatchingClient | ||
| activityDeps | ||
| namespace *namespace.Namespace | ||
| } | ||
| ) | ||
|
|
||
| func (a *Activities) SyncWorkerDeploymentVersion(ctx context.Context, args *deploymentspb.SyncVersionStateActivityArgs) (*deploymentspb.SyncVersionStateActivityResult, error) { | ||
| identity := "worker-deployment workflow " + activity.GetInfo(ctx).WorkflowExecution.ID | ||
| res, err := a.deploymentClient.SyncVersionWorkflowFromWorkerDeployment( | ||
| res, err := a.WorkerDeploymentClient.SyncVersionWorkflowFromWorkerDeployment( | ||
| ctx, | ||
| a.namespace, | ||
| args.DeploymentName, | ||
|
|
@@ -48,7 +46,7 @@ func (a *Activities) SyncUnversionedRamp( | |
| ) (*deploymentspb.SyncDeploymentVersionUserDataResponse, error) { | ||
| logger := activity.GetLogger(ctx) | ||
| // Get all the task queues in the current version and put them into SyncUserData format | ||
| currVersionInfo, _, err := a.deploymentClient.DescribeVersion(ctx, a.namespace, input.CurrentVersion, false) | ||
| currVersionInfo, _, err := a.WorkerDeploymentClient.DescribeVersion(ctx, a.namespace, input.CurrentVersion, false) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: Is changing from deploymentClient to WorkerDeploymentClient only a design change? I looked around and I think we were repeating this declaration earlier on, but I just want to be sure.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's just the name that was there in |
||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
@@ -76,7 +74,7 @@ func (a *Activities) SyncUnversionedRamp( | |
| logger.Info("syncing unversioned ramp to task queue userdata", "taskQueue", syncData.Name, "types", syncData.Types) | ||
| var res *matchingservice.SyncDeploymentUserDataResponse | ||
| var err error | ||
| res, err = a.matchingClient.SyncDeploymentUserData(ctx, &matchingservice.SyncDeploymentUserDataRequest{ | ||
| res, err = a.MatchingClient.SyncDeploymentUserData(ctx, &matchingservice.SyncDeploymentUserDataRequest{ | ||
| NamespaceId: a.namespace.ID().String(), | ||
| TaskQueue: syncData.Name, | ||
| TaskQueueTypes: syncData.Types, | ||
|
|
@@ -107,7 +105,7 @@ func (a *Activities) CheckUnversionedRampUserDataPropagation(ctx context.Context | |
| for n, v := range input.TaskQueueMaxVersions { | ||
| go func(name string, version int64) { | ||
| logger.Info("waiting for unversioned ramp userdata propagation", "taskQueue", name, "version", version) | ||
| _, err := a.matchingClient.CheckTaskQueueUserDataPropagation(ctx, &matchingservice.CheckTaskQueueUserDataPropagationRequest{ | ||
| _, err := a.MatchingClient.CheckTaskQueueUserDataPropagation(ctx, &matchingservice.CheckTaskQueueUserDataPropagationRequest{ | ||
| NamespaceId: a.namespace.ID().String(), | ||
| TaskQueue: name, | ||
| Version: version, | ||
|
|
@@ -126,7 +124,7 @@ func (a *Activities) CheckUnversionedRampUserDataPropagation(ctx context.Context | |
| } | ||
|
|
||
| func (a *Activities) IsVersionMissingTaskQueues(ctx context.Context, args *deploymentspb.IsVersionMissingTaskQueuesArgs) (*deploymentspb.IsVersionMissingTaskQueuesResult, error) { | ||
| res, err := a.deploymentClient.IsVersionMissingTaskQueues( | ||
| res, err := a.WorkerDeploymentClient.IsVersionMissingTaskQueues( | ||
| ctx, | ||
| a.namespace, | ||
| args.PrevCurrentVersion, | ||
|
|
@@ -142,7 +140,7 @@ func (a *Activities) IsVersionMissingTaskQueues(ctx context.Context, args *deplo | |
|
|
||
| func (a *Activities) DeleteWorkerDeploymentVersion(ctx context.Context, args *deploymentspb.DeleteVersionActivityArgs) error { | ||
| identity := "worker-deployment workflow " + activity.GetInfo(ctx).WorkflowExecution.ID | ||
| err := a.deploymentClient.DeleteVersionFromWorkerDeployment( | ||
| err := a.WorkerDeploymentClient.DeleteVersionFromWorkerDeployment( | ||
| ctx, | ||
| a.namespace, | ||
| args.DeploymentName, | ||
|
|
@@ -160,7 +158,7 @@ func (a *Activities) DeleteWorkerDeploymentVersion(ctx context.Context, args *de | |
|
|
||
| func (a *Activities) RegisterWorkerInVersion(ctx context.Context, args *deploymentspb.RegisterWorkerInVersionArgs) error { | ||
| identity := "worker-deployment workflow " + activity.GetInfo(ctx).WorkflowExecution.ID | ||
| err := a.deploymentClient.RegisterWorkerInVersion( | ||
| err := a.WorkerDeploymentClient.RegisterWorkerInVersion( | ||
| ctx, | ||
| a.namespace, | ||
| args, | ||
|
|
@@ -173,7 +171,7 @@ func (a *Activities) RegisterWorkerInVersion(ctx context.Context, args *deployme | |
| } | ||
|
|
||
| func (a *Activities) DescribeVersionFromWorkerDeployment(ctx context.Context, args *deploymentspb.DescribeVersionFromWorkerDeploymentActivityArgs) (*deploymentspb.DescribeVersionFromWorkerDeploymentActivityResult, error) { | ||
| res, _, err := a.deploymentClient.DescribeVersion(ctx, a.namespace, args.Version, false) | ||
| res, _, err := a.WorkerDeploymentClient.DescribeVersion(ctx, a.namespace, args.Version, false) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
@@ -201,7 +199,7 @@ func (a *Activities) SyncDeploymentVersionUserDataFromWorkerDeployment( | |
| var err error | ||
|
|
||
| if input.ForgetVersion { | ||
| res, err = a.matchingClient.SyncDeploymentUserData(ctx, &matchingservice.SyncDeploymentUserDataRequest{ | ||
| res, err = a.MatchingClient.SyncDeploymentUserData(ctx, &matchingservice.SyncDeploymentUserDataRequest{ | ||
| NamespaceId: a.namespace.ID().String(), | ||
| DeploymentName: input.GetDeploymentName(), | ||
| TaskQueue: syncData.Name, | ||
|
|
@@ -211,7 +209,7 @@ func (a *Activities) SyncDeploymentVersionUserDataFromWorkerDeployment( | |
| }, | ||
| }) | ||
| } else { | ||
| res, err = a.matchingClient.SyncDeploymentUserData(ctx, &matchingservice.SyncDeploymentUserDataRequest{ | ||
| res, err = a.MatchingClient.SyncDeploymentUserData(ctx, &matchingservice.SyncDeploymentUserDataRequest{ | ||
| NamespaceId: a.namespace.ID().String(), | ||
| DeploymentName: input.GetDeploymentName(), | ||
| TaskQueue: syncData.Name, | ||
|
|
@@ -250,5 +248,5 @@ func (a *Activities) StartWorkerDeploymentVersionWorkflow( | |
| logger := activity.GetLogger(ctx) | ||
| logger.Info("starting worker deployment version workflow", "deploymentName", input.DeploymentName, "buildID", input.BuildId) | ||
| identity := "deployment workflow " + activity.GetInfo(ctx).WorkflowExecution.ID | ||
| return a.deploymentClient.StartWorkerDeploymentVersion(ctx, a.namespace, input.DeploymentName, input.BuildId, identity, input.RequestId) | ||
| return a.WorkerDeploymentClient.StartWorkerDeploymentVersion(ctx, a.namespace, input.DeploymentName, input.BuildId, identity, input.RequestId) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ import ( | |
| "errors" | ||
| "fmt" | ||
| "sync" | ||
| "time" | ||
|
|
||
| deploymentpb "go.temporal.io/api/deployment/v1" | ||
| enumspb "go.temporal.io/api/enums/v1" | ||
|
|
@@ -16,17 +17,19 @@ import ( | |
| "go.temporal.io/sdk/temporal" | ||
| deploymentspb "go.temporal.io/server/api/deployment/v1" | ||
| "go.temporal.io/server/api/matchingservice/v1" | ||
| "go.temporal.io/server/common/log/tag" | ||
| "go.temporal.io/server/common/metrics" | ||
| "go.temporal.io/server/common/namespace" | ||
| "go.temporal.io/server/common/resource" | ||
| "go.temporal.io/server/common/worker_versioning" | ||
| "google.golang.org/protobuf/types/known/timestamppb" | ||
| ) | ||
|
|
||
| const SlowPropagationDelay = 10 * time.Second | ||
|
|
||
| type ( | ||
| VersionActivities struct { | ||
| namespace *namespace.Namespace | ||
| deploymentClient Client | ||
| matchingClient resource.MatchingClient | ||
| activityDeps | ||
| namespace *namespace.Namespace | ||
| } | ||
| ) | ||
|
|
||
|
|
@@ -37,7 +40,7 @@ func (a *VersionActivities) StartWorkerDeploymentWorkflow( | |
| logger := activity.GetLogger(ctx) | ||
| logger.Info("starting worker-deployment workflow", "deploymentName", input.DeploymentName) | ||
| identity := "deployment-version workflow " + activity.GetInfo(ctx).WorkflowExecution.ID | ||
| err := a.deploymentClient.StartWorkerDeployment(ctx, a.namespace, input.DeploymentName, identity, input.RequestId) | ||
| err := a.WorkerDeploymentClient.StartWorkerDeployment(ctx, a.namespace, input.DeploymentName, identity, input.RequestId) | ||
| var precond *serviceerror.FailedPrecondition | ||
| if errors.As(err, &precond) { | ||
| return temporal.NewNonRetryableApplicationError("failed to create deployment", errTooManyDeployments, err) | ||
|
|
@@ -50,6 +53,16 @@ func (a *VersionActivities) SyncDeploymentVersionUserData( | |
| input *deploymentspb.SyncDeploymentVersionUserDataRequest, | ||
| ) (*deploymentspb.SyncDeploymentVersionUserDataResponse, error) { | ||
| logger := activity.GetLogger(ctx) | ||
| scheduledTime := activity.GetInfo(ctx).ScheduledTime | ||
|
|
||
| if scheduledTime.Add(SlowPropagationDelay).Before(time.Now()) { | ||
| a.Logger.Warn("Slow propagation detected, attempting to sync user data", | ||
|
||
| tag.WorkflowNamespace(a.namespace.Name().String()), | ||
| tag.Deployment(input.DeploymentName), | ||
| tag.BuildId(input.GetVersion().GetBuildId()), | ||
| ) | ||
| a.MetricsHandler.Counter(metrics.SlowVersioningDataPropagationCounter.Name()).Record(1) | ||
| } | ||
|
|
||
| errs := make(chan error) | ||
|
|
||
|
|
@@ -88,7 +101,7 @@ func (a *VersionActivities) SyncDeploymentVersionUserData( | |
| req.UpsertVersionsData[input.GetVersion().GetBuildId()] = vd | ||
| } | ||
|
|
||
| res, err = a.matchingClient.SyncDeploymentUserData(ctx, req) | ||
| res, err = a.MatchingClient.SyncDeploymentUserData(ctx, req) | ||
|
|
||
| if err != nil { | ||
| logger.Error("syncing task queue userdata", "taskQueue", syncData.Name, "types", syncData.Types, "error", err) | ||
|
|
@@ -115,14 +128,23 @@ func (a *VersionActivities) SyncDeploymentVersionUserData( | |
| } | ||
|
|
||
| func (a *VersionActivities) CheckWorkerDeploymentUserDataPropagation(ctx context.Context, input *deploymentspb.CheckWorkerDeploymentUserDataPropagationRequest) error { | ||
| scheduledTime := activity.GetInfo(ctx).ScheduledTime | ||
|
|
||
| if scheduledTime.Add(SlowPropagationDelay).Before(time.Now()) { | ||
| a.Logger.Warn("Slow propagation detected, awaiting task queue partition propagation", | ||
| tag.WorkflowNamespace(a.namespace.Name().String()), | ||
| ) | ||
| a.MetricsHandler.Counter(metrics.SlowVersioningDataPropagationCounter.Name()).Record(1) | ||
| } | ||
|
|
||
| logger := activity.GetLogger(ctx) | ||
|
|
||
| errs := make(chan error) | ||
|
|
||
| for n, v := range input.TaskQueueMaxVersions { | ||
| go func(name string, version int64) { | ||
| logger.Info("waiting for userdata propagation", "taskQueue", name, "version", version) | ||
| _, err := a.matchingClient.CheckTaskQueueUserDataPropagation(ctx, &matchingservice.CheckTaskQueueUserDataPropagationRequest{ | ||
| _, err := a.MatchingClient.CheckTaskQueueUserDataPropagation(ctx, &matchingservice.CheckTaskQueueUserDataPropagationRequest{ | ||
| NamespaceId: a.namespace.ID().String(), | ||
| TaskQueue: name, | ||
| Version: version, | ||
|
|
@@ -145,7 +167,7 @@ func (a *VersionActivities) CheckWorkerDeploymentUserDataPropagation(ctx context | |
| func (a *VersionActivities) CheckIfTaskQueuesHavePollers(ctx context.Context, args *deploymentspb.CheckTaskQueuesHavePollersActivityArgs) (bool, error) { | ||
| versionStr := worker_versioning.ExternalWorkerDeploymentVersionToString(worker_versioning.ExternalWorkerDeploymentVersionFromVersion(args.WorkerDeploymentVersion)) | ||
| for tqName, tqTypes := range args.TaskQueuesAndTypes { | ||
| res, err := a.matchingClient.DescribeTaskQueue(ctx, &matchingservice.DescribeTaskQueueRequest{ | ||
| res, err := a.MatchingClient.DescribeTaskQueue(ctx, &matchingservice.DescribeTaskQueueRequest{ | ||
| NamespaceId: a.namespace.ID().String(), | ||
| DescRequest: &workflowservice.DescribeTaskQueueRequest{ | ||
| Namespace: a.namespace.Name().String(), | ||
|
|
@@ -176,7 +198,7 @@ func (a *VersionActivities) CheckIfTaskQueuesHavePollers(ctx context.Context, ar | |
|
|
||
| func (a *VersionActivities) GetVersionDrainageStatus(ctx context.Context, version *deploymentspb.WorkerDeploymentVersion) (*deploymentpb.VersionDrainageInfo, error) { | ||
| logger := activity.GetLogger(ctx) | ||
| response, err := a.deploymentClient.GetVersionDrainageStatus(ctx, a.namespace, worker_versioning.WorkerDeploymentVersionToStringV31(version)) | ||
| response, err := a.WorkerDeploymentClient.GetVersionDrainageStatus(ctx, a.namespace, worker_versioning.WorkerDeploymentVersionToStringV31(version)) | ||
| if err != nil { | ||
| logger.Error("error counting workflows for drainage status", "error", err) | ||
| return nil, err | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -367,18 +367,24 @@ | |
| } | ||
|
|
||
| //nolint:revive,errcheck // In async mode the activities retry indefinitely so this function should not return error | ||
| func (d *VersionWorkflowRunner) deleteVersionFromTaskQueuesAsync(ctx workflow.Context) { | ||
|
Check failure on line 370 in service/worker/workerdeployment/version_workflow.go
|
||
| // If there are propagations in progress, we ask them to cancel and wait for them to do so. | ||
| // The reason is that the ongoing upsert propagation may overwrite the delete that we want to send here, unintentionally undoing it. | ||
| d.cancelPropagations = true | ||
| workflow.Await(ctx, func() bool { return d.asyncPropagationsInProgress == 1 }) // delete itself is counted as one | ||
| d.cancelPropagations = false // need to unset this in case the version is revived | ||
|
|
||
| // Not counting the possible wait for previous propagations in this propagation latency. | ||
| startTime := workflow.Now(ctx) | ||
| defer func() { | ||
| d.metrics.Timer(metrics.VersioningDataPropagationLatency.Name()).Record(time.Since(startTime)) | ||
|
||
| }() | ||
| d.deleteVersionFromTaskQueues(ctx, workflow.WithActivityOptions(ctx, propagationActivityOptions)) | ||
| d.asyncPropagationsInProgress-- | ||
| } | ||
|
|
||
| func (d *VersionWorkflowRunner) deleteVersionFromTaskQueues(ctx workflow.Context, activityCtx workflow.Context) error { | ||
|
|
||
| state := d.GetVersionState() | ||
|
|
||
| // sync version removal to task queues | ||
|
|
@@ -564,7 +570,7 @@ | |
| } | ||
|
|
||
| //nolint:staticcheck // SA1019 | ||
| func (d *VersionWorkflowRunner) handleSyncState(ctx workflow.Context, args *deploymentspb.SyncVersionStateUpdateArgs) (*deploymentspb.SyncVersionStateResponse, error) { | ||
|
Check failure on line 573 in service/worker/workerdeployment/version_workflow.go
|
||
| // use lock to enforce only one update at a time | ||
| err := d.lock.Lock(ctx) | ||
| if err != nil { | ||
|
|
@@ -731,7 +737,7 @@ | |
| } | ||
| } | ||
|
|
||
| func (d *VersionWorkflowRunner) refreshDrainageInfo(ctx workflow.Context) { | ||
|
Check failure on line 740 in service/worker/workerdeployment/version_workflow.go
|
||
| if d.VersionState.GetDrainageInfo().GetStatus() != enumspb.VERSION_DRAINAGE_STATUS_DRAINING { | ||
| return // only refresh when status is draining | ||
| } | ||
|
|
@@ -835,7 +841,7 @@ | |
| return enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_UNSPECIFIED | ||
| } | ||
|
|
||
| func (d *VersionWorkflowRunner) updateVersionStatusAfterDrainageStatusChange(ctx workflow.Context, newStatus enumspb.VersionDrainageStatus) { | ||
|
Check failure on line 844 in service/worker/workerdeployment/version_workflow.go
|
||
| if newStatus == enumspb.VERSION_DRAINAGE_STATUS_DRAINED { | ||
| d.VersionState.Status = enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED | ||
| } else if newStatus == enumspb.VERSION_DRAINAGE_STATUS_DRAINING { | ||
|
|
@@ -952,11 +958,16 @@ | |
| } | ||
|
|
||
| // syncTaskQueuesAsync performs async propagation of routing config | ||
| func (d *VersionWorkflowRunner) syncTaskQueuesAsync( | ||
|
Check failure on line 961 in service/worker/workerdeployment/version_workflow.go
|
||
| ctx workflow.Context, | ||
| routingConfig *deploymentpb.RoutingConfig, | ||
| newStatus enumspb.WorkerDeploymentVersionStatus, | ||
| ) error { | ||
| startTime := workflow.Now(ctx) | ||
| defer func() { | ||
| d.metrics.Timer(metrics.VersioningDataPropagationLatency.Name()).Record(time.Since(startTime)) | ||
| }() | ||
|
|
||
| state := d.GetVersionState() | ||
|
|
||
| // Build WorkerDeploymentVersionData for this version from current state | ||
|
|
@@ -1044,7 +1055,11 @@ | |
| } | ||
|
|
||
| if routingConfig != nil { | ||
| d.syncSummary(ctx) | ||
| if workflow.GetVersion(ctx, "no-propagation-sync-summary", workflow.DefaultVersion, 0) == workflow.DefaultVersion { | ||
| // TODO: clean this unnecessary sync up. | ||
| // No summary changes happen in async propagation that the deployment workflow | ||
|
||
| d.syncSummary(ctx) | ||
| } | ||
| // Signal deployment workflow that routing config propagation completed | ||
| d.signalPropagationComplete(ctx, routingConfig.GetRevisionNumber()) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Curious: do we also want a counter to track data propagation for users running sync workflows?
The reason I ask is that such a counter could make things easier to debug and understand when they are wearing an operator's hat.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you agree with the point I made above, we would need to rename the variables, just FYI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But we're not going to support sync workflows for long. Once we verify async, we'll make it the default and eventually clean up the async path.