Skip to content

Commit c56db2e

Browse files
committed
Merge branch 'main' into ss/transitioning-func-test
2 parents b18bff8 + 9524cec commit c56db2e

File tree

88 files changed

+1620
-1024
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

88 files changed

+1620
-1024
lines changed

api/deployment/v1/message.pb.go

Lines changed: 210 additions & 110 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/historyservice/v1/request_response.pb.go

Lines changed: 11 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

common/dynamicconfig/constants.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2331,6 +2331,12 @@ the dlq (or will drop them if not enabled)`,
23312331
that task will be sent to DLQ.`,
23322332
)
23332333

2334+
MaxLocalParentWorkflowVerificationDuration = NewGlobalDurationSetting(
2335+
"history.maxLocalParentWorkflowVerificationDuration",
2336+
5*time.Minute,
2337+
`MaxLocalParentWorkflowVerificationDuration controls the maximum duration to verify on the local cluster before requesting to resend parent workflow.`,
2338+
)
2339+
23342340
ReplicationStreamSyncStatusDuration = NewGlobalDurationSetting(
23352341
"history.ReplicationStreamSyncStatusDuration",
23362342
1*time.Second,

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ require (
5959
go.opentelemetry.io/otel/sdk v1.34.0
6060
go.opentelemetry.io/otel/sdk/metric v1.34.0
6161
go.opentelemetry.io/otel/trace v1.34.0
62-
go.temporal.io/api v1.49.1-0.20250509172953-4c3944c121c0
62+
go.temporal.io/api v1.49.2-0.20250514204244-aef60694cca5
6363
go.temporal.io/sdk v1.34.0
6464
go.temporal.io/version v0.3.0
6565
go.uber.org/automaxprocs v1.6.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -385,8 +385,8 @@ go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC
385385
go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE=
386386
go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4=
387387
go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4=
388-
go.temporal.io/api v1.49.1-0.20250509172953-4c3944c121c0 h1:o9iwtENRgyA8sxaigvmf4e6//QHJKG8+mxgl7FjlHQE=
389-
go.temporal.io/api v1.49.1-0.20250509172953-4c3944c121c0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM=
388+
go.temporal.io/api v1.49.2-0.20250514204244-aef60694cca5 h1:+rttnhF8IQs6irpAYbRBXf3bolqG7YkFfO123Poj/tU=
389+
go.temporal.io/api v1.49.2-0.20250514204244-aef60694cca5/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM=
390390
go.temporal.io/sdk v1.34.0 h1:VLg/h6ny7GvLFVoQPqz2NcC93V9yXboQwblkRvZ1cZE=
391391
go.temporal.io/sdk v1.34.0/go.mod h1:iE4U5vFrH3asOhqpBBphpj9zNtw8btp8+MSaf5A0D3w=
392392
go.temporal.io/version v0.3.0 h1:dMrei9l9NyHt8nG6EB8vAwDLLTwx2SvRyucCSumAiig=

proto/internal/temporal/server/api/deployment/v1/message.proto

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ message VersionLocalState {
6666
// Can be in the range [0, 100] if the version is ramping.
6767
float ramp_percentage = 6;
6868

69+
// Timestamp when this version first became current or ramping.
70+
google.protobuf.Timestamp first_activation_time = 12;
71+
// Timestamp when this version last stopped being current or ramping.
72+
google.protobuf.Timestamp last_deactivation_time = 13;
73+
6974
// Helps user determine when it is safe to decommission the workers of this
7075
// Version. Not present when version is current or ramping.
7176
// Current limitations:
@@ -139,7 +144,27 @@ message WorkerDeploymentLocalState {
139144
message WorkerDeploymentVersionSummary {
140145
string version = 1;
141146
google.protobuf.Timestamp create_time = 2;
142-
temporal.api.enums.v1.VersionDrainageStatus drainage_status = 3;
147+
temporal.api.enums.v1.VersionDrainageStatus drainage_status = 3 [deprecated=true];
148+
// Information about workflow drainage to help the user determine when it is safe
149+
// to decommission a Version. Not present while version is current or ramping.
150+
temporal.api.deployment.v1.VersionDrainageInfo drainage_info = 4;
151+
// Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed.
152+
google.protobuf.Timestamp routing_update_time = 5;
153+
154+
// (-- api-linter: core::0140::prepositions=disabled
155+
// aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --)
156+
// Nil if not current.
157+
google.protobuf.Timestamp current_since_time = 6;
158+
159+
// (-- api-linter: core::0140::prepositions=disabled
160+
// aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --)
161+
// Nil if not ramping. Updated when the version first starts ramping, not on each ramp change.
162+
google.protobuf.Timestamp ramping_since_time = 7;
163+
164+
// Timestamp when this version first became current or ramping.
165+
google.protobuf.Timestamp first_activation_time = 8;
166+
// Timestamp when this version last stopped being current or ramping.
167+
google.protobuf.Timestamp last_deactivation_time = 9;
143168
}
144169

145170
// used as Worker Deployment Version workflow update input:

proto/internal/temporal/server/api/historyservice/v1/request_response.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,7 @@ message VerifyChildExecutionCompletionRecordedRequest {
556556
int64 parent_initiated_id = 4;
557557
int64 parent_initiated_version = 5;
558558
temporal.server.api.clock.v1.VectorClock clock = 6;
559+
bool resend_parent = 7;
559560
}
560561

561562
message VerifyChildExecutionCompletionRecordedResponse {

service/history/api/verifychildworkflowcompletionrecorded/api.go

Lines changed: 102 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,32 @@ package verifychildworkflowcompletionrecorded
22

33
import (
44
"context"
5+
"errors"
56

7+
commonpb "go.temporal.io/api/common/v1"
8+
"go.temporal.io/api/serviceerror"
9+
"go.temporal.io/server/api/adminservice/v1"
610
enumsspb "go.temporal.io/server/api/enums/v1"
11+
historyspb "go.temporal.io/server/api/history/v1"
712
"go.temporal.io/server/api/historyservice/v1"
13+
persistencespb "go.temporal.io/server/api/persistence/v1"
814
"go.temporal.io/server/common"
915
"go.temporal.io/server/common/definition"
1016
"go.temporal.io/server/common/locks"
1117
"go.temporal.io/server/common/namespace"
18+
"go.temporal.io/server/common/persistence/transitionhistory"
19+
"go.temporal.io/server/common/persistence/versionhistory"
1220
"go.temporal.io/server/service/history/api"
1321
"go.temporal.io/server/service/history/consts"
22+
historyi "go.temporal.io/server/service/history/interfaces"
1423
)
1524

16-
func Invoke(
25+
func verifyChildExecution(
1726
ctx context.Context,
18-
request *historyservice.VerifyChildExecutionCompletionRecordedRequest,
1927
workflowConsistencyChecker api.WorkflowConsistencyChecker,
20-
) (resp *historyservice.VerifyChildExecutionCompletionRecordedResponse, retError error) {
21-
namespaceID := namespace.ID(request.GetNamespaceId())
22-
if err := api.ValidateNamespaceUUID(namespaceID); err != nil {
23-
return nil, err
24-
}
25-
28+
request *historyservice.VerifyChildExecutionCompletionRecordedRequest,
29+
) (versionedTransition *persistencespb.VersionedTransition,
30+
versionHistories *historyspb.VersionHistories, retError error) {
2631
workflowLease, err := workflowConsistencyChecker.GetWorkflowLease(
2732
ctx,
2833
request.Clock,
@@ -38,40 +43,123 @@ func Invoke(
3843
locks.PriorityLow,
3944
)
4045
if err != nil {
41-
return nil, err
46+
return nil, nil, err
4247
}
4348
defer func() { workflowLease.GetReleaseFn()(retError) }()
4449

4550
mutableState := workflowLease.GetMutableState()
4651
if !mutableState.IsWorkflowExecutionRunning() &&
4752
mutableState.GetExecutionState().State != enumsspb.WORKFLOW_EXECUTION_STATE_ZOMBIE {
4853
// parent has already completed and can't be blocked after failover.
49-
return &historyservice.VerifyChildExecutionCompletionRecordedResponse{}, nil
54+
return nil, nil, nil
5055
}
5156

5257
onCurrentBranch, err := api.IsHistoryEventOnCurrentBranch(mutableState, request.ParentInitiatedId, request.ParentInitiatedVersion)
5358
if err != nil {
5459
// initiated event not found on any branch
55-
return nil, consts.ErrWorkflowNotReady
60+
return nil, nil, consts.ErrWorkflowNotReady
5661
}
5762

5863
if !onCurrentBranch {
5964
// due to conflict resolution, the initiated event may on a different branch of the workflow.
6065
// we don't have to do anything and can simply return not found error. Standby logic
6166
// after seeing this error will give up verification.
62-
return nil, consts.ErrChildExecutionNotFound
67+
return nil, nil, consts.ErrChildExecutionNotFound
6368
}
6469

6570
ci, isRunning := mutableState.GetChildExecutionInfo(request.ParentInitiatedId)
6671
if isRunning {
6772
if ci.StartedEventId != common.EmptyEventID &&
6873
ci.GetStartedWorkflowId() != request.ChildExecution.GetWorkflowId() {
6974
// this can happen since we may not have the initiated version
70-
return nil, consts.ErrChildExecutionNotFound
75+
return nil, nil, consts.ErrChildExecutionNotFound
76+
}
77+
78+
return nil, nil, consts.ErrWorkflowNotReady
79+
}
80+
81+
versionedTransition = transitionhistory.CopyVersionedTransition(transitionhistory.LastVersionedTransition(mutableState.GetExecutionInfo().TransitionHistory))
82+
versionHistories = versionhistory.CopyVersionHistories(mutableState.GetExecutionInfo().VersionHistories)
83+
return versionedTransition, versionHistories, nil
84+
}
85+
86+
func Invoke(
87+
ctx context.Context,
88+
request *historyservice.VerifyChildExecutionCompletionRecordedRequest,
89+
workflowConsistencyChecker api.WorkflowConsistencyChecker,
90+
shardContext historyi.ShardContext,
91+
) (*historyservice.VerifyChildExecutionCompletionRecordedResponse, error) {
92+
namespaceID := namespace.ID(request.GetNamespaceId())
93+
if err := api.ValidateNamespaceUUID(namespaceID); err != nil {
94+
return nil, err
95+
}
96+
97+
resendParent := false
98+
versionedTransition, versionHistories, err := verifyChildExecution(ctx, workflowConsistencyChecker, request)
99+
switch err.(type) {
100+
case nil:
101+
return &historyservice.VerifyChildExecutionCompletionRecordedResponse{}, nil
102+
case *serviceerror.NotFound, *serviceerror.WorkflowNotReady:
103+
resendParent = request.GetResendParent()
104+
}
105+
if !resendParent {
106+
return nil, err
107+
}
108+
109+
// Resend parent workflow from source cluster
110+
111+
clusterMetadata := shardContext.GetClusterMetadata()
112+
targetClusterInfo := clusterMetadata.GetAllClusterInfo()[clusterMetadata.GetCurrentClusterName()]
113+
114+
namespaceEntry, err := shardContext.GetNamespaceRegistry().GetNamespaceByID(namespace.ID(namespaceID))
115+
if err != nil {
116+
return nil, err
117+
}
118+
119+
activeClusterName := namespaceEntry.ActiveClusterName()
120+
if activeClusterName == clusterMetadata.GetCurrentClusterName() {
121+
return nil, errors.New("namespace becomes active when processing task as standby")
122+
}
123+
124+
remoteAdminClient, err := shardContext.GetRemoteAdminClient(activeClusterName)
125+
if err != nil {
126+
return nil, err
127+
}
128+
129+
resp, err := remoteAdminClient.SyncWorkflowState(ctx, &adminservice.SyncWorkflowStateRequest{
130+
NamespaceId: request.NamespaceId,
131+
Execution: &commonpb.WorkflowExecution{
132+
WorkflowId: request.ParentExecution.WorkflowId,
133+
RunId: request.ParentExecution.RunId,
134+
},
135+
VersionedTransition: versionedTransition,
136+
VersionHistories: versionHistories,
137+
TargetClusterId: int32(targetClusterInfo.InitialFailoverVersion),
138+
})
139+
140+
if err != nil {
141+
if common.IsNotFoundError(err) {
142+
// parent workflow is not found on source cluster,
143+
// we can return empty response to indicate that verification is done
144+
// TODO: add parent workflow to workflowNotFoundCache
145+
return &historyservice.VerifyChildExecutionCompletionRecordedResponse{}, nil
71146
}
147+
return nil, err
148+
}
72149

73-
return nil, consts.ErrWorkflowNotReady
150+
engine, err := shardContext.GetEngine(ctx)
151+
if err != nil {
152+
return nil, err
153+
}
154+
err = engine.ReplicateVersionedTransition(ctx, resp.VersionedTransitionArtifact, activeClusterName)
155+
if err != nil {
156+
return nil, err
74157
}
75158

159+
// Verify child execution again after resending parent workflow
160+
_, _, err = verifyChildExecution(ctx, workflowConsistencyChecker, request)
161+
if err != nil {
162+
return nil, err
163+
}
76164
return &historyservice.VerifyChildExecutionCompletionRecordedResponse{}, nil
77165
}

service/history/configs/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,8 @@ type Config struct {
363363
BreakdownMetricsByTaskQueue dynamicconfig.BoolPropertyFnWithTaskQueueFilter
364364

365365
LogAllReqErrors dynamicconfig.BoolPropertyFnWithNamespaceFilter
366+
367+
MaxLocalParentWorkflowVerificationDuration dynamicconfig.DurationPropertyFn
366368
}
367369

368370
// NewConfig returns new service config with default values
@@ -393,6 +395,8 @@ func NewConfig(
393395
MaxAutoResetPoints: dynamicconfig.HistoryMaxAutoResetPoints.Get(dc),
394396
DefaultWorkflowTaskTimeout: dynamicconfig.DefaultWorkflowTaskTimeout.Get(dc),
395397

398+
MaxLocalParentWorkflowVerificationDuration: dynamicconfig.MaxLocalParentWorkflowVerificationDuration.Get(dc),
399+
396400
VisibilityPersistenceMaxReadQPS: dynamicconfig.VisibilityPersistenceMaxReadQPS.Get(dc),
397401
VisibilityPersistenceMaxWriteQPS: dynamicconfig.VisibilityPersistenceMaxWriteQPS.Get(dc),
398402
VisibilityPersistenceSlowQueryThreshold: dynamicconfig.VisibilityPersistenceSlowQueryThreshold.Get(dc),

service/history/history_engine.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -695,7 +695,7 @@ func (e *historyEngineImpl) VerifyChildExecutionCompletionRecorded(
695695
ctx context.Context,
696696
req *historyservice.VerifyChildExecutionCompletionRecordedRequest,
697697
) (*historyservice.VerifyChildExecutionCompletionRecordedResponse, error) {
698-
return verifychildworkflowcompletionrecorded.Invoke(ctx, req, e.workflowConsistencyChecker)
698+
return verifychildworkflowcompletionrecorded.Invoke(ctx, req, e.workflowConsistencyChecker, e.shardContext)
699699
}
700700

701701
func (e *historyEngineImpl) ReplicateEventsV2(

0 commit comments

Comments
 (0)