Skip to content

Commit ffae22b

Browse files
Fix handling of context errors (#8702)
## What changed? Context cancelations are getting incorrectly translated to `Unavailable` service errors, which lead to Visibility Availability Alert false positives. This code change adds branches to check context error types before casting to the resulting `serviceerror` for both ElasticSearch and SQL visibility stores. ## Why? Separate client and service side errors. ## How did you test it? - [X] built - [X] run locally and tested manually - [X] covered by existing tests - [ ] added new unit test(s) - [ ] added new functional test(s) --------- Co-authored-by: Rodrigo Zhou <[email protected]>
1 parent a8d1c3e commit ffae22b

File tree

3 files changed

+24
-18
lines changed

3 files changed

+24
-18
lines changed

common/persistence/visibility/store/elasticsearch/visibility_store.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ func (s *VisibilityStore) countGroupByWorkflowExecutions(
440440
termsAgg,
441441
)
442442
if err != nil {
443-
return nil, err
443+
return nil, ConvertElasticsearchClientError("CountWorkflowExecutions failed", err)
444444
}
445445
return s.parseCountGroupByResponse(esResponse, groupByFields)
446446
}
@@ -1154,14 +1154,19 @@ func finishParseJSONValue(val interface{}, t enumspb.IndexedValueType) (interfac
11541154

11551155
func ConvertElasticsearchClientError(message string, err error) error {
11561156
errMessage := fmt.Sprintf("%s: %s", message, detailedErrorMessage(err))
1157-
switch e := err.(type) {
1158-
case *elastic.Error:
1159-
switch e.Status {
1157+
var elasticErr *elastic.Error
1158+
switch {
1159+
case errors.As(err, &elasticErr):
1160+
switch elasticErr.Status {
11601161
case 400: // BadRequest
11611162
// Returning InvalidArgument error will prevent retry on a caller side.
11621163
return serviceerror.NewInvalidArgument(errMessage)
11631164
}
1165+
return serviceerror.NewUnavailable(errMessage)
1166+
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded):
1167+
return fmt.Errorf("%s: %w", message, err)
11641168
}
1169+
11651170
return serviceerror.NewUnavailable(errMessage)
11661171
}
11671172

common/persistence/visibility/store/sql/visibility_store.go

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,13 @@ func (s *VisibilityStore) GetName() string {
7373
return s.sqlStore.GetName()
7474
}
7575

76+
func convertSQLError(message string, err error) error {
77+
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
78+
return fmt.Errorf("%s: %w", message, err)
79+
}
80+
return serviceerror.NewUnavailable(fmt.Sprintf("%s: %v", message, err))
81+
}
82+
7683
func (s *VisibilityStore) GetIndexName() string {
7784
return s.sqlStore.GetDbName()
7885
}
@@ -160,7 +167,7 @@ func (s *VisibilityStore) DeleteWorkflowExecution(
160167
RunID: request.RunID,
161168
})
162169
if err != nil {
163-
return serviceerror.NewUnavailable(err.Error())
170+
return convertSQLError("DeleteWorkflowExecution operation failed.", err)
164171
}
165172
return nil
166173
}
@@ -225,8 +232,7 @@ func (s *VisibilityStore) listWorkflowExecutions(
225232

226233
rows, err := s.sqlStore.DB.SelectFromVisibility(ctx, *selectFilter)
227234
if err != nil {
228-
return nil, serviceerror.NewUnavailable(
229-
fmt.Sprintf("ListWorkflowExecutions operation failed. Select failed: %v", err))
235+
return nil, convertSQLError("ListWorkflowExecutions operation failed.", err)
230236
}
231237
if len(rows) == 0 {
232238
return &store.InternalListWorkflowExecutionsResponse{}, nil
@@ -296,8 +302,7 @@ func (s *VisibilityStore) listWorkflowExecutionsLegacy(
296302

297303
rows, err := s.sqlStore.DB.SelectFromVisibility(ctx, *selectFilter)
298304
if err != nil {
299-
return nil, serviceerror.NewUnavailable(
300-
fmt.Sprintf("ListWorkflowExecutions operation failed. Select failed: %v", err))
305+
return nil, convertSQLError("ListWorkflowExecutions operation failed.", err)
301306
}
302307
if len(rows) == 0 {
303308
return &store.InternalListWorkflowExecutionsResponse{}, nil
@@ -381,8 +386,7 @@ func (s *VisibilityStore) countWorkflowExecutionsLegacy(
381386

382387
count, err := s.sqlStore.DB.CountFromVisibility(ctx, *selectFilter)
383388
if err != nil {
384-
return nil, serviceerror.NewUnavailable(
385-
fmt.Sprintf("CountWorkflowExecutions operation failed. Query failed: %v", err))
389+
return nil, convertSQLError("CountWorkflowExecutions operation failed.", err)
386390
}
387391

388392
return &manager.CountWorkflowExecutionsResponse{Count: count}, nil
@@ -443,8 +447,7 @@ func (s *VisibilityStore) countWorkflowExecutions(
443447

444448
count, err := s.sqlStore.DB.CountFromVisibility(ctx, *selectFilter)
445449
if err != nil {
446-
return nil, serviceerror.NewUnavailable(
447-
fmt.Sprintf("CountWorkflowExecutions operation failed. Query failed: %v", err))
450+
return nil, convertSQLError("CountWorkflowExecutions operation failed.", err)
448451
}
449452

450453
return &manager.CountWorkflowExecutionsResponse{Count: count}, nil
@@ -469,8 +472,7 @@ func (s *VisibilityStore) countGroupByWorkflowExecutions(
469472

470473
rows, err := s.sqlStore.DB.CountGroupByFromVisibility(ctx, *selectFilter)
471474
if err != nil {
472-
return nil, serviceerror.NewUnavailable(
473-
fmt.Sprintf("CountWorkflowExecutions operation failed. Query failed: %v", err))
475+
return nil, convertSQLError("CountWorkflowExecutions operation failed.", err)
474476
}
475477
resp := &manager.CountWorkflowExecutionsResponse{
476478
Count: 0,
@@ -505,8 +507,7 @@ func (s *VisibilityStore) GetWorkflowExecution(
505507
RunID: request.RunID,
506508
})
507509
if err != nil {
508-
return nil, serviceerror.NewUnavailable(
509-
fmt.Sprintf("GetWorkflowExecution operation failed. Select failed: %v", err))
510+
return nil, convertSQLError("GetWorkflowExecution operation failed.", err)
510511
}
511512
info, err := s.rowToInfo(row, request.Namespace)
512513
if err != nil {

docs/architecture/retry.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ It's important to note that a special retry policy is used for `ResourceExhauste
1313

1414
## Service Error
1515

16-
Service errors are specific Go errors that can generate a gRCP `Status` (see [status.proto](https://github.com/grpc/grpc/blob/master/src/proto/grpc/status/status.proto)).
16+
Service errors are specific Go errors that can generate a gRPC `Status` (see [status.proto](https://github.com/grpc/grpc/blob/master/src/proto/grpc/status/status.proto)).
1717
A gRPC status contains a gRPC `Code` (see [code.proto](https://github.com/googleapis/googleapis/blob/master/google/rpc/code.proto)), a message and (optionally) a payload with more details.
1818

1919
```go

0 commit comments

Comments
 (0)