Skip to content

Commit 57e05aa

Browse files
HttpTimeoutPolicy Improvements Phase 1: Fixes QueryPlan requests retry gaps (#5476)
# Pull Request Template ## Description This PR fixes: 1. Allow QueryPlan requests to do local retries (within region), before doing cross regional retries. 2. Removing dependency on HttpMethod based retry, Enable all `GET` calls without `DocumentServiceRequest` to retry. 3. Enable caller (CosmosHttpClientCore) to determine when to retry safely. Additional details on the updates in this PR: Allow Retries in following cases: - When `DocumentServiceRequest` is null - If the DocumentServiceRequest's ResourceType is Address - If the DocumentServiceRequest is a ReadOnlyRequest (based on existing criteria). **Follow up**: Another PR will include enhancing/adjusting the defined timeouts on `HttpTimeoutPolicyForPartitionFailover` and potentially `HttpTimeoutPolicyForThinClient` ## Type of change Please delete options that are not relevant. - [x] Bug fix (non-breaking change which fixes an issue) ## Closing issues To automatically close an issue: closes #5481 --------- Co-authored-by: Debdatta Kunda <[email protected]>
1 parent 16c2898 commit 57e05aa

10 files changed

+51
-67
lines changed

Microsoft.Azure.Cosmos/src/HttpClient/CosmosHttpClientCore.cs

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ private async Task<HttpResponseMessage> SendHttpHelperAsync(
388388
return responseMessage;
389389
}
390390

391-
bool isOutOfRetries = CosmosHttpClientCore.IsOutOfRetries(timeoutPolicy, startDateTimeUtc, timeoutEnumerator);
391+
bool isOutOfRetries = CosmosHttpClientCore.IsOutOfRetries(timeoutEnumerator);
392392
if (isOutOfRetries)
393393
{
394394
return responseMessage;
@@ -402,7 +402,7 @@ private async Task<HttpResponseMessage> SendHttpHelperAsync(
402402
datum.RecordHttpException(requestMessage, e, resourceType, requestStartTime);
403403
trace = datum.Trace;
404404
}
405-
bool isOutOfRetries = CosmosHttpClientCore.IsOutOfRetries(timeoutPolicy, startDateTimeUtc, timeoutEnumerator);
405+
bool isOutOfRetries = CosmosHttpClientCore.IsOutOfRetries(timeoutEnumerator);
406406

407407
switch (e)
408408
{
@@ -415,7 +415,7 @@ private async Task<HttpResponseMessage> SendHttpHelperAsync(
415415

416416
// Convert OperationCanceledException to 408 when the HTTP client throws it. This makes it clear that the
417417
// the request timed out and was not user canceled operation.
418-
if (isOutOfRetries || !timeoutPolicy.IsSafeToRetry(requestMessage.Method))
418+
if (isOutOfRetries || !CosmosHttpClientCore.IsSafeToRetry(documentServiceRequest))
419419
{
420420
// throw current exception (caught in transport handler)
421421
string message =
@@ -440,14 +440,14 @@ private async Task<HttpResponseMessage> SendHttpHelperAsync(
440440

441441
break;
442442
case WebException webException:
443-
if (isOutOfRetries || (!timeoutPolicy.IsSafeToRetry(requestMessage.Method) && !WebExceptionUtility.IsWebExceptionRetriable(webException)))
443+
if (isOutOfRetries || (!CosmosHttpClientCore.IsSafeToRetry(documentServiceRequest) && !WebExceptionUtility.IsWebExceptionRetriable(webException)))
444444
{
445445
throw;
446446
}
447447

448448
break;
449449
case HttpRequestException httpRequestException:
450-
if (isOutOfRetries || !timeoutPolicy.IsSafeToRetry(requestMessage.Method))
450+
if (isOutOfRetries || !CosmosHttpClientCore.IsSafeToRetry(documentServiceRequest))
451451
{
452452
throw;
453453
}
@@ -493,13 +493,24 @@ private async Task<HttpResponseMessage> SendHttpHelperAsync(
493493
}
494494

495495
private static bool IsOutOfRetries(
496-
HttpTimeoutPolicy timeoutPolicy,
497-
DateTime startDateTimeUtc,
498496
IEnumerator<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> timeoutEnumerator)
499497
{
500498
return !timeoutEnumerator.MoveNext(); // No more retries are configured
501499
}
502500

501+
private static bool IsSafeToRetry(DocumentServiceRequest documentServiceRequest)
502+
{
503+
// Three scenarios are safely retriable:
504+
// 1) If request is null since they are originated from GetAsync calls
505+
// 2) If request is read-only
506+
// 3) If request is an address request.
507+
if (documentServiceRequest == null)
508+
{
509+
return true;
510+
}
511+
return documentServiceRequest.IsReadOnlyRequest || documentServiceRequest.ResourceType == ResourceType.Address;
512+
}
513+
503514
private async Task<HttpResponseMessage> ExecuteHttpHelperAsync(
504515
HttpRequestMessage requestMessage,
505516
ResourceType resourceType,

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicy.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ internal abstract class HttpTimeoutPolicy
1313
public abstract string TimeoutPolicyName { get; }
1414
public abstract int TotalRetryCount { get; }
1515
public abstract IEnumerator<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> GetTimeoutEnumerator();
16-
public abstract bool IsSafeToRetry(HttpMethod httpMethod);
1716

1817
public abstract bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage);
1918

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicyControlPlaneRead.cs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,6 @@ private HttpTimeoutPolicyControlPlaneRead()
3232
return this.TimeoutsAndDelays.GetEnumerator();
3333
}
3434

35-
// This is for control plane reads which should always be safe to retry on.
36-
public override bool IsSafeToRetry(HttpMethod httpMethod)
37-
{
38-
return true;
39-
}
40-
4135
public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
4236
{
4337
return false;

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicyControlPlaneRetriableHotPath.cs

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,6 @@ private HttpTimeoutPolicyControlPlaneRetriableHotPath(bool shouldThrow503OnTimeo
3636
return this.TimeoutsAndDelays.GetEnumerator();
3737
}
3838

39-
// The hot path should always be safe to retires since it should be retrieving meta data
40-
// information that is not idempotent.
41-
public override bool IsSafeToRetry(HttpMethod httpMethod)
42-
{
43-
return true;
44-
}
45-
4639
public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
4740
{
4841
if (responseMessage == null)
@@ -55,11 +48,6 @@ public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, Ht
5548
return false;
5649
}
5750

58-
if (!this.IsSafeToRetry(requestHttpMethod))
59-
{
60-
return false;
61-
}
62-
6351
return true;
6452
}
6553

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicyDefault.cs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,6 @@ private HttpTimeoutPolicyDefault(bool shouldThrow503OnTimeout)
3636
return this.TimeoutsAndDelays.GetEnumerator();
3737
}
3838

39-
// Assume that it is not safe to retry unless it is a get method.
40-
// Create and other operations could have succeeded even though a timeout occurred.
41-
public override bool IsSafeToRetry(HttpMethod httpMethod)
42-
{
43-
return httpMethod == HttpMethod.Get;
44-
}
45-
4639
public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
4740
{
4841
return false;

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicyForPartitionFailover.cs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,6 @@ private HttpTimeoutPolicyForPartitionFailover(bool shouldThrow503OnTimeout)
3535
return this.TimeoutsAndDelays.GetEnumerator();
3636
}
3737

38-
// Assume that it is not safe to retry unless it is a get method.
39-
// Create and other operations could have succeeded even though a timeout occurred.
40-
public override bool IsSafeToRetry(HttpMethod httpMethod)
41-
{
42-
return httpMethod == HttpMethod.Get;
43-
}
44-
4538
public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
4639
{
4740
return false;

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicyForThinClient.cs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,6 @@ private HttpTimeoutPolicyForThinClient(
3939
return this.TimeoutsAndDelays.GetEnumerator();
4040
}
4141

42-
public override bool IsSafeToRetry(HttpMethod httpMethod)
43-
{
44-
return this.shouldRetry;
45-
}
46-
4742
public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
4843
{
4944
if (responseMessage == null)
@@ -56,7 +51,7 @@ public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, Ht
5651
return false;
5752
}
5853

59-
if (!this.IsSafeToRetry(requestHttpMethod))
54+
if (!this.shouldRetry)
6055
{
6156
return false;
6257
}

Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicyNoRetry.cs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,6 @@ private HttpTimeoutPolicyNoRetry()
3131
return this.TimeoutsAndDelays.GetEnumerator();
3232
}
3333

34-
// Always Unsafe to retry
35-
public override bool IsSafeToRetry(HttpMethod httpMethod)
36-
{
37-
return false;
38-
}
39-
4034
public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
4135
{
4236
return false;

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosHttpClientCoreTests.cs

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@ async Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, Cancellatio
123123
resourceType: ResourceType.Collection,
124124
timeoutPolicy: currentTimeoutPolicy.Key,
125125
clientSideRequestStatistics: new ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow, trace),
126-
cancellationToken: default);
126+
cancellationToken: default,
127+
documentServiceRequest: CreateDocumentServiceRequestByOperation(ResourceType.Collection, OperationType.Read));
127128

128129
Assert.AreEqual(HttpStatusCode.OK, responseMessage.StatusCode);
129130
}
@@ -266,7 +267,8 @@ Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken
266267
resourceType: ResourceType.Collection,
267268
timeoutPolicy: HttpTimeoutPolicyDefault.Instance,
268269
clientSideRequestStatistics: new ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow, trace),
269-
cancellationToken: default);
270+
cancellationToken: default,
271+
documentServiceRequest: CreateDocumentServiceRequestByOperation(ResourceType.Collection, OperationType.Read));
270272
}
271273
}
272274
catch (Exception)
@@ -281,7 +283,7 @@ Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken
281283
public async Task HttpTimeoutThrow503TestAsync()
282284
{
283285

284-
async Task TestScenarioAsync(HttpMethod method, ResourceType resourceType, HttpTimeoutPolicy timeoutPolicy, Type expectedException, int expectedNumberOfRetrys)
286+
async Task TestScenarioAsync(HttpMethod method, ResourceType resourceType, OperationType operationType, HttpTimeoutPolicy timeoutPolicy, Type expectedException, int expectedNumberOfRetrys)
285287
{
286288
int count = 0;
287289
Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken cancellationToken)
@@ -306,7 +308,8 @@ Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken
306308
resourceType: resourceType,
307309
timeoutPolicy: timeoutPolicy,
308310
clientSideRequestStatistics: new ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow, trace),
309-
cancellationToken: default);
311+
cancellationToken: default,
312+
documentServiceRequest: CreateDocumentServiceRequestByOperation(resourceType, operationType));
310313
}
311314
}
312315
catch (Exception e)
@@ -328,19 +331,19 @@ Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken
328331
}
329332

330333
//Data plane read
331-
await TestScenarioAsync(HttpMethod.Get, ResourceType.Document, HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout, typeof(CosmosException), 3);
334+
await TestScenarioAsync(HttpMethod.Get, ResourceType.Document, OperationType.Read, HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout, typeof(CosmosException), 3);
332335

333336
//Data plane write (Should throw a 408 OperationCanceledException rather than a 503)
334-
await TestScenarioAsync(HttpMethod.Post, ResourceType.Document, HttpTimeoutPolicyDefault.Instance, typeof(TaskCanceledException), 1);
337+
await TestScenarioAsync(HttpMethod.Post, ResourceType.Document, OperationType.Upsert, HttpTimeoutPolicyDefault.Instance, typeof(TaskCanceledException), 1);
335338

336339
//Meta data read
337-
await TestScenarioAsync(HttpMethod.Get, ResourceType.Database, HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout, typeof(CosmosException), 3);
340+
await TestScenarioAsync(HttpMethod.Get, ResourceType.Database, OperationType.Read, HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout, typeof(CosmosException), 3);
338341

339342
//Query plan read (note all query plan operations are reads).
340-
await TestScenarioAsync(HttpMethod.Get, ResourceType.Document, HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout, typeof(CosmosException), 3);
343+
await TestScenarioAsync(HttpMethod.Get, ResourceType.Document, OperationType.Read, HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout, typeof(CosmosException), 3);
341344

342345
//Metadata Write (Should throw a 408 OperationCanceledException rather than a 503)
343-
await TestScenarioAsync(HttpMethod.Post, ResourceType.Document, HttpTimeoutPolicyDefault.Instance, typeof(TaskCanceledException), 1);
346+
await TestScenarioAsync(HttpMethod.Post, ResourceType.Document, OperationType.Upsert, HttpTimeoutPolicyDefault.Instance, typeof(TaskCanceledException), 1);
344347
}
345348

346349
[TestMethod]
@@ -433,7 +436,8 @@ async Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, Cancellatio
433436
resourceType: ResourceType.Document,
434437
timeoutPolicy: HttpTimeoutPolicyControlPlaneRetriableHotPath.Instance,
435438
clientSideRequestStatistics: new ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow, trace),
436-
cancellationToken: default);
439+
cancellationToken: default,
440+
documentServiceRequest: documentServiceRequest);
437441

438442
Assert.AreEqual(HttpStatusCode.OK, responseMessage.StatusCode);
439443
}
@@ -492,7 +496,7 @@ public void CreateHttpClientHandlerCreatesCorrectValueType()
492496
public async Task HttpTimeoutPolicyForThinClientOn503TestAsync()
493497
{
494498

495-
async Task TestScenarioAsync(HttpMethod method, ResourceType resourceType, HttpTimeoutPolicy timeoutPolicy, Type expectedException, int expectedNumberOfRetrys)
499+
async Task TestScenarioAsync(HttpMethod method, ResourceType resourceType, OperationType operationType, HttpTimeoutPolicy timeoutPolicy, Type expectedException, int expectedNumberOfRetrys)
496500
{
497501
int count = 0;
498502
Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken cancellationToken)
@@ -517,7 +521,8 @@ Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken
517521
resourceType: resourceType,
518522
timeoutPolicy: timeoutPolicy,
519523
clientSideRequestStatistics: new ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow, trace),
520-
cancellationToken: default);
524+
cancellationToken: default,
525+
documentServiceRequest: CreateDocumentServiceRequestByOperation(resourceType, operationType));
521526
}
522527
}
523528
catch (Exception e)
@@ -542,6 +547,7 @@ Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken
542547
await TestScenarioAsync(
543548
method: HttpMethod.Get,
544549
resourceType: ResourceType.Document,
550+
operationType: OperationType.Read,
545551
timeoutPolicy: HttpTimeoutPolicy.GetTimeoutPolicy(
546552
documentServiceRequest: CosmosHttpClientCoreTests.CreateDocumentServiceRequestByOperation(ResourceType.Document, OperationType.Read),
547553
isPartitionLevelFailoverEnabled: false,
@@ -553,6 +559,7 @@ await TestScenarioAsync(
553559
await TestScenarioAsync(
554560
method: HttpMethod.Get,
555561
resourceType: ResourceType.Document,
562+
operationType: OperationType.Read,
556563
timeoutPolicy: HttpTimeoutPolicy.GetTimeoutPolicy(
557564
documentServiceRequest: CosmosHttpClientCoreTests.CreateDocumentServiceRequestByOperation(ResourceType.Document, OperationType.Query),
558565
isPartitionLevelFailoverEnabled: false,
@@ -564,6 +571,7 @@ await TestScenarioAsync(
564571
await TestScenarioAsync(
565572
method: HttpMethod.Post,
566573
resourceType: ResourceType.Document,
574+
operationType: OperationType.Upsert,
567575
timeoutPolicy: HttpTimeoutPolicy.GetTimeoutPolicy(
568576
documentServiceRequest: CosmosHttpClientCoreTests.CreateDocumentServiceRequestByOperation(ResourceType.Document, OperationType.Create),
569577
isPartitionLevelFailoverEnabled: false,
@@ -575,6 +583,7 @@ await TestScenarioAsync(
575583
await TestScenarioAsync(
576584
method: HttpMethod.Get,
577585
resourceType: ResourceType.Database,
586+
operationType: OperationType.Read,
578587
timeoutPolicy: HttpTimeoutPolicy.GetTimeoutPolicy(
579588
documentServiceRequest: CosmosHttpClientCoreTests.CreateDocumentServiceRequestByOperation(ResourceType.Database, OperationType.Read),
580589
isPartitionLevelFailoverEnabled: false,
@@ -583,6 +592,7 @@ await TestScenarioAsync(
583592
expectedNumberOfRetrys: 3);
584593
}
585594

595+
586596
private static DocumentServiceRequest CreateDocumentServiceRequestByOperation(
587597
ResourceType resourceType,
588598
OperationType operationType)

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayStoreModelTest.cs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -889,15 +889,22 @@ public async Task GatewayStatsDurationTest()
889889
DocumentClientEventSource.Instance);
890890

891891
using (ITrace trace = Tracing.Trace.GetRootTrace(nameof(GatewayStatsDurationTest)))
892-
{
893-
892+
{
893+
894894
Tracing.TraceData.ClientSideRequestStatisticsTraceDatum clientSideRequestStatistics = new Tracing.TraceData.ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow, trace);
895895

896896
await cosmosHttpClient.SendHttpAsync(() => new ValueTask<HttpRequestMessage>(new HttpRequestMessage(HttpMethod.Get, "http://someuri.com")),
897897
ResourceType.Document,
898898
HttpTimeoutPolicyDefault.InstanceShouldThrow503OnTimeout,
899899
clientSideRequestStatistics,
900-
CancellationToken.None);
900+
CancellationToken.None,
901+
documentServiceRequest: new DocumentServiceRequest(
902+
OperationType.Read,
903+
ResourceType.Document,
904+
$"dbs/dummy_db_id/colls/dummy_ct_id",
905+
body: null,
906+
AuthorizationTokenType.PrimaryMasterKey,
907+
headers: null));
901908

902909
Assert.AreEqual(clientSideRequestStatistics.HttpResponseStatisticsList.Count, 2);
903910
// The duration is calculated using date times which can cause the duration to be slightly off. This allows for up to 15 Ms of variance.

0 commit comments

Comments
 (0)