@@ -50,12 +50,14 @@ internal class GatewayAddressCache : IAddressCache, IDisposable
5050 private readonly ICosmosAuthorizationTokenProvider tokenProvider ;
5151 private readonly bool enableTcpConnectionEndpointRediscovery ;
5252
53+ private readonly SemaphoreSlim semaphore ;
5354 private readonly CosmosHttpClient httpClient ;
5455 private readonly bool isReplicaAddressValidationEnabled ;
5556
5657 private Tuple < PartitionKeyRangeIdentity , PartitionAddressInformation > masterPartitionAddressCache ;
5758 private DateTime suboptimalMasterPartitionTimestamp ;
5859 private bool disposedValue ;
60+ private bool validateUnknownReplicas ;
5961 private IOpenConnectionsHandler openConnectionsHandler ;
6062
6163 public GatewayAddressCache (
@@ -90,8 +92,10 @@ public GatewayAddressCache(
9092 Constants . Properties . Protocol ,
9193 GatewayAddressCache . ProtocolString ( this . protocol ) ) ;
9294
95+ this . semaphore = new SemaphoreSlim ( 1 , 1 ) ;
9396 this . openConnectionsHandler = openConnectionsHandler ;
9497 this . isReplicaAddressValidationEnabled = replicaAddressValidationEnabled ;
98+ this . validateUnknownReplicas = false ;
9599 }
96100
97101 public Uri ServiceEndpoint => this . serviceEndpoint ;
@@ -120,6 +124,14 @@ public async Task OpenConnectionsAsync(
120124 List < Task > tasks = new ( ) ;
121125 int batchSize = GatewayAddressCache . DefaultBatchSize ;
122126
127+ // By design, the Unknown replicas are validated only when the following two conditions are met:
128+ // 1) The CosmosClient is initiated using the CreateAndInitializeAsync() flow.
129+ // 2) The advanced replica selection feature is enabled.
130+ if ( shouldOpenRntbdChannels )
131+ {
132+ this . validateUnknownReplicas = true ;
133+ }
134+
123135#if ! ( NETSTANDARD15 || NETSTANDARD16 )
124136#if NETSTANDARD20
125137 // GetEntryAssembly returns null when loaded from native netstandard2.0
@@ -302,11 +314,12 @@ public async Task<PartitionAddressInformation> TryGetAddressesAsync(
302314 . ReplicaTransportAddressUris
303315 . Any ( x => x . ShouldRefreshHealthStatus ( ) ) )
304316 {
305- Task refreshAddressesInBackgroundTask = Task . Run ( async ( ) =>
317+ bool slimAcquired = await this . semaphore . WaitAsync ( 0 ) ;
318+ try
306319 {
307- try
320+ if ( slimAcquired )
308321 {
309- await this . serverPartitionAddressCache . RefreshAsync (
322+ this . serverPartitionAddressCache . Refresh (
310323 key : partitionKeyRangeIdentity ,
311324 singleValueInitFunc : ( currentCachedValue ) => this . GetAddressesForRangeIdAsync (
312325 request ,
@@ -315,14 +328,21 @@ await this.serverPartitionAddressCache.RefreshAsync(
315328 partitionKeyRangeIdentity . PartitionKeyRangeId ,
316329 forceRefresh : true ) ) ;
317330 }
318- catch ( Exception ex )
331+ else
319332 {
320- DefaultTrace . TraceWarning ( "Failed to refresh addresses in the background for the collection rid: {0} with exception : {1}. '{2}'" ,
333+ DefaultTrace . TraceVerbose ( "Failed to refresh addresses in the background for the collection rid: {0}, partition key range id : {1}, because the semaphore is already acquired . '{2}'" ,
321334 partitionKeyRangeIdentity . CollectionRid ,
322- ex ,
335+ partitionKeyRangeIdentity . PartitionKeyRangeId ,
323336 System . Diagnostics . Trace . CorrelationManager . ActivityId ) ;
324337 }
325- } ) ;
338+ }
339+ finally
340+ {
341+ if ( slimAcquired )
342+ {
343+ this . semaphore . Release ( ) ;
344+ }
345+ }
326346 }
327347
328348 return addresses ;
@@ -1008,18 +1028,26 @@ private static PartitionAddressInformation MergeAddresses(
10081028 /// Returns a list of <see cref="TransportAddressUri"/> needed to validate their health status. Validating
10091029 /// a uri is done by opening Rntbd connection to the backend replica, which is a costly operation by nature. Therefore
10101030 /// validating both Unhealthy and Unknown replicas at the same time could impose a high CPU utilization. To avoid this
1011- /// situation, the RntbdOpenConnectionHandler has good concurrency control mechanism to open the connections gracefully/>.
1031+ /// situation, the RntbdOpenConnectionHandler has good concurrency control mechanism to open the connections gracefully.
1032+ /// By default, this method returns only the replicas in the UnhealthyPending state, whose connectivity status needs to be validated. The
1033+ /// Unknown replicas are validated only when the CosmosClient is initiated using the CreateAndInitializeAsync() flow.
10121034 /// </summary>
10131035 /// <param name="transportAddresses">A read only list of <see cref="TransportAddressUri"/>s.</param>
10141036 /// <returns>A list of <see cref="TransportAddressUri"/> that needs to validate their status.</returns>
10151037 private IEnumerable < TransportAddressUri > GetAddressesNeededToValidateStatus (
10161038 IReadOnlyList < TransportAddressUri > transportAddresses )
10171039 {
1018- return transportAddresses
1019- . Where ( address => address
1040+ return this . validateUnknownReplicas
1041+ ? transportAddresses
1042+ . Where ( address => address
1043+ . GetCurrentHealthState ( )
1044+ . GetHealthStatus ( ) is
1045+ TransportAddressHealthState . HealthStatus . UnhealthyPending or
1046+ TransportAddressHealthState . HealthStatus . Unknown )
1047+ : transportAddresses
1048+ . Where ( address => address
10201049 . GetCurrentHealthState ( )
10211050 . GetHealthStatus ( ) is
1022- TransportAddressHealthState . HealthStatus . Unknown or
10231051 TransportAddressHealthState . HealthStatus . UnhealthyPending ) ;
10241052 }
10251053
10251053
0 commit comments