Skip to content

Commit 0a1aa5f

Browse files
authored
Add Dynamic Config for safe keepalive rollout (#7809)
## What changed? Add Dynamic Config for safe keepalive rollout ## Why? We need ability to switch on/off the keepalive setting for keepalive config ## How did you test it? - [x] built - [x] run locally and tested manually - [x] covered by existing tests - [ ] added new unit test(s) - [ ] added new functional test(s) ## Potential risks No risk is expected.
1 parent 7b9e109 commit 0a1aa5f

File tree

7 files changed

+45
-13
lines changed

7 files changed

+45
-13
lines changed

common/dynamicconfig/constants.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,18 @@ these warning are not emitted if the value is set to 0 or less`,
235235
`OperatorRPSRatio is the percentage of the rate limit provided to priority rate limiters that should be used for
236236
operator API calls (highest priority). Should be >0.0 and <= 1.0 (defaults to 20% if not specified)`,
237237
)
238+
// TODO: The following 2 configs should be removed once server keepalive and client keepalive are enabled by default
239+
EnableInternodeServerKeepAlive = NewGlobalBoolSetting(
240+
"system.enableInternodeServerKeepAlive",
241+
false,
242+
`enableInternodeServerKeepAlive is the config to enable keep alive for inter-node connections on server side.`,
243+
)
244+
EnableInternodeClientKeepAlive = NewGlobalBoolSetting(
245+
"system.enableInternodeClientKeepAlive",
246+
false,
247+
`enableInternodeClientKeepAlive is the config to enable keep alive for inter-node connections on client side.`,
248+
)
249+
238250
PersistenceQPSBurstRatio = NewGlobalFloatSetting(
239251
"system.persistenceQPSBurstRatio",
240252
1.0,

common/resource/fx.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ type (
7373
// See LifetimeHooksModule for detail
7474
var Module = fx.Options(
7575
persistenceClient.Module,
76+
dynamicconfig.Module,
7677
fx.Provide(HostNameProvider),
7778
fx.Provide(TimeSourceProvider),
7879
cluster.MetadataLifetimeHooksModule,
@@ -339,6 +340,7 @@ func RPCFactoryProvider(
339340
resolver *membership.GRPCResolver,
340341
tracingStatsHandler telemetry.ClientStatsHandler,
341342
monitor membership.Monitor,
343+
dc *dynamicconfig.Collection,
342344
) (common.RPCFactory, error) {
343345
frontendURL, frontendHTTPURL, frontendHTTPPort, frontendTLSConfig, err := getFrontendConnectionDetails(cfg, tlsConfigProvider, resolver)
344346
if err != nil {
@@ -349,7 +351,9 @@ func RPCFactoryProvider(
349351
if tracingStatsHandler != nil {
350352
options = append(options, grpc.WithStatsHandler(tracingStatsHandler))
351353
}
352-
return rpc.NewFactory(
354+
enableServerKeepalive := dynamicconfig.EnableInternodeServerKeepAlive.Get(dc)()
355+
enableClientKeepalive := dynamicconfig.EnableInternodeClientKeepAlive.Get(dc)()
356+
factory := rpc.NewFactory(
353357
cfg,
354358
svcName,
355359
logger,
@@ -360,7 +364,11 @@ func RPCFactoryProvider(
360364
frontendTLSConfig,
361365
options,
362366
monitor,
363-
), nil
367+
)
368+
factory.EnableInternodeServerKeepalive = enableServerKeepalive
369+
factory.EnableInternodeClientKeepalive = enableClientKeepalive
370+
logger.Debug(fmt.Sprintf("RPC factory created. enableServerKeepalive: %v, enableClientKeepalive: %v", enableServerKeepalive, enableClientKeepalive))
371+
return factory, nil
364372
}
365373

366374
func FrontendHTTPClientCacheProvider(

common/rpc/rpc.go

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package rpc
33
import (
44
"crypto/tls"
55
"fmt"
6+
"math"
67
"math/rand"
78
"net"
89
"net/http"
@@ -23,6 +24,7 @@ import (
2324
"go.temporal.io/server/temporal/environment"
2425
"google.golang.org/grpc"
2526
"google.golang.org/grpc/credentials"
27+
"google.golang.org/grpc/keepalive"
2628
)
2729

2830
var _ common.RPCFactory = (*RPCFactory)(nil)
@@ -45,6 +47,10 @@ type RPCFactory struct {
4547
// A OnceValues wrapper for createLocalFrontendHTTPClient.
4648
localFrontendClient func() (*common.FrontendHTTPClient, error)
4749
interNodeGrpcConnections cache.Cache
50+
51+
// TODO: Remove these flags once the keepalive settings are rolled out
52+
EnableInternodeServerKeepalive bool
53+
EnableInternodeClientKeepalive bool
4854
}
4955

5056
// NewFactory builds a new RPCFactory
@@ -115,6 +121,12 @@ func (d *RPCFactory) GetRemoteClusterClientConfig(hostname string) (*tls.Config,
115121
func (d *RPCFactory) GetInternodeGRPCServerOptions() ([]grpc.ServerOption, error) {
116122
var opts []grpc.ServerOption
117123

124+
if d.EnableInternodeServerKeepalive {
125+
rpcConfig := d.config.Services[string(d.serviceName)].RPC
126+
kep := rpcConfig.KeepAliveServerConfig.GetKeepAliveEnforcementPolicy()
127+
kp := rpcConfig.KeepAliveServerConfig.GetKeepAliveServerParameters()
128+
opts = append(opts, grpc.KeepaliveEnforcementPolicy(kep), grpc.KeepaliveParams(kp))
129+
}
118130
if d.tlsFactory != nil {
119131
serverConfig, err := d.tlsFactory.GetInternodeServerConfig()
120132
if err != nil {
@@ -126,11 +138,6 @@ func (d *RPCFactory) GetInternodeGRPCServerOptions() ([]grpc.ServerOption, error
126138
opts = append(opts, grpc.Creds(credentials.NewTLS(serverConfig)))
127139
}
128140

129-
rpcConfig := d.config.Services[string(d.serviceName)].RPC
130-
kep := rpcConfig.KeepAliveServerConfig.GetKeepAliveEnforcementPolicy()
131-
kp := rpcConfig.KeepAliveServerConfig.GetKeepAliveServerParameters()
132-
opts = append(opts, grpc.KeepaliveEnforcementPolicy(kep), grpc.KeepaliveParams(kp))
133-
134141
return opts, nil
135142
}
136143

@@ -251,8 +258,17 @@ func (d *RPCFactory) dial(hostName string, tlsClientConfig *tls.Config, dialOpti
251258
}
252259

253260
func (d *RPCFactory) getClientKeepAliveConfig(serviceName primitives.ServiceName) grpc.DialOption {
254-
serviceConfig := d.config.Services[string(serviceName)]
255-
return grpc.WithKeepaliveParams(serviceConfig.RPC.ClientConnectionConfig.GetKeepAliveClientParameters())
261+
// default keepalive settings for clients
262+
params := keepalive.ClientParameters{
263+
Time: time.Duration(math.MaxInt64),
264+
Timeout: 20 * time.Second,
265+
PermitWithoutStream: false,
266+
}
267+
if d.EnableInternodeClientKeepalive {
268+
serviceConfig := d.config.Services[string(serviceName)]
269+
params = serviceConfig.RPC.ClientConnectionConfig.GetKeepAliveClientParameters()
270+
}
271+
return grpc.WithKeepaliveParams(params)
256272
}
257273

258274
func (d *RPCFactory) GetTLSConfigProvider() encryption.TLSConfigProvider {

service/frontend/fx.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ type (
6161
var Module = fx.Options(
6262
resource.Module,
6363
scheduler.Module,
64-
dynamicconfig.Module,
6564
deployment.Module,
6665
workerdeployment.Module,
6766
// Note that with this approach routes may be registered in arbitrary order.

service/history/fx.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ var Module = fx.Options(
5151
events.Module,
5252
cache.Module,
5353
archival.Module,
54-
dynamicconfig.Module,
5554
fx.Provide(ConfigProvider), // might be worth just using provider for configs.Config directly
5655
fx.Provide(workflow.NewCommandHandlerRegistry),
5756
fx.Provide(RetryableInterceptorProvider),

service/matching/fx.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ import (
2828

2929
var Module = fx.Options(
3030
resource.Module,
31-
dynamicconfig.Module,
3231
deployment.Module,
3332
workerdeployment.Module,
3433
fx.Provide(ConfigProvider),

service/worker/fx.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ var Module = fx.Options(
4242
deployment.Module, // [cleanup-wv-pre-release]
4343
workerdeployment.Module,
4444
dlq.Module,
45-
dynamicconfig.Module,
4645
fx.Provide(
4746
func(c resource.HistoryClient) dlq.HistoryClient {
4847
return c

0 commit comments

Comments
 (0)