Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 8be321f

Browse files
committed
Track number of hosts affected by the rate limiter
Follow-up to #13534. Part of #13356.
1 parent 149ac1d commit 8be321f

File tree

3 files changed

+56
-15
lines changed

3 files changed

+56
-15
lines changed

synapse/metrics/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,17 @@ def collect() -> Iterable[Metric]:
7878
# TODO Do something nicer about this.
7979
RegistryProxy = cast(CollectorRegistry, _RegistryProxy)
8080

81+
T = TypeVar("T")


def count(func: Callable[[T], bool], it: Iterable[T]) -> int:
    """Return the number of items in `it` for which `func` returns true.

    Args:
        func: predicate applied to each item; any truthy return counts.
        it: the iterable to scan. It is fully consumed.

    Returns:
        The number of items for which the predicate is truthy.
    """
    # sum() over a generator expression is the idiomatic (and C-speed)
    # replacement for a manual counting loop, and stays O(1) in memory.
    return sum(1 for x in it if func(x))
91+
8192

8293
@attr.s(slots=True, hash=True, auto_attribs=True)
8394
class LaterGauge(Collector):
@@ -475,6 +486,7 @@ def register_threadpool(name: str, threadpool: ThreadPool) -> None:
475486
"MetricsResource",
476487
"generate_latest",
477488
"start_http_server",
489+
"count",
478490
"LaterGauge",
479491
"InFlightGauge",
480492
"GaugeBucketCollector",

synapse/notifier.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
from synapse.logging import issue9533_logger
4141
from synapse.logging.context import PreserveLoggingContext
4242
from synapse.logging.opentracing import log_kv, start_active_span
43-
from synapse.metrics import LaterGauge
43+
from synapse.metrics import count, LaterGauge
4444
from synapse.streams.config import PaginationConfig
4545
from synapse.types import (
4646
JsonDict,
@@ -68,16 +68,6 @@
6868
T = TypeVar("T")
6969

7070

71-
# TODO(paul): Should be shared somewhere
72-
def count(func: Callable[[T], bool], it: Iterable[T]) -> int:
73-
"""Return the number of items in it for which func returns true."""
74-
n = 0
75-
for x in it:
76-
if func(x):
77-
n += 1
78-
return n
79-
80-
8171
class _NotificationListener:
8272
"""This represents a single client connection to the events stream.
8373
The events stream handler will have yielded to the deferred, so to

synapse/util/ratelimitutils.py

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import typing
1919
from typing import Any, DefaultDict, Iterator, List, Set
2020

21-
from prometheus_client.core import Counter
21+
from prometheus_client.core import Counter, Gauge
2222

2323
from twisted.internet import defer
2424

@@ -29,6 +29,7 @@
2929
make_deferred_yieldable,
3030
run_in_background,
3131
)
32+
from synapse.metrics import count, LaterGauge
3233
from synapse.util import Clock
3334

3435
if typing.TYPE_CHECKING:
@@ -51,6 +52,34 @@ def new_limiter() -> "_PerHostRatelimiter":
5152
str, "_PerHostRatelimiter"
5253
] = collections.defaultdict(new_limiter)
5354

55+
# We track the number of affected hosts per time-period so we can
56+
# differentiate one really noisy homeserver from a general
57+
# ratelimit tuning problem across the federation.
58+
LaterGauge(
59+
"synapse_rate_limit_sleep_affected_hosts",
60+
"Number of hosts that had requests put to sleep",
61+
[],
62+
lambda: count(
63+
bool,
64+
[
65+
ratelimiter.should_sleep()
66+
for ratelimiter in self.ratelimiters.values()
67+
],
68+
),
69+
)
70+
LaterGauge(
71+
"synapse_rate_limit_reject_affected_hosts",
72+
"Number of hosts that had requests rejected",
73+
[],
74+
lambda: count(
75+
bool,
76+
[
77+
ratelimiter.should_reject()
78+
for ratelimiter in self.ratelimiters.values()
79+
],
80+
),
81+
)
82+
5483
def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]":
5584
"""Used to ratelimit an incoming request from a given host
5685
@@ -116,6 +145,17 @@ def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]":
116145
finally:
117146
self._on_exit(request_id)
118147

148+
def should_reject(self) -> bool:
    """Whether an incoming request should be rejected outright.

    True when we already have too many requests queued up for this host —
    counting both requests currently sleeping and those in the ready
    queue — i.e. the combined backlog exceeds ``reject_limit``.
    """
    queue_size = len(self.ready_request_queue) + len(self.sleeping_requests)
    return queue_size > self.reject_limit
155+
156+
def should_sleep(self) -> bool:
    """Whether new requests from this host should be put to sleep.

    True when the number of recent request timestamps tracked in
    ``request_times`` exceeds ``sleep_limit``.
    """
    return len(self.request_times) > self.sleep_limit
158+
119159
def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
120160
time_now = self.clock.time_msec()
121161

@@ -126,8 +166,7 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
126166

127167
# reject the request if we already have too many queued up (either
128168
# sleeping or in the ready queue).
129-
queue_size = len(self.ready_request_queue) + len(self.sleeping_requests)
130-
if queue_size > self.reject_limit:
169+
if self.should_reject():
131170
logger.debug("Ratelimiter(%s): rejecting request", self.host)
132171
rate_limit_reject_counter.inc()
133172
raise LimitExceededError(
@@ -157,7 +196,7 @@ def queue_request() -> "defer.Deferred[None]":
157196
len(self.request_times),
158197
)
159198

160-
if len(self.request_times) > self.sleep_limit:
199+
if self.should_sleep():
161200
logger.debug(
162201
"Ratelimiter(%s) [%s]: sleeping request for %f sec",
163202
self.host,

0 commit comments

Comments
 (0)