Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 480d7e2

Browse files
committed
Merge pull request #5844 from matrix-org/erikj/retry_well_known_lookup
2 parents c08ce16 + 09f6152 commit 480d7e2

File tree

5 files changed

+137
-27
lines changed

5 files changed

+137
-27
lines changed

changelog.d/5844.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Retry well-known lookup before the cache expires, giving a grace period where the remote well-known can be down but we still use the old result.

synapse/http/federation/well_known_resolver.py

Lines changed: 60 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@
4444
# lower bound for .well-known cache period
4545
WELL_KNOWN_MIN_CACHE_PERIOD = 5 * 60
4646

47+
# Attempt to refetch a cached well-known N% of the TTL before it expires.
48+
# e.g. if set to 0.2 and we have a cached entry with a TTL of 5mins, then
49+
# we'll start trying to refetch 1 minute before it expires.
50+
WELL_KNOWN_GRACE_PERIOD_FACTOR = 0.2
51+
52+
4753
logger = logging.getLogger(__name__)
4854

4955

@@ -80,15 +86,38 @@ def get_well_known(self, server_name):
8086
Deferred[WellKnownLookupResult]: The result of the lookup
8187
"""
8288
try:
83-
result = self._well_known_cache[server_name]
89+
prev_result, expiry, ttl = self._well_known_cache.get_with_expiry(
90+
server_name
91+
)
92+
93+
now = self._clock.time()
94+
if now < expiry - WELL_KNOWN_GRACE_PERIOD_FACTOR * ttl:
95+
return WellKnownLookupResult(delegated_server=prev_result)
8496
except KeyError:
85-
# TODO: should we linearise so that we don't end up doing two .well-known
86-
# requests for the same server in parallel?
97+
prev_result = None
98+
99+
# TODO: should we linearise so that we don't end up doing two .well-known
100+
# requests for the same server in parallel?
101+
try:
87102
with Measure(self._clock, "get_well_known"):
88103
result, cache_period = yield self._do_get_well_known(server_name)
89104

90-
if cache_period > 0:
91-
self._well_known_cache.set(server_name, result, cache_period)
105+
except _FetchWellKnownFailure as e:
106+
if prev_result and e.temporary:
107+
# This is a temporary failure and we have a still valid cached
108+
# result, so lets return that. Hopefully the next time we ask
109+
# the remote will be back up again.
110+
return WellKnownLookupResult(delegated_server=prev_result)
111+
112+
result = None
113+
114+
# add some randomness to the TTL to avoid a stampeding herd every hour
115+
# after startup
116+
cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
117+
cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
118+
119+
if cache_period > 0:
120+
self._well_known_cache.set(server_name, result, cache_period)
92121

93122
return WellKnownLookupResult(delegated_server=result)
94123

@@ -99,40 +128,42 @@ def _do_get_well_known(self, server_name):
99128
Args:
100129
server_name (bytes): name of the server, from the requested url
101130
131+
Raises:
132+
_FetchWellKnownFailure if we fail to lookup a result
133+
102134
Returns:
103-
Deferred[Tuple[bytes|None|object],int]:
104-
result, cache period, where result is one of:
105-
- the new server name from the .well-known (as a `bytes`)
106-
- None if there was no .well-known file.
107-
- INVALID_WELL_KNOWN if the .well-known was invalid
135+
Deferred[Tuple[bytes,int]]: The lookup result and cache period.
108136
"""
109137
uri = b"https://%s/.well-known/matrix/server" % (server_name,)
110138
uri_str = uri.decode("ascii")
111139
logger.info("Fetching %s", uri_str)
140+
141+
# We do this in two steps to differentiate between possibly transient
142+
# errors (e.g. can't connect to host, 503 response) and more permenant
143+
# errors (such as getting a 404 response).
112144
try:
113145
response = yield make_deferred_yieldable(
114146
self._well_known_agent.request(b"GET", uri)
115147
)
116148
body = yield make_deferred_yieldable(readBody(response))
149+
150+
if 500 <= response.code < 600:
151+
raise Exception("Non-200 response %s" % (response.code,))
152+
except Exception as e:
153+
logger.info("Error fetching %s: %s", uri_str, e)
154+
raise _FetchWellKnownFailure(temporary=True)
155+
156+
try:
117157
if response.code != 200:
118158
raise Exception("Non-200 response %s" % (response.code,))
119159

120160
parsed_body = json.loads(body.decode("utf-8"))
121161
logger.info("Response from .well-known: %s", parsed_body)
122-
if not isinstance(parsed_body, dict):
123-
raise Exception("not a dict")
124-
if "m.server" not in parsed_body:
125-
raise Exception("Missing key 'm.server'")
162+
163+
result = parsed_body["m.server"].encode("ascii")
126164
except Exception as e:
127165
logger.info("Error fetching %s: %s", uri_str, e)
128-
129-
# add some randomness to the TTL to avoid a stampeding herd every hour
130-
# after startup
131-
cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
132-
cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
133-
return (None, cache_period)
134-
135-
result = parsed_body["m.server"].encode("ascii")
166+
raise _FetchWellKnownFailure(temporary=False)
136167

137168
cache_period = _cache_period_from_headers(
138169
response.headers, time_now=self._reactor.seconds
@@ -185,3 +216,10 @@ def _parse_cache_control(headers):
185216
v = splits[1] if len(splits) > 1 else None
186217
cache_controls[k] = v
187218
return cache_controls
219+
220+
221+
@attr.s()
222+
class _FetchWellKnownFailure(Exception):
223+
# True if we didn't get a non-5xx HTTP response, i.e. this may or may not be
224+
# a temporary failure.
225+
temporary = attr.ib()

synapse/util/caches/ttlcache.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def set(self, key, value, ttl):
5555
if e != SENTINEL:
5656
self._expiry_list.remove(e)
5757

58-
entry = _CacheEntry(expiry_time=expiry, key=key, value=value)
58+
entry = _CacheEntry(expiry_time=expiry, ttl=ttl, key=key, value=value)
5959
self._data[key] = entry
6060
self._expiry_list.add(entry)
6161

@@ -87,7 +87,8 @@ def get_with_expiry(self, key):
8787
key: key to look up
8888
8989
Returns:
90-
Tuple[Any, float]: the value from the cache, and the expiry time
90+
Tuple[Any, float, float]: the value from the cache, the expiry time
91+
and the TTL
9192
9293
Raises:
9394
KeyError if the entry is not found
@@ -99,7 +100,7 @@ def get_with_expiry(self, key):
99100
self._metrics.inc_misses()
100101
raise
101102
self._metrics.inc_hits()
102-
return e.value, e.expiry_time
103+
return e.value, e.expiry_time, e.ttl
103104

104105
def pop(self, key, default=SENTINEL):
105106
"""Remove a value from the cache
@@ -158,5 +159,6 @@ class _CacheEntry(object):
158159

159160
# expiry_time is the first attribute, so that entries are sorted by expiry.
160161
expiry_time = attr.ib()
162+
ttl = attr.ib()
161163
key = attr.ib()
162164
value = attr.ib()

tests/http/federation/test_matrix_federation_agent.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,75 @@ def test_well_known_cache(self):
992992
r = self.successResultOf(fetch_d)
993993
self.assertEqual(r.delegated_server, b"other-server")
994994

995+
def test_well_known_cache_with_temp_failure(self):
996+
"""Test that we refetch well-known before the cache expires, and that
997+
it ignores transient errors.
998+
"""
999+
1000+
well_known_resolver = WellKnownResolver(
1001+
self.reactor,
1002+
Agent(self.reactor, contextFactory=self.tls_factory),
1003+
well_known_cache=self.well_known_cache,
1004+
)
1005+
1006+
self.reactor.lookups["testserv"] = "1.2.3.4"
1007+
1008+
fetch_d = well_known_resolver.get_well_known(b"testserv")
1009+
1010+
# there should be an attempt to connect on port 443 for the .well-known
1011+
clients = self.reactor.tcpClients
1012+
self.assertEqual(len(clients), 1)
1013+
(host, port, client_factory, _timeout, _bindAddress) = clients.pop(0)
1014+
self.assertEqual(host, "1.2.3.4")
1015+
self.assertEqual(port, 443)
1016+
1017+
well_known_server = self._handle_well_known_connection(
1018+
client_factory,
1019+
expected_sni=b"testserv",
1020+
response_headers={b"Cache-Control": b"max-age=1000"},
1021+
content=b'{ "m.server": "target-server" }',
1022+
)
1023+
1024+
r = self.successResultOf(fetch_d)
1025+
self.assertEqual(r.delegated_server, b"target-server")
1026+
1027+
# close the tcp connection
1028+
well_known_server.loseConnection()
1029+
1030+
# Get close to the cache expiry, this will cause the resolver to do
1031+
# another lookup.
1032+
self.reactor.pump((900.0,))
1033+
1034+
fetch_d = well_known_resolver.get_well_known(b"testserv")
1035+
clients = self.reactor.tcpClients
1036+
(host, port, client_factory, _timeout, _bindAddress) = clients.pop(0)
1037+
1038+
# fonx the connection attempt, this will be treated as a temporary
1039+
# failure.
1040+
client_factory.clientConnectionFailed(None, Exception("nope"))
1041+
1042+
# attemptdelay on the hostnameendpoint is 0.3, so takes that long before the
1043+
# .well-known request fails.
1044+
self.reactor.pump((0.4,))
1045+
1046+
# Resolver should return cached value, despite the lookup failing.
1047+
r = self.successResultOf(fetch_d)
1048+
self.assertEqual(r.delegated_server, b"target-server")
1049+
1050+
# Expire the cache and repeat the request
1051+
self.reactor.pump((100.0,))
1052+
1053+
# Repated the request, this time it should fail if the lookup fails.
1054+
fetch_d = well_known_resolver.get_well_known(b"testserv")
1055+
1056+
clients = self.reactor.tcpClients
1057+
(host, port, client_factory, _timeout, _bindAddress) = clients.pop(0)
1058+
client_factory.clientConnectionFailed(None, Exception("nope"))
1059+
self.reactor.pump((0.4,))
1060+
1061+
r = self.successResultOf(fetch_d)
1062+
self.assertEqual(r.delegated_server, None)
1063+
9951064

9961065
class TestCachePeriodFromHeaders(TestCase):
9971066
def test_cache_control(self):

tests/util/caches/test_ttlcache.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def test_get(self):
3636
self.assertTrue("one" in self.cache)
3737
self.assertEqual(self.cache.get("one"), "1")
3838
self.assertEqual(self.cache["one"], "1")
39-
self.assertEqual(self.cache.get_with_expiry("one"), ("1", 110))
39+
self.assertEqual(self.cache.get_with_expiry("one"), ("1", 110, 10))
4040
self.assertEqual(self.cache._metrics.hits, 3)
4141
self.assertEqual(self.cache._metrics.misses, 0)
4242

@@ -77,7 +77,7 @@ def test_expiry(self):
7777
self.assertEqual(self.cache["two"], "2")
7878
self.assertEqual(self.cache["three"], "3")
7979

80-
self.assertEqual(self.cache.get_with_expiry("two"), ("2", 120))
80+
self.assertEqual(self.cache.get_with_expiry("two"), ("2", 120, 20))
8181

8282
self.assertEqual(self.cache._metrics.hits, 5)
8383
self.assertEqual(self.cache._metrics.misses, 0)

0 commit comments

Comments
 (0)