# Lower bound (in seconds) for how long a fetched .well-known result may be
# cached, regardless of what the remote's cache headers say.
WELL_KNOWN_MIN_CACHE_PERIOD = 5 * 60

# Attempt to refetch a cached well-known N% of the TTL before it expires.
# e.g. if set to 0.2 and we have a cached entry with a TTL of 5mins, then
# we'll start trying to refetch 1 minute before it expires.
WELL_KNOWN_GRACE_PERIOD_FACTOR = 0.2


logger = logging.getLogger(__name__)
@@ -80,15 +86,38 @@ def get_well_known(self, server_name):
80
86
Deferred[WellKnownLookupResult]: The result of the lookup
81
87
"""
82
88
try :
83
- result = self ._well_known_cache [server_name ]
89
+ prev_result , expiry , ttl = self ._well_known_cache .get_with_expiry (
90
+ server_name
91
+ )
92
+
93
+ now = self ._clock .time ()
94
+ if now < expiry - WELL_KNOWN_GRACE_PERIOD_FACTOR * ttl :
95
+ return WellKnownLookupResult (delegated_server = prev_result )
84
96
except KeyError :
85
- # TODO: should we linearise so that we don't end up doing two .well-known
86
- # requests for the same server in parallel?
97
+ prev_result = None
98
+
99
+ # TODO: should we linearise so that we don't end up doing two .well-known
100
+ # requests for the same server in parallel?
101
+ try :
87
102
with Measure (self ._clock , "get_well_known" ):
88
103
result , cache_period = yield self ._do_get_well_known (server_name )
89
104
90
- if cache_period > 0 :
91
- self ._well_known_cache .set (server_name , result , cache_period )
105
+ except _FetchWellKnownFailure as e :
106
+ if prev_result and e .temporary :
107
+ # This is a temporary failure and we have a still valid cached
108
+ # result, so lets return that. Hopefully the next time we ask
109
+ # the remote will be back up again.
110
+ return WellKnownLookupResult (delegated_server = prev_result )
111
+
112
+ result = None
113
+
114
+ # add some randomness to the TTL to avoid a stampeding herd every hour
115
+ # after startup
116
+ cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
117
+ cache_period += random .uniform (0 , WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER )
118
+
119
+ if cache_period > 0 :
120
+ self ._well_known_cache .set (server_name , result , cache_period )
92
121
93
122
return WellKnownLookupResult (delegated_server = result )
94
123
@@ -99,40 +128,42 @@ def _do_get_well_known(self, server_name):
99
128
Args:
100
129
server_name (bytes): name of the server, from the requested url
101
130
131
+ Raises:
132
+ _FetchWellKnownFailure if we fail to lookup a result
133
+
102
134
Returns:
103
- Deferred[Tuple[bytes|None|object],int]:
104
- result, cache period, where result is one of:
105
- - the new server name from the .well-known (as a `bytes`)
106
- - None if there was no .well-known file.
107
- - INVALID_WELL_KNOWN if the .well-known was invalid
135
+ Deferred[Tuple[bytes,int]]: The lookup result and cache period.
108
136
"""
109
137
uri = b"https://%s/.well-known/matrix/server" % (server_name ,)
110
138
uri_str = uri .decode ("ascii" )
111
139
logger .info ("Fetching %s" , uri_str )
140
+
141
+ # We do this in two steps to differentiate between possibly transient
142
+ # errors (e.g. can't connect to host, 503 response) and more permenant
143
+ # errors (such as getting a 404 response).
112
144
try :
113
145
response = yield make_deferred_yieldable (
114
146
self ._well_known_agent .request (b"GET" , uri )
115
147
)
116
148
body = yield make_deferred_yieldable (readBody (response ))
149
+
150
+ if 500 <= response .code < 600 :
151
+ raise Exception ("Non-200 response %s" % (response .code ,))
152
+ except Exception as e :
153
+ logger .info ("Error fetching %s: %s" , uri_str , e )
154
+ raise _FetchWellKnownFailure (temporary = True )
155
+
156
+ try :
117
157
if response .code != 200 :
118
158
raise Exception ("Non-200 response %s" % (response .code ,))
119
159
120
160
parsed_body = json .loads (body .decode ("utf-8" ))
121
161
logger .info ("Response from .well-known: %s" , parsed_body )
122
- if not isinstance (parsed_body , dict ):
123
- raise Exception ("not a dict" )
124
- if "m.server" not in parsed_body :
125
- raise Exception ("Missing key 'm.server'" )
162
+
163
+ result = parsed_body ["m.server" ].encode ("ascii" )
126
164
except Exception as e :
127
165
logger .info ("Error fetching %s: %s" , uri_str , e )
128
-
129
- # add some randomness to the TTL to avoid a stampeding herd every hour
130
- # after startup
131
- cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
132
- cache_period += random .uniform (0 , WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER )
133
- return (None , cache_period )
134
-
135
- result = parsed_body ["m.server" ].encode ("ascii" )
166
+ raise _FetchWellKnownFailure (temporary = False )
136
167
137
168
cache_period = _cache_period_from_headers (
138
169
response .headers , time_now = self ._reactor .seconds
@@ -185,3 +216,10 @@ def _parse_cache_control(headers):
185
216
v = splits [1 ] if len (splits ) > 1 else None
186
217
cache_controls [k ] = v
187
218
return cache_controls
219
+
220
+
221
@attr.s()
class _FetchWellKnownFailure(Exception):
    """Raised by the .well-known fetch when the lookup fails.

    Carries a flag distinguishing possibly-transient failures (connection
    errors, 5xx responses) from more permanent ones (e.g. a 404), so callers
    can decide whether falling back to a previously cached result is sensible.
    """

    # True if we didn't get a non-5xx HTTP response, i.e. this may or may not be
    # a temporary failure.
    temporary = attr.ib()
0 commit comments