-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Time how long it takes us to do backfill processing #13535
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Add metrics to time how long it takes us to do backfill processing (`synapse_federation_backfill_processing_before_time_seconds`, `synapse_federation_backfill_processing_after_time_seconds`). |
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -32,6 +32,7 @@ | |||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
import attr | ||||||||||||||||||||
from prometheus_client import Histogram | ||||||||||||||||||||
from signedjson.key import decode_verify_key_bytes | ||||||||||||||||||||
from signedjson.sign import verify_signed_json | ||||||||||||||||||||
from unpaddedbase64 import decode_base64 | ||||||||||||||||||||
|
@@ -79,6 +80,24 @@ | |||||||||||||||||||
|
||||||||||||||||||||
logger = logging.getLogger(__name__) | ||||||||||||||||||||
|
||||||||||||||||||||
# Added to debug performance and track progress on optimizations | ||||||||||||||||||||
backfill_processing_before_timer = Histogram( | ||||||||||||||||||||
"synapse_federation_backfill_processing_before_time_seconds", | ||||||||||||||||||||
"sec", | ||||||||||||||||||||
[], | ||||||||||||||||||||
buckets=( | ||||||||||||||||||||
1.0, | ||||||||||||||||||||
5.0, | ||||||||||||||||||||
10.0, | ||||||||||||||||||||
20.0, | ||||||||||||||||||||
30.0, | ||||||||||||||||||||
40.0, | ||||||||||||||||||||
60.0, | ||||||||||||||||||||
80.0, | ||||||||||||||||||||
"+Inf", | ||||||||||||||||||||
), | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We will probably want to adjust the fidelity in the buckets once we have some real data. I've seen the linearizer lock take 45s and calculating the |
||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
def get_domains_from_state(state: StateMap[EventBase]) -> List[Tuple[str, int]]: | ||||||||||||||||||||
"""Get joined domains from state | ||||||||||||||||||||
|
@@ -138,6 +157,7 @@ class FederationHandler: | |||||||||||||||||||
def __init__(self, hs: "HomeServer"): | ||||||||||||||||||||
self.hs = hs | ||||||||||||||||||||
|
||||||||||||||||||||
self.clock = hs.get_clock() | ||||||||||||||||||||
self.store = hs.get_datastores().main | ||||||||||||||||||||
self._storage_controllers = hs.get_storage_controllers() | ||||||||||||||||||||
self._state_storage_controller = self._storage_controllers.state | ||||||||||||||||||||
|
@@ -197,12 +217,39 @@ async def maybe_backfill( | |||||||||||||||||||
return. This is used as part of the heuristic to decide if we | ||||||||||||||||||||
should back paginate. | ||||||||||||||||||||
""" | ||||||||||||||||||||
# Starting the processing time here so we can include the room backfill | ||||||||||||||||||||
# linearizer lock queue in the timing | ||||||||||||||||||||
processing_start_time = self.clock.time_msec() | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Starting the processing time here so we can include the room backfill linearizer lock in the timing.
|
||||||||||||||||||||
|
||||||||||||||||||||
async with self._room_backfill.queue(room_id): | ||||||||||||||||||||
return await self._maybe_backfill_inner(room_id, current_depth, limit) | ||||||||||||||||||||
return await self._maybe_backfill_inner( | ||||||||||||||||||||
room_id, | ||||||||||||||||||||
current_depth, | ||||||||||||||||||||
limit, | ||||||||||||||||||||
processing_start_time=processing_start_time, | ||||||||||||||||||||
) | ||||||||||||||||||||
|
||||||||||||||||||||
async def _maybe_backfill_inner( | ||||||||||||||||||||
self, room_id: str, current_depth: int, limit: int | ||||||||||||||||||||
self, | ||||||||||||||||||||
room_id: str, | ||||||||||||||||||||
current_depth: int, | ||||||||||||||||||||
limit: int, | ||||||||||||||||||||
*, | ||||||||||||||||||||
processing_start_time: int, | ||||||||||||||||||||
) -> bool: | ||||||||||||||||||||
""" | ||||||||||||||||||||
Checks whether the `current_depth` is at or approaching any backfill | ||||||||||||||||||||
points in the room and if so, will backfill. We only care about | ||||||||||||||||||||
checking backfill points that happened before the `current_depth` | ||||||||||||||||||||
(meaning less than or equal to the `current_depth`). | ||||||||||||||||||||
|
||||||||||||||||||||
Args: | ||||||||||||||||||||
room_id: The room to backfill in. | ||||||||||||||||||||
current_depth: The depth to check at for any upcoming backfill points. | ||||||||||||||||||||
limit: The max number of events to request from the remote federated server. | ||||||||||||||||||||
processing_start_time: The time when `maybe_backfill` started | ||||||||||||||||||||
processing. Only used for timing. | ||||||||||||||||||||
""" | ||||||||||||||||||||
backwards_extremities = [ | ||||||||||||||||||||
_BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY) | ||||||||||||||||||||
for event_id, depth in await self.store.get_oldest_event_ids_with_depth_in_room( | ||||||||||||||||||||
|
@@ -425,6 +472,11 @@ async def try_backfill(domains: List[str]) -> bool: | |||||||||||||||||||
|
||||||||||||||||||||
return False | ||||||||||||||||||||
|
||||||||||||||||||||
processing_end_time = self.clock.time_msec() | ||||||||||||||||||||
backfill_processing_before_timer.observe( | ||||||||||||||||||||
(processing_start_time - processing_end_time) / 1000 | ||||||||||||||||||||
) | ||||||||||||||||||||
Comment on lines
+475
to
+478
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same problem as #13533 (comment) I don't think this is working how we expect. All of the buckets just have the same values, which means it's saying every record falls below the smallest bucket boundary (1.0s) — consistent with the observed values being negative rather than real durations.
So the graph of the percentiles is just average of all of the buckets below it and we get straight lines, https://grafana.matrix.org/d/dYoRgTgVz/messages-timing?orgId=1&from=1660812148100&to=1660855348100&viewPanel=212 The after timing one looks normal: https://grafana.matrix.org/d/dYoRgTgVz/messages-timing?orgId=1&from=1660812186785&to=1660855386785&viewPanel=213 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think so. It gives values like And we do the same thing for existing metrics which look fine, ex. synapse/synapse/federation/sender/__init__.py Line 487 in 2c42673
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The values in Prometheus are negative: https://prometheus.matrix.org/graph?g0.expr=synapse_federation_backfill_processing_before_time_seconds_sum&g0.tab=1&g0.stacked=0&g0.show_exemplars=0&g0.range_input=1h&g0.end_input=2022-08-22%2017%3A57%3A40&g0.moment_input=2022-08-22%2017%3A57%3A40 Spotted 🕵️♀️ I accidentally did `processing_start_time - processing_end_time` (start minus end), which always yields a negative duration. Corrected:
Suggested change:
-            (processing_start_time - processing_end_time) / 1000
+            (processing_end_time - processing_start_time) / 1000
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in #13584 |
||||||||||||||||||||
|
||||||||||||||||||||
success = await try_backfill(likely_domains) | ||||||||||||||||||||
if success: | ||||||||||||||||||||
return True | ||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,7 +29,7 @@ | |
Tuple, | ||
) | ||
|
||
from prometheus_client import Counter | ||
from prometheus_client import Counter, Histogram | ||
|
||
from synapse import event_auth | ||
from synapse.api.constants import ( | ||
|
@@ -92,6 +92,26 @@ | |
"Events received over federation that we marked as soft_failed", | ||
) | ||
|
||
# Added to debug performance and track progress on optimizations | ||
backfill_processing_after_timer = Histogram( | ||
"synapse_federation_backfill_processing_after_time_seconds", | ||
"sec", | ||
[], | ||
buckets=( | ||
1.0, | ||
5.0, | ||
10.0, | ||
20.0, | ||
30.0, | ||
40.0, | ||
60.0, | ||
80.0, | ||
120.0, | ||
180.0, | ||
"+Inf", | ||
), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The after processing can take a while depending on how slow ex. |
||
) | ||
|
||
|
||
class FederationEventHandler: | ||
"""Handles events that originated from federation. | ||
|
@@ -597,20 +617,21 @@ async def backfill( | |
if not events: | ||
return | ||
|
||
# if there are any events in the wrong room, the remote server is buggy and | ||
# should not be trusted. | ||
for ev in events: | ||
if ev.room_id != room_id: | ||
raise InvalidResponseError( | ||
f"Remote server {dest} returned event {ev.event_id} which is in " | ||
f"room {ev.room_id}, when we were backfilling in {room_id}" | ||
) | ||
with backfill_processing_after_timer.time(): | ||
# if there are any events in the wrong room, the remote server is buggy and | ||
# should not be trusted. | ||
for ev in events: | ||
if ev.room_id != room_id: | ||
raise InvalidResponseError( | ||
f"Remote server {dest} returned event {ev.event_id} which is in " | ||
f"room {ev.room_id}, when we were backfilling in {room_id}" | ||
) | ||
|
||
await self._process_pulled_events( | ||
dest, | ||
events, | ||
backfilled=True, | ||
) | ||
await self._process_pulled_events( | ||
dest, | ||
events, | ||
backfilled=True, | ||
) | ||
|
||
@trace | ||
async def _get_missing_events_for_pdu( | ||
|
Uh oh!
There was an error while loading. Please reload this page.