Skip to content

Commit ccfb372

Browse files
fix(post-process-group): retry retrieving events from eventstore up to 3 times (#50210)
Eventstore is occasionally returning None events. Not totally sure whether it's a timing issue or whether eventstore actually doesn't have the event. This PR retries retrieving the event with an exponential backoff delay and returns early in post_process_group if the event is None. It also retries on the `ServiceUnavailable` exception when Bigtable is temporarily unavailable. Resolves SENTRY-ZEB, SENTRY-11G0
1 parent 8c94358 commit ccfb372

File tree

1 file changed

+32
-3
lines changed

1 file changed

+32
-3
lines changed

src/sentry/tasks/post_process.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import sentry_sdk
88
from django.conf import settings
99
from django.utils import timezone
10+
from google.api_core.exceptions import ServiceUnavailable
1011

1112
from sentry import features
1213
from sentry.exceptions import PluginError
@@ -20,6 +21,7 @@
2021
from sentry.utils.event_frames import get_sdk_name
2122
from sentry.utils.locking import UnableToAcquireLock
2223
from sentry.utils.locking.manager import LockManager
24+
from sentry.utils.retries import ConditionalRetryPolicy, exponential_delay
2325
from sentry.utils.safe import safe_execute
2426
from sentry.utils.sdk import bind_organization_context, set_current_event_project
2527
from sentry.utils.services import build_instance_from_options
@@ -388,6 +390,20 @@ def fetch_buffered_group_stats(group):
388390
group.times_seen_pending = result["times_seen"]
389391

390392

393+
# Highest attempt number after which a failed fetch is still retried.
MAX_FETCH_ATTEMPTS = 3


def should_retry_fetch(attempt: int, e: Exception) -> bool:
    """Decide whether a failed event fetch should be attempted again.

    Args:
        attempt: Attempt counter supplied by the retry policy.
        e: The exception raised by the failed attempt.

    Returns:
        True when ``attempt`` does not exceed ``MAX_FETCH_ATTEMPTS`` and ``e``
        is a ``ServiceUnavailable`` or an ``EventLookupError``; False otherwise.
    """
    from sentry.issues.occurrence_consumer import EventLookupError

    if attempt > MAX_FETCH_ATTEMPTS:
        return False
    return isinstance(e, (ServiceUnavailable, EventLookupError))


# Retry policy used when fetching events: retries are gated by
# should_retry_fetch, and the wait between them comes from exponential_delay
# (presumably a 1-second base delay -- confirm against sentry.utils.retries).
fetch_retry_policy = ConditionalRetryPolicy(should_retry_fetch, exponential_delay(1.00))
405+
406+
391407
@instrumented_task(
392408
name="sentry.tasks.post_process.post_process_group",
393409
time_limit=120,
@@ -418,6 +434,7 @@ def post_process_group(
418434
from sentry.ingest.transaction_clusterer.datasource.redis import (
419435
record_transaction_name as record_transaction_name_for_clustering,
420436
)
437+
from sentry.issues.occurrence_consumer import EventLookupError
421438
from sentry.models import Organization, Project
422439
from sentry.reprocessing2 import is_reprocessed_event
423440

@@ -468,9 +485,21 @@ def post_process_group(
468485
return
469486
# Issue platform events don't use `event_processing_store`. Fetch from eventstore
470487
# instead.
471-
event = eventstore.get_event_by_id(
472-
project_id, occurrence.event_id, group_id=group_id, skip_transaction_groupevent=True
473-
)
488+
489+
def get_event_raise_exception() -> Event:
    """Fetch the occurrence's event from eventstore, raising if it is absent.

    Reads ``project_id``, ``occurrence`` and ``group_id`` from the enclosing
    scope.

    Returns:
        The event returned by ``eventstore.get_event_by_id``.

    Raises:
        EventLookupError: If eventstore returns ``None`` for the event.
    """
    fetched_event = eventstore.get_event_by_id(
        project_id, occurrence.event_id, group_id=group_id, skip_transaction_groupevent=True
    )
    if fetched_event is not None:
        return fetched_event

    # A missing event is surfaced as an exception rather than a None return
    # so that the caller's retry policy can react to it.
    raise EventLookupError(
        f"failed to retrieve event(project_id={project_id}, event_id={occurrence.event_id}, group_id={group_id}) from eventstore"
    )
501+
502+
event = fetch_retry_policy(get_event_raise_exception)
474503

475504
set_current_event_project(event.project_id)
476505

0 commit comments

Comments
 (0)