fix(profiling): Start profiler thread lazily (#1903)

Zylphrex · antonpirker · web-flow · commit 2d24560ba06d · 2023-02-22T10:57:12.000-05:00
When running with uWSGI, it preforks the process so the profiler thread is
started on the master process but doesn't run on the worker process. This means
that no samples are ever taken. This change delays the start of the profiler
thread to the first profile that is started.

Co-authored-by: Anton Pirker &lt;anton.pirker@sentry.io&gt;
diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py
@@ -112,6 +112,7 @@
 try:
     from gevent import get_hub as get_gevent_hub  # type: ignore
     from gevent.monkey import get_original, is_module_patched  # type: ignore
+    from gevent.threadpool import ThreadPool  # type: ignore
 
     thread_sleep = get_original("time", "sleep")
 except ImportError:
@@ -127,6 +128,8 @@ def is_module_patched(*args, **kwargs):
         # unable to import from gevent means no modules have been patched
         return False
 
+    ThreadPool = None
+
 
 def is_gevent():
     # type: () -> bool
@@ -177,10 +180,7 @@ def setup_profiler(options):
     ):
         _scheduler = ThreadScheduler(frequency=frequency)
     elif profiler_mode == GeventScheduler.mode:
-        try:
-            _scheduler = GeventScheduler(frequency=frequency)
-        except ImportError:
-            raise ValueError("Profiler mode: {} is not available".format(profiler_mode))
+        _scheduler = GeventScheduler(frequency=frequency)
     else:
         raise ValueError("Unknown profiler mode: {}".format(profiler_mode))
 
@@ -703,7 +703,8 @@ def __init__(self, frequency):
 
         self.sampler = self.make_sampler()
 
-        self.new_profiles = deque()  # type: Deque[Profile]
+        # cap the number of new profiles at any time so it does not grow infinitely
+        self.new_profiles = deque(maxlen=128)  # type: Deque[Profile]
         self.active_profiles = set()  # type: Set[Profile]
 
     def __enter__(self):
@@ -723,8 +724,13 @@ def teardown(self):
         # type: () -> None
         raise NotImplementedError
 
+    def ensure_running(self):
+        # type: () -> None
+        raise NotImplementedError
+
     def start_profiling(self, profile):
         # type: (Profile) -> None
+        self.ensure_running()
         self.new_profiles.append(profile)
 
     def stop_profiling(self, profile):
@@ -827,21 +833,44 @@ def __init__(self, frequency):
 
         # used to signal to the thread that it should stop
         self.running = False
-
-        # make sure the thread is a daemon here otherwise this
-        # can keep the application running after other threads
-        # have exited
-        self.thread = threading.Thread(name=self.name, target=self.run, daemon=True)
+        self.thread = None  # type: Optional[threading.Thread]
+        self.pid = None  # type: Optional[int]
+        self.lock = threading.Lock()
 
     def setup(self):
         # type: () -> None
-        self.running = True
-        self.thread.start()
+        pass
 
     def teardown(self):
         # type: () -> None
-        self.running = False
-        self.thread.join()
+        if self.running:
+            self.running = False
+            if self.thread is not None:
+                self.thread.join()
+
+    def ensure_running(self):
+        # type: () -> None
+        pid = os.getpid()
+
+        # is running on the right process
+        if self.running and self.pid == pid:
+            return
+
+        with self.lock:
+            # another thread may have tried to acquire the lock
+            # at the same time so it may start another thread
+            # make sure to check again before proceeding
+            if self.running and self.pid == pid:
+                return
+
+            self.pid = pid
+            self.running = True
+
+            # make sure the thread is a daemon here otherwise this
+            # can keep the application running after other threads
+            # have exited
+            self.thread = threading.Thread(name=self.name, target=self.run, daemon=True)
+            self.thread.start()
 
     def run(self):
         # type: () -> None
@@ -882,28 +911,52 @@ class GeventScheduler(Scheduler):
     def __init__(self, frequency):
         # type: (int) -> None
 
-        # This can throw an ImportError that must be caught if `gevent` is
-        # not installed.
-        from gevent.threadpool import ThreadPool  # type: ignore
+        if ThreadPool is None:
+            raise ValueError("Profiler mode: {} is not available".format(self.mode))
 
         super(GeventScheduler, self).__init__(frequency=frequency)
 
         # used to signal to the thread that it should stop
         self.running = False
+        self.thread = None  # type: Optional[ThreadPool]
+        self.pid = None  # type: Optional[int]
 
-        # Using gevent's ThreadPool allows us to bypass greenlets and spawn
-        # native threads.
-        self.pool = ThreadPool(1)
+        # This intentionally uses the gevent patched threading.Lock.
+        # The lock will be required when first trying to start profiles
+        # as we need to spawn the profiler thread from the greenlets.
+        self.lock = threading.Lock()
 
     def setup(self):
         # type: () -> None
-        self.running = True
-        self.pool.spawn(self.run)
+        pass
 
     def teardown(self):
         # type: () -> None
-        self.running = False
-        self.pool.join()
+        if self.running:
+            self.running = False
+            if self.thread is not None:
+                self.thread.join()
+
+    def ensure_running(self):
+        # type: () -> None
+        pid = os.getpid()
+
+        # is running on the right process
+        if self.running and self.pid == pid:
+            return
+
+        with self.lock:
+            # another thread may have tried to acquire the lock
+            # at the same time so it may start another thread
+            # make sure to check again before proceeding
+            if self.running and self.pid == pid:
+                return
+
+            self.pid = pid
+            self.running = True
+
+            self.thread = ThreadPool(1)
+            self.thread.spawn(self.run)
 
     def run(self):
         # type: () -> None
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import threading
+import time
 
 import pytest
 
@@ -82,6 +83,13 @@ def test_profiler_setup_twice(teardown_profiling):
     assert not setup_profiler({"_experiments": {}})
 
 
+@pytest.mark.parametrize(
+    "mode",
+    [
+        pytest.param("thread"),
+        pytest.param("gevent", marks=requires_gevent),
+    ],
+)
 @pytest.mark.parametrize(
     ("profiles_sample_rate", "profile_count"),
     [
@@ -99,10 +107,14 @@ def test_profiled_transaction(
     teardown_profiling,
     profiles_sample_rate,
     profile_count,
+    mode,
 ):
     sentry_init(
         traces_sample_rate=1.0,
-        _experiments={"profiles_sample_rate": profiles_sample_rate},
+        _experiments={
+            "profiles_sample_rate": profiles_sample_rate,
+            "profiler_mode": mode,
+        },
     )
 
     envelopes = capture_envelopes()
@@ -177,6 +189,30 @@ def test_minimum_unique_samples_required(
     assert len(items["profile"]) == 0
 
 
+def test_profile_captured(
+    sentry_init,
+    capture_envelopes,
+    teardown_profiling,
+):
+    sentry_init(
+        traces_sample_rate=1.0,
+        _experiments={"profiles_sample_rate": 1.0},
+    )
+
+    envelopes = capture_envelopes()
+
+    with start_transaction(name="profiling"):
+        time.sleep(0.05)
+
+    items = defaultdict(list)
+    for envelope in envelopes:
+        for item in envelope.items:
+            items[item.type].append(item)
+
+    assert len(items["transaction"]) == 1
+    assert len(items["profile"]) == 1
+
+
 def get_frame(depth=1):
     """
     This function is not exactly true to its name. Depending on
@@ -494,9 +530,19 @@ def test_thread_scheduler_single_background_thread(scheduler_class):
 
     scheduler.setup()
 
+    # setup but no profiles started so still no threads
+    assert len(get_scheduler_threads(scheduler)) == 0
+
+    scheduler.ensure_running()
+
     # the scheduler will start always 1 thread
     assert len(get_scheduler_threads(scheduler)) == 1
 
+    scheduler.ensure_running()
+
+    # the scheduler still only has 1 thread
+    assert len(get_scheduler_threads(scheduler)) == 1
+
     scheduler.teardown()
 
     # once finished, the thread should stop