KAFKA-4160: Ensure rebalance listener not called with coordinator lock (#1438)

dpkp · web-flow · commit 1c87c8213e4a · 2025-03-13T08:21:35.000-07:00
diff --git a/kafka/coordinator/base.py b/kafka/coordinator/base.py
@@ -338,7 +338,36 @@ def time_to_next_heartbeat(self):
                 return float('inf')
             return self.heartbeat.time_to_next_heartbeat()
 
+    def _reset_join_group_future(self):
+        with self._lock:
+            self.join_future = None
+
+    def _initiate_join_group(self):
+        with self._lock:
+            # we store the join future in case we are woken up by the user
+            # after beginning the rebalance in the call to poll below.
+            # This ensures that we do not mistakenly attempt to rejoin
+            # before the pending rebalance has completed.
+            if self.join_future is None:
+                self.state = MemberState.REBALANCING
+                self.join_future = self._send_join_group_request()
+
+                # handle join completion in the callback so that the
+                # callback will be invoked even if the consumer is woken up
+                # before finishing the rebalance
+                self.join_future.add_callback(self._handle_join_success)
+
+                # we handle failures below after the request finishes.
+                # If the join completes after having been woken up, the
+                # exception is ignored and we will rejoin
+                self.join_future.add_errback(self._handle_join_failure)
+
+        return self.join_future
+
     def _handle_join_success(self, member_assignment_bytes):
+        # handle join completion in the callback so that the callback
+        # will be invoked even if the consumer is woken up before
+        # finishing the rebalance
         with self._lock:
             log.info("Successfully joined group %s with generation %s",
                      self.group_id, self._generation.generation_id)
@@ -347,6 +376,9 @@ def _handle_join_success(self, member_assignment_bytes):
                 self._heartbeat_thread.enable()
 
     def _handle_join_failure(self, _):
+        # we handle failures below after the request finishes.
+        # if the join completes after having been woken up,
+        # the exception is ignored and we will rejoin
         with self._lock:
             self.state = MemberState.UNJOINED
 
@@ -360,92 +392,67 @@ def ensure_active_group(self, timeout_ms=None):
         Raises: KafkaTimeoutError if timeout_ms is not None
         """
         inner_timeout_ms = timeout_ms_fn(timeout_ms, 'Timeout attempting to join consumer group')
-        with self._client._lock, self._lock:
-            if self._heartbeat_thread is None:
-                self._start_heartbeat_thread()
-
-            while self.need_rejoin() or self._rejoin_incomplete():
-                self.ensure_coordinator_ready(timeout_ms=inner_timeout_ms())
-
-                # call on_join_prepare if needed. We set a flag
-                # to make sure that we do not call it a second
-                # time if the client is woken up before a pending
-                # rebalance completes. This must be called on each
-                # iteration of the loop because an event requiring
-                # a rebalance (such as a metadata refresh which
-                # changes the matched subscription set) can occur
-                # while another rebalance is still in progress.
-                if not self.rejoining:
-                    self._on_join_prepare(self._generation.generation_id,
-                                          self._generation.member_id)
-                    self.rejoining = True
-
-                # ensure that there are no pending requests to the coordinator.
-                # This is important in particular to avoid resending a pending
-                # JoinGroup request.
-                while not self.coordinator_unknown():
-                    if not self._client.in_flight_request_count(self.coordinator_id):
-                        break
-                    self._client.poll(timeout_ms=inner_timeout_ms(200))
-                else:
-                    continue
-
-                # we store the join future in case we are woken up by the user
-                # after beginning the rebalance in the call to poll below.
-                # This ensures that we do not mistakenly attempt to rejoin
-                # before the pending rebalance has completed.
-                if self.join_future is None:
-                    # Fence off the heartbeat thread explicitly so that it cannot
-                    # interfere with the join group. Note that this must come after
-                    # the call to _on_join_prepare since we must be able to continue
-                    # sending heartbeats if that callback takes some time.
-                    self._heartbeat_thread.disable()
-
-                    self.state = MemberState.REBALANCING
-                    future = self._send_join_group_request()
-
-                    self.join_future = future  # this should happen before adding callbacks
-
-                    # handle join completion in the callback so that the
-                    # callback will be invoked even if the consumer is woken up
-                    # before finishing the rebalance
-                    future.add_callback(self._handle_join_success)
-
-                    # we handle failures below after the request finishes.
-                    # If the join completes after having been woken up, the
-                    # exception is ignored and we will rejoin
-                    future.add_errback(self._handle_join_failure)
-
-                else:
-                    future = self.join_future
-
-                self._client.poll(future=future, timeout_ms=inner_timeout_ms())
-
-                if not future.is_done:
-                    raise Errors.KafkaTimeoutError()
+        self.ensure_coordinator_ready(timeout_ms=inner_timeout_ms())
+        self._start_heartbeat_thread()
+        self.join_group(timeout_ms=inner_timeout_ms())
 
-                if future.succeeded():
-                    self._on_join_complete(self._generation.generation_id,
-                                           self._generation.member_id,
-                                           self._generation.protocol,
-                                           future.value)
-                    self.join_future = None
-                    self.rejoining = False
-                    self.rejoin_needed = False
+    def join_group(self, timeout_ms=None):
+        inner_timeout_ms = timeout_ms_fn(timeout_ms, 'Timeout attempting to join consumer group')
+        while self.need_rejoin():
+            self.ensure_coordinator_ready(timeout_ms=inner_timeout_ms())
+
+            # call on_join_prepare if needed. We set a flag
+            # to make sure that we do not call it a second
+            # time if the client is woken up before a pending
+            # rebalance completes. This must be called on each
+            # iteration of the loop because an event requiring
+            # a rebalance (such as a metadata refresh which
+            # changes the matched subscription set) can occur
+            # while another rebalance is still in progress.
+            if not self.rejoining:
+                self._on_join_prepare(self._generation.generation_id,
+                                      self._generation.member_id)
+                self.rejoining = True
+
+            # fence off the heartbeat thread explicitly so that it cannot
+            # interfere with the join group.  # Note that this must come after
+            # the call to onJoinPrepare since we must be able to continue
+            # sending heartbeats if that callback takes some time.
+            self._disable_heartbeat_thread()
+
+            # ensure that there are no pending requests to the coordinator.
+            # This is important in particular to avoid resending a pending
+            # JoinGroup request.
+            while not self.coordinator_unknown():
+                if not self._client.in_flight_request_count(self.coordinator_id):
+                    break
+                self._client.poll(timeout_ms=inner_timeout_ms(200))
+            else:
+                continue
 
-                else:
-                    self.join_future = None
-                    exception = future.exception
-                    if isinstance(exception, (Errors.UnknownMemberIdError,
-                                              Errors.RebalanceInProgressError,
-                                              Errors.IllegalGenerationError)):
-                        continue
-                    elif not future.retriable():
-                        raise exception  # pylint: disable-msg=raising-bad-type
-                    time.sleep(inner_timeout_ms(self.config['retry_backoff_ms']) / 1000)
-
-    def _rejoin_incomplete(self):
-        return self.join_future is not None
+            future = self._initiate_join_group()
+            self._client.poll(future=future, timeout_ms=inner_timeout_ms())
+            if future.is_done:
+                self._reset_join_group_future()
+            else:
+                raise Errors.KafkaTimeoutError()
+
+            if future.succeeded():
+                self.rejoining = False
+                self.rejoin_needed = False
+                self._on_join_complete(self._generation.generation_id,
+                                       self._generation.member_id,
+                                       self._generation.protocol,
+                                       future.value)
+            else:
+                exception = future.exception
+                if isinstance(exception, (Errors.UnknownMemberIdError,
+                                          Errors.RebalanceInProgressError,
+                                          Errors.IllegalGenerationError)):
+                    continue
+                elif not future.retriable():
+                    raise exception  # pylint: disable-msg=raising-bad-type
+                time.sleep(inner_timeout_ms(self.config['retry_backoff_ms']) / 1000)
 
     def _send_join_group_request(self):
         """Join the group and return the assignment for the next generation.
@@ -751,23 +758,31 @@ def request_rejoin(self):
         self.rejoin_needed = True
 
     def _start_heartbeat_thread(self):
-        if self._heartbeat_thread is None:
-            log.info('Starting new heartbeat thread')
-            self._heartbeat_thread = HeartbeatThread(weakref.proxy(self))
-            self._heartbeat_thread.daemon = True
-            self._heartbeat_thread.start()
+        with self._lock:
+            if self._heartbeat_thread is None:
+                log.info('Starting new heartbeat thread')
+                self._heartbeat_thread = HeartbeatThread(weakref.proxy(self))
+                self._heartbeat_thread.daemon = True
+                self._heartbeat_thread.start()
+
+    def _disable_heartbeat_thread(self):
+        with self._lock:
+            if self._heartbeat_thread is not None:
+                self._heartbeat_thread.disable()
 
     def _close_heartbeat_thread(self):
-        if hasattr(self, '_heartbeat_thread') and self._heartbeat_thread is not None:
-            log.info('Stopping heartbeat thread')
-            try:
-                self._heartbeat_thread.close()
-            except ReferenceError:
-                pass
-            self._heartbeat_thread = None
+        with self._lock:
+            if self._heartbeat_thread is not None:
+                log.info('Stopping heartbeat thread')
+                try:
+                    self._heartbeat_thread.close()
+                except ReferenceError:
+                    pass
+                self._heartbeat_thread = None
 
     def __del__(self):
-        self._close_heartbeat_thread()
+        if hasattr(self, '_heartbeat_thread'):
+            self._close_heartbeat_thread()
 
     def close(self):
         """Close the coordinator, leave the current group,
@@ -926,12 +941,15 @@ def __init__(self, coordinator):
 
     def enable(self):
         with self.coordinator._lock:
+            log.debug('Enabling heartbeat thread')
             self.enabled = True
             self.coordinator.heartbeat.reset_timeouts()
             self.coordinator._lock.notify()
 
     def disable(self):
-        self.enabled = False
+        with self.coordinator._lock:
+            log.debug('Disabling heartbeat thread')
+            self.enabled = False
 
     def close(self):
         if self.closed: