Skip to content

Commit 65bbce3

Browse files
committed
MB-19982: Fix potential deadlock between DcpConsumer::bufMutex & connsLock
As identified by ThreadSanitizer (see below). The issue is that the DcpConsumer::bufMutex and connsLock mutexes are acquired in different orders: A) As part of processing incoming DCP messages (PassiveStream::processBufferedMessages) we acquire (1) bufMutex, then (2) acquire connsLock as part of DcpConnMap::vbucketStateChanged. B) When disconnecting a connection from the DcpConnMap, we acquire (1) connsLock to locate the connection to be closed, and then (2) acquire bufMutex as part of PassiveStream::setDead. Address this by changing (B) - update the data structures maintaining the map of cookie -> connection (`all` and `map_`), *release connsLock* and then call PassiveStream::setDead. Finally we re-acquire connsLock to add the (now closed) stream to the deadConnections list. WARNING: ThreadSanitizer: lock-order-inversion (potential deadlock) (pid=393) Cycle in lock order graph: M22701 (0x7d5000018140) => M969 (0x7d840001cc50) => M22701 Mutex M969 acquired here while holding mutex M22701 in thread T10: #0 pthread_mutex_lock <null> (engine_testapp+0x00000047e980) #1 cb_mutex_enter <null> (libplatform.so.0.1.0+0x000000003960) #2 Mutex::acquire() ep-engine/src/mutex.cc:31 (ep.so+0x0000001e60ce) #3 LockHolder::lock() ep-engine/src/locks.h:71 (ep.so+0x000000080b33) #4 LockHolder::LockHolder(Mutex&, bool) ep-engine/src/locks.h:48 (ep.so+0x0000000807a2) #5 DcpConnMap::vbucketStateChanged(unsigned short, vbucket_state_t) ep-engine/src/connmap.cc:1044 (ep.so+0x00000023b3ba) #6 EventuallyPersistentStore::setVBucketState(unsigned short, vbucket_state_t, bool, bool) ep-engine/src/ep.cc:1057 (ep.so+0x0000000dc240) #7 PassiveStream::processSetVBucketState(SetVBucketState*) ep-engine/src/dcp-stream.cc:1483 (ep.so+0x0000002a3125) #8 PassiveStream::processBufferedMessages(unsigned int&) ep-engine/src/dcp-stream.cc:1313 (ep.so+0x0000002a0b58) #9 DcpConsumer::processBufferedItems() ep-engine/src/dcp-consumer.cc:608 (ep.so+0x0000002650c4) #10 Processer::run() ep-engine/src/dcp-consumer.cc:48 (ep.so+0x000000264cbf) #11 ExecutorThread::run() ep-engine/src/executorthread.cc:109 (ep.so+0x0000001e75a1) #12 launch_executor_thread(void*) ep-engine/src/executorthread.cc:34 (ep.so+0x0000001e6bca) #13 platform_thread_wrap platform/src/cb_pthreads.c (libplatform.so.0.1.0+0x00000000371c) Mutex M22701 previously acquired by the same thread here: #0 pthread_mutex_lock <null> (engine_testapp+0x00000047e980) #1 cb_mutex_enter <null> (libplatform.so.0.1.0+0x000000003960) #2 Mutex::acquire() ep-engine/src/mutex.cc:31 (ep.so+0x0000001e60ce) #3 LockHolder::lock() ep-engine/src/locks.h:71 (ep.so+0x000000080b33) #4 LockHolder::LockHolder(Mutex&, bool) ep-engine/src/locks.h:48 (ep.so+0x0000000807a2) #5 PassiveStream::processBufferedMessages(unsigned int&) ep-engine/src/dcp-stream.cc:1286 (ep.so+0x0000002a085d) #6 DcpConsumer::processBufferedItems() ep-engine/src/dcp-consumer.cc:608 (ep.so+0x0000002650c4) #7 Processer::run() ep-engine/src/dcp-consumer.cc:48 (ep.so+0x000000264cbf) #8 ExecutorThread::run() ep-engine/src/executorthread.cc:109 (ep.so+0x0000001e75a1) #9 launch_executor_thread(void*) ep-engine/src/executorthread.cc:34 (ep.so+0x0000001e6bca) #10 platform_thread_wrap platform/src/cb_pthreads.c (libplatform.so.0.1.0+0x00000000371c) Mutex M22701 acquired here while holding mutex M969 in main thread: #0 pthread_mutex_lock <null> (engine_testapp+0x00000047e980) #1 cb_mutex_enter <null> (libplatform.so.0.1.0+0x000000003960) #2 Mutex::acquire() ep-engine/src/mutex.cc:31 (ep.so+0x0000001e60ce) #3 LockHolder::lock() ep-engine/src/locks.h:71 (ep.so+0x000000080b33) #4 LockHolder::LockHolder(Mutex&, bool) ep-engine/src/locks.h:48 (ep.so+0x0000000807a2) #5 PassiveStream::clearBuffer() ep-engine/src/dcp-stream.cc:1573 (ep.so+0x00000029e6a1) #6 PassiveStream::setDead(end_stream_status_t) ep-engine/src/dcp-stream.cc:1171 (ep.so+0x00000029dffc) #7 DcpConsumer::closeAllStreams() ep-engine/src/dcp-consumer.cc:722 (ep.so+0x00000026fb66) #8 DcpConnMap::disconnect_UNLOCKED(void const*) ep-engine/src/connmap.cc:1096 (ep.so+0x00000023bbe3) #9 DcpConnMap::disconnect(void const*) ep-engine/src/connmap.cc:1069 (ep.so+0x00000023b664) #10 EventuallyPersistentEngine::handleDisconnect(void const*) ep-engine/src/ep_engine.cc:5711 (ep.so+0x0000001617ab) #11 EvpHandleDisconnect(void const*, ENGINE_EVENT_TYPE, void const*, void const*) ep-engine/src/ep_engine.cc:1702 (ep.so+0x00000013d415) #12 mock_perform_callbacks memcached/programs/engine_testapp/mock_server.c (engine_testapp+0x0000004d04db) #13 disconnect_mock_connection <null> (engine_testapp+0x0000004d10c6) #14 destroy_mock_cookie <null> (engine_testapp+0x0000004d0fa7) #15 test_mb19982(engine_interface*, engine_interface_v1*) ep-engine/tests/ep_testsuite.cc:12020 (ep_testsuite.so+0x0000000b1d7d) #16 execute_test memcached/programs/engine_testapp/engine_testapp.c (engine_testapp+0x0000004c50bf) #17 main crtstuff.c (engine_testapp+0x0000004c2ea8) Mutex M969 previously acquired by the same thread here: #0 pthread_mutex_lock <null> (engine_testapp+0x00000047e980) #1 cb_mutex_enter <null> (libplatform.so.0.1.0+0x000000003960) #2 Mutex::acquire() ep-engine/src/mutex.cc:31 (ep.so+0x0000001e60ce) #3 LockHolder::lock() ep-engine/src/locks.h:71 (ep.so+0x000000080b33) #4 LockHolder::LockHolder(Mutex&, bool) ep-engine/src/locks.h:48 (ep.so+0x0000000807a2) #5 DcpConnMap::disconnect(void const*) ep-engine/src/connmap.cc:1068 (ep.so+0x00000023b64e) #6 EventuallyPersistentEngine::handleDisconnect(void const*) ep-engine/src/ep_engine.cc:5711 (ep.so+0x0000001617ab) #7 EvpHandleDisconnect(void const*, ENGINE_EVENT_TYPE, void const*, void const*) ep-engine/src/ep_engine.cc:1702 (ep.so+0x00000013d415) #8 mock_perform_callbacks memcached/programs/engine_testapp/mock_server.c (engine_testapp+0x0000004d04db) #9 disconnect_mock_connection <null> (engine_testapp+0x0000004d10c6) #10 destroy_mock_cookie <null> (engine_testapp+0x0000004d0fa7) #11 test_mb19982(engine_interface*, engine_interface_v1*) ep-engine/tests/ep_testsuite.cc:12020 (ep_testsuite.so+0x0000000b1d7d) #12 execute_test memcached/programs/engine_testapp/engine_testapp.c (engine_testapp+0x0000004c50bf) #13 main crtstuff.c (engine_testapp+0x0000004c2ea8) SUMMARY: ThreadSanitizer: lock-order-inversion (potential deadlock) ??:0 __interceptor_pthread_mutex_lock Change-Id: Ia441d5f5898516e3526a610426fa81f5df0e35e6 Reviewed-on: http://review.couchbase.org/65297 Well-Formed: buildbot <[email protected]> Tested-by: buildbot <[email protected]> Reviewed-by: Jim Walker <[email protected]>
1 parent 09ad001 commit 65bbce3

File tree

2 files changed

+33
-22
lines changed

2 files changed

+33
-22
lines changed

src/connmap.cc

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,37 +1065,49 @@ void DcpConnMap::closeAllStreams_UNLOCKED() {
10651065
}
10661066

10671067
void DcpConnMap::disconnect(const void *cookie) {
1068-
LockHolder lh(connsLock);
1069-
disconnect_UNLOCKED(cookie);
1070-
}
1071-
1072-
void DcpConnMap::disconnect_UNLOCKED(const void *cookie) {
1073-
std::list<connection_t>::iterator iter;
1074-
for (iter = all.begin(); iter != all.end(); ++iter) {
1075-
if ((*iter)->getCookie() == cookie) {
1076-
(*iter)->setDisconnect(true);
1077-
all.erase(iter);
1078-
break;
1068+
// Move the connection matching this cookie from the `all` and map_
1069+
// data structures (under connsLock).
1070+
connection_t conn;
1071+
{
1072+
LockHolder lh(connsLock);
1073+
std::list<connection_t>::iterator iter;
1074+
for (iter = all.begin(); iter != all.end(); ++iter) {
1075+
if ((*iter)->getCookie() == cookie) {
1076+
(*iter)->setDisconnect(true);
1077+
all.erase(iter);
1078+
break;
1079+
}
10791080
}
1080-
}
1081-
1082-
std::map<const void*, connection_t>::iterator itr(map_.find(cookie));
1083-
if (itr != map_.end()) {
1084-
connection_t conn = itr->second;
1085-
if (conn.get()) {
1086-
LOG(EXTENSION_LOG_INFO, "%s Removing connection",
1087-
conn->logHeader());
1088-
map_.erase(itr);
1081+
std::map<const void*, connection_t>::iterator itr(map_.find(cookie));
1082+
if (itr != map_.end()) {
1083+
conn = itr->second;
1084+
if (conn.get()) {
1085+
LOG(EXTENSION_LOG_INFO, "%s Removing connection",
1086+
conn->logHeader());
1087+
map_.erase(itr);
1088+
}
10891089
}
1090+
}
10901091

1092+
// Note we shutdown the stream *not* under the connsLock; this is
1093+
// because as part of closing a DcpConsumer stream we need to
1094+
// acquire PassiveStream::buffer.bufMutex; and that could deadlock
1095+
// in EventuallyPersistentStore::setVBucketState, via
1096+
// PassiveStream::processBufferedMessages.
1097+
if (conn) {
10911098
DcpProducer* producer = dynamic_cast<DcpProducer*> (conn.get());
10921099
if (producer) {
10931100
producer->closeAllStreams();
10941101
producer->cancelCheckpointProcessorTask();
10951102
} else {
10961103
static_cast<DcpConsumer*>(conn.get())->closeAllStreams();
10971104
}
1105+
}
10981106

1107+
// Finished disconnecting the stream; add it to the
1108+
// deadConnections list.
1109+
if (conn) {
1110+
LockHolder lh(connsLock);
10991111
deadConnections.push_back(conn);
11001112
}
11011113
}

src/connmap.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -473,10 +473,9 @@ class DcpConnMap : public ConnMap {
473473

474474
bool isPassiveStreamConnected_UNLOCKED(uint16_t vbucket);
475475

476-
void disconnect_UNLOCKED(const void *cookie);
477-
478476
void closeAllStreams_UNLOCKED();
479477

478+
// Guarded by the parent's classes `connsLock`
480479
std::list<connection_t> deadConnections;
481480
};
482481

0 commit comments

Comments
 (0)