Details
-
Bug
-
Resolution: Fixed
-
Critical
-
7.6.0, 7.2.0, 7.2.1, 7.2.2, 7.2.3
-
7.6.0-1387
-
Untriaged
-
Linux x86_64
-
0
-
No
-
KV 2023-4
Description
Cause
As part of the fix for MB-56067, we converted the blocking bucket deletion code into a state machine.
The state machine should only make progress if the bucket has no clients (connections) associated with it.
However, there appears to be a bug in the state machine code, where we incorrectly move the state machine to complete bucket deletion, before all clients have disconnected.
This will happen 2 minutes after the DELETE_BUCKET started executing, due to a missing check for the current number of clients in that code path.
As a consequence of this, we crash in Connection::propagateDisconnect().
Consequences
As a consequence, we only end up waiting for up to 2 minutes for a command to finish execution. If a command takes longer than 2 minutes to complete, we crash. Most operations complete in milliseconds, however, in this case we see a long running COMPACT_DB (explicit compaction) for a Magma bucket, which has taken an abnormally long amout of time, longer than 2 minutes.
Original description follows
Steps To Recreate:
- Create a 6 node cluster(2 KV , 3 eventing and 1 index:n1ql, )
- Create four magma buckets (default, src_bucket, dst_bucket and metadata ) (each with ram_quota = 256 MiB and replicas=1)
- Create and deploy eventing function (Eventing nodes ip:172.23.109.157 ip:172.23.121.117 172.23.121.251]
saving function b'{\n "code": 0,\n "info": "Function: src_bucket/_default/Function_209033809_test_kv_rebalance_in_when_existing_eventing_node_is_processing_mutation stored in temp store"\n}'
|
[2023-08-21 23:38:54,002] - [eventing_base:435] INFO - Deploying the following handler code : Function_209033809_test_kv_rebalance_in_when_existing_eventing_node_is_processing_mutation with
|
bindings: {'buckets': [{'alias': 'dst_bucket', 'bucket_name': 'dst_bucket', 'access': 'rw'}], 'metadata_bucket': 'metadata', 'source_bucket': 'src_bucket', 'curl': []} and
|
settings: {'checkpoint_interval': 20000, 'cleanup_timers': False, 'dcp_stream_boundary': 'everything', 'deployment_status': False, 'description': 'Sample Description', 'log_level': 'INFO', 'skip_timer_threshold': 86400, 'sock_batch_size': 1, 'tick_duration': 5000, 'timer_processing_tick_interval': 500, 'timer_worker_pool_size': 3, 'worker_count': 3, 'processing_status': False, 'cpp_worker_thread_count': 1, 'execution_timeout': 20, 'data_chan_size': 10000, 'worker_queue_cap': 100000, 'use_memory_manager': True, 'deadline_timeout': 21, 'timer_storage_chan_size': 10000, 'dcp_gen_chan_size': 10000}
|
[2023-08-21 23:38:54,002] - [eventing_base:436] INFO -
|
function OnUpdate(doc, meta) {
|
var expiry = new Date();
|
expiry.setSeconds(expiry.getSeconds() + 3000);
|
var context = {docID : meta.id, random_text : "e6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh07Aumoe6cZZGHuh07Aumoe6cZZGHuh07Aumoe6"};
|
createTimer(timerCallback, expiry, meta.id, context);
|
log("create timer for:",meta.id)
|
}
|
function OnDelete(meta) {
|
var context = {docID : meta.id};
|
var cancelled=cancelTimer(timerCallback,meta.id);
|
log("is timer cancelled:",cancelled," for:",meta.id);
|
dst_bucket[meta.id]=meta.id;
|
}
|
|
function NDtimerCallback(context) {
|
log("firing delete timer:",context.docID);
|
var docID = context.docID;
|
while (true) {
|
try {
|
delete dst_bucket[context.docID];
|
break;
|
} catch (e) {
|
log(e);
|
}
|
}
|
}
|
|
function timerCallback(context) {
|
log("firing update timer:",context.docID);
|
var docID = context.docID;
|
while (true) {
|
try {
|
dst_bucket[context.docID]=context.random_text;
|
break;
|
} catch (e) {
|
log(e);
|
}
|
}
|
}
|
- Pause above deployed eventingfunction
- Rebalance in a KV node( 172.23.105.187)
- Resume the function
- Wait for eventing to catch up with all the update mutations and verify results after rebalance
- Observed Memcached crashed in Connection::propagateDisconnect (this=0x7f63275be180)
at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/daemon/connection.cc:1356
Below Core Dump is found on node 172.23.121.249(bfac3f8d-e367-4734-fc77eba9-2e673dda.dmp)
BackTrace:
(gdb) bt full
|
#0 Connection::propagateDisconnect (this=0x7f63275be180)
|
at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/daemon/connection.cc:1356
|
cookie = <optimized out>
|
__for_range = <error reading variable>
|
__for_begin = {_M_cur = <optimized out>, _M_first = <optimized out>, _M_last = <optimized out>, _M_node = <optimized out>}
|
__for_end = {_M_cur = <optimized out>, _M_first = <optimized out>, _M_last = <optimized out>, _M_node = <optimized out>}
|
bucket = <optimized out>
|
engine = <error reading variable>
|
#1 0x0000000000648c06 in Connection::close (this=this@entry=0x7f63275be180)
|
at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/daemon/connection.cc:1334
|
ewb = false
|
rc = 1
|
#2 0x0000000000656217 in Connection::executeCommandsCallback (this=this@entry=0x7f63275be180)
|
at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/daemon/connection.cc:1004
|
start = {__d = {__r = 309478290969202}}
|
stop = {__d = {__r = <optimized out>}}
|
ns = {__r = 3398}
|
#3 0x0000000000663176 in LibeventConnection::read_callback (this=0x7f63275be180)
|
at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/daemon/connection_libevent.cc:121
|
phosphor_internal_category_enabled_116 = {_M_b = {_M_p = 0x0}, static is_always_lock_free = <optimized out>}
|
phosphor_internal_category_enabled_temp_116 = <optimized out>
|
phosphor_internal_tpi_wait_116 = {category = 0xe4bd9a "mutex", name = 0xdd68d8 "LibeventConnection::read_callback::threadLock.wait",
|
type = phosphor::TraceEventType::Complete, argument_names = {_M_elems = {0xe43a81 "this", 0xe4c7bd ""}}, argument_types = {_M_elems = {
|
phosphor::TraceArgumentType::is_pointer, phosphor::TraceArgumentType::is_none}}}
|
phosphor_internal_tpi_held_116 = {category = 0xe4bd9a "mutex", name = 0xdd68a0 "LibeventConnection::read_callback::threadLock.held",
|
type = phosphor::TraceEventType::Complete, argument_names = {_M_elems = {0xe4c7bd "", 0xe4c7bd ""}}, argument_types = {_M_elems = {
|
phosphor::TraceArgumentType::is_pointer, phosphor::TraceArgumentType::is_none}}}
|
phosphor_internal_guard_116 = {tpiWait = 0xdd6b80 <LibeventConnection::read_callback()::phosphor_internal_tpi_wait_116>,
|
tpiHeld = 0xdd6b40 <LibeventConnection::read_callback()::phosphor_internal_tpi_held_116>, enabled = true, mutex = @0x7f63abd5c7c0,
|
threshold = {__r = 10000000}, start = {__d = {__r = 309478290967450}}, lockedAt = {__d = {__r = 309478290968358}}, releasedAt = {__d = {
|
__r = 0}}}
|
#4 0x00007f63ae28284e in bufferevent_run_deferred_callbacks_unlocked (cb=<optimized out>, arg=0x7f63275be400)
|
at /home/couchbase/jenkins/cbdeps-ws/deps/packages/build/libevent/libevent-prefix/src/libevent/bufferevent.c:208
|
readcb = 0x663290 <LibeventConnection::read_callback(bufferevent*, void*)>
|
cbarg = 0x7f63275be180
|
bufev_private = 0x7f63275be400
|
bufev = 0x7f63275be400
|
#5 0x00007f63ae28bb99 in event_process_active_single_queue (base=base@entry=0x7f63abc67500, activeq=0x7f63abd0e440,
|
max_to_process=max_to_process@entry=2147483647, endtime=endtime@entry=0x0)
|
--Type <RET> for more, q to quit, c to continue without paging--
|
at /home/couchbase/jenkins/cbdeps-ws/deps/packages/build/libevent/libevent-prefix/src/libevent/event.c:1726
|
evcb_cbfinalize = <optimized out>
|
ev = <optimized out>
|
evcb = 0x7f63275be5a0
|
count = 2
|
#6 0x00007f63ae28c287 in event_process_active (base=0x7f63abc67500)
|
at /home/couchbase/jenkins/cbdeps-ws/deps/packages/build/libevent/libevent-prefix/src/libevent/event.c:1789
|
activeq = <optimized out>
|
i = 0
|
c = 0
|
tv = {tv_sec = 1, tv_usec = 140066083157983}
|
maxcb = 2147483647
|
endtime = 0x0
|
limit_after_prio = 2147483647
|
activeq = <optimized out>
|
i = <optimized out>
|
c = <optimized out>
|
endtime = <optimized out>
|
tv = {tv_sec = <optimized out>, tv_usec = <optimized out>}
|
maxcb = <optimized out>
|
limit_after_prio = <optimized out>
|
done = <optimized out>
|
#7 event_base_loop (base=0x7f63abc67500, flags=flags@entry=1)
|
at /home/couchbase/jenkins/cbdeps-ws/deps/packages/build/libevent/libevent-prefix/src/libevent/event.c:2012
|
n = <optimized out>
|
evsel = 0x7f63ae49fc20 <epollops>
|
tv = {tv_sec = 309477515880998, tv_usec = 53}
|
tv_p = <optimized out>
|
res = <optimized out>
|
done = 0
|
retval = 0
|
__func__ = "event_base_loop"
|
#8 0x0000000000d67496 in (anonymous namespace)::EventBaseBackend::eb_event_base_loop (flags=1, this=<optimized out>)
|
at /home/couchbase/jenkins/cbdeps-ws/deps/packages/build/folly/folly-prefix/src/folly/folly/io/async/EventBase.cpp:74
|
No locals.
|
#9 folly::EventBase::loopBody (this=0x7f63abd5c560, flags=<optimized out>, ignoreKeepAlive=<optimized out>)
|
at /home/couchbase/jenkins/cbdeps-ws/deps/packages/build/folly/folly-prefix/src/folly/folly/io/async/EventBase.cpp:381
|
callbacks = {<boost::intrusive::list_impl<boost::intrusive::bhtraits<folly::EventBase::LoopCallback, boost::intrusive::list_node_traits<void*>, (boost::intrusive::link_mode_type)2, boost::intrusive::dft_tag, 1>, unsigned long, false, void>> = {static constant_time_size = false,
|
--Type <RET> for more, q to quit, c to continue without paging--
|
static stateful_value_traits = <optimized out>, static has_container_from_iterator = <optimized out>,
|
static safemode_or_autounlink = true,
|
data_ = {<boost::intrusive::bhtraits<folly::EventBase::LoopCallback, boost::intrusive::list_node_traits<void*>, (boost::intrusive::link_mode_type)2, boost::intrusive::dft_tag, 1>> = {<boost::intrusive::bhtraits_base<folly::EventBase::LoopCallback, boost::intrusive::list_node<void*>*, boost::intrusive::dft_tag, 1>> = {<No data fields>}, static link_mode = boost::intrusive::auto_unlink},
|
root_plus_size_ = {<boost::intrusive::detail::size_holder<false, unsigned long, void>> = {static constant_time_size = <optimized out>},
|
m_header = {<boost::intrusive::list_node<void*>> = {next_ = 0x7f63a63ea180,
|
prev_ = 0x7f63a63ea180}, <No data fields>}}}}, <No data fields>}
|
message = 0xe6c268 "Your code just tried to loop over an event base from inside another event base loop. Since libevent is not reentrant, this leads to undefined behavior in opt builds. Please fix immediately. For the co"...
|
SCOPE_EXIT_STATE8 = {<folly::detail::ScopeGuardImplBase> = {dismissed_ = false}, function_ = {__this = 0x7f63abd5c560}}
|
res = 0
|
ranLoopCallbacks = <optimized out>
|
blocking = <optimized out>
|
once = <optimized out>
|
prev = {__d = {__r = 306353699030282}}
|
idleStart = {__d = {__r = 309477536999699}}
|
busy = {__r = <optimized out>}
|
idle = {__r = <optimized out>}
|
prevLoopThread = {_M_thread = 0}
|
#10 0x0000000000d6799e in folly::EventBase::loop (this=0x7f63abd5c560)
|
at /home/couchbase/jenkins/cbdeps-ws/deps/packages/build/folly/folly-prefix/src/folly/folly/io/async/EventBase.cpp:305
|
guard = {list_ = {prev = 0x13d7d20 <folly::emptyList>, curr = {forbid = true, allowTerminationOnBlocking = false, ex = 0x7f63abd5c570, tag = {
|
static npos = <optimized out>, b_ = 0x0, e_ = 0x0}}}}
|
#11 0x0000000000d69e68 in folly::EventBase::loopForever (this=this@entry=0x7f63abd5c560)
|
at /home/couchbase/jenkins/cbdeps-ws/deps/packages/build/folly/folly-prefix/src/folly/folly/io/async/EventBase.cpp:524
|
SCOPE_EXIT_STATE9 = {<folly::detail::ScopeGuardImplBase> = {dismissed_ = <optimized out>}, function_ = {__this = <optimized out>}}
|
SCOPE_EXIT_STATE10 = {<folly::detail::ScopeGuardImplBase> = {dismissed_ = <optimized out>}, function_ = {__this = <optimized out>}}
|
ret = <optimized out>
|
#12 0x0000000000602d19 in worker_libevent (arg=0x7f63abd5c550)
|
at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/daemon/thread.cc:272
|
me = <error reading variable>
|
#13 0x0000000000cdc17c in std::function<void ()>::operator()() const (this=0x7f63abcf8eb8)
|
at /opt/gcc-10.2.0/include/c++/10.2.0/bits/std_function.h:622
|
No locals.
|
#14 operator() (__closure=0x7f63abcf8e98) at /home/couchbase/jenkins/workspace/couchbase-server-unix/platform/src/thread.cc:117
|
main = <error reading variable main (Cannot access memory at address 0x7f63abcf8eb8)>
|
n = <error reading variable n (Cannot access memory at address 0x7f63abcf8e98)>
|
main = {<std::_Maybe_unary_or_binary_function<void>> = {<No data fields>}, <std::_Function_base> = {static _M_max_size = 16,
|
--Type <RET> for more, q to quit, c to continue without paging--
|
static _M_max_align = 8, _M_functor = {_M_unused = {_M_object = <optimized out>, _M_const_object = <optimized out>,
|
_M_function_pointer = <optimized out>, _M_member_pointer = <optimized out>}, _M_pod_data = {<optimized out> <repeats 16 times>}},
|
_M_manager = <optimized out>}, _M_invoker = <optimized out>}
|
n = {static npos = 18446744073709551615,
|
_M_dataplus = {<std::allocator<char>> = {<__gnu_cxx::new_allocator<char>> = {<No data fields>}, <No data fields>}, _M_p = <optimized out>},
|
_M_string_length = <optimized out>, {_M_local_buf = {<optimized out> <repeats 16 times>}, _M_allocated_capacity = <optimized out>}}
|
#15 std::__invoke_impl<void, create_thread(std::function<void()>, std::string)::<lambda()> > (__f=...)
|
at /opt/gcc-10.2.0/include/c++/10.2.0/bits/invoke.h:60
|
No locals.
|
#16 std::__invoke<create_thread(std::function<void()>, std::string)::<lambda()> > (__fn=...) at /opt/gcc-10.2.0/include/c++/10.2.0/bits/invoke.h:95
|
No locals.
|
#17 std::thread::_Invoker<std::tuple<create_thread(std::function<void()>, std::string)::<lambda()> > >::_M_invoke<0> (this=0x7f63abcf8e98)
|
at /opt/gcc-10.2.0/include/c++/10.2.0/thread:264
|
No locals.
|
#18 std::thread::_Invoker<std::tuple<create_thread(std::function<void()>, std::string)::<lambda()> > >::operator() (this=0x7f63abcf8e98)
|
at /opt/gcc-10.2.0/include/c++/10.2.0/thread:271
|
No locals.
|
#19 std::thread::_State_impl<std::thread::_Invoker<std::tuple<create_thread(std::function<void()>, std::string)::<lambda()> > > >::_M_run(void) (
|
this=0x7f63abcf8e90) at /opt/gcc-10.2.0/include/c++/10.2.0/thread:215
|
No locals.
|
#20 0x00007f63ad41ed40 in std::execute_native_thread_routine (__p=0x7f63abcf8e90)
|
at /tmp/deploy/objdir/../gcc-10.2.0/libstdc++-v3/src/c++11/thread.cc:80
|
__t = {_M_t = {<std::__uniq_ptr_impl<std::thread::_State, std::default_delete<std::thread::_State> >> = {
|
_M_t = {<std::_Tuple_impl<0, std::thread::_State*, std::default_delete<std::thread::_State> >> = {<std::_Tuple_impl<1, std::default_delete<std::thread::_State> >> = {<std::_Head_base<1, std::default_delete<std::thread::_State>, true>> = {<std::default_delete<std::thread::_State>> = {<No data fields>}, <No data fields>}, <No data fields>}, <std::_Head_base<0, std::thread::_State*, false>> = {
|
_M_head_impl = <optimized out>}, <No data fields>}, <No data fields>}}, <No data fields>}}
|
#21 0x00007f63adc22ea7 in start_thread (arg=<optimized out>) at pthread_create.c:477
|
ret = <optimized out>
|
pd = <optimized out>
|
unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140065967654656, -7377849436284105510, 140730698566718, 140730698566719, 140065967612800,
|
8396800, 7447776737992658138, 7447764673310641370}, mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x0, 0x0}, data = {prev = 0x0,
|
cleanup = 0x0, canceltype = 0}}}
|
not_first_call = 0
|
#22 0x00007f63acf1aa2f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
|
No locals.
|
QE-TEST:
./testrunner -i /data/workspace/debian-p0-eventing-vset00-00-rebalance_cancel_timer/testexec.30111.ini -p get-cbcollect-info=True,use_https=True,enforce_tls=True,GROUP=cancel_timer,skip_log_scan=False,sirius_url=http://172.23.120.103:4000 -t eventing.eventing_rebalance.EventingRebalance.test_kv_rebalance_in_when_existing_eventing_node_is_processing_mutations,nodes_init=6,services_init=kv-kv-eventing-eventing-eventing-index:n1ql,dataset=default,groups=simple,reset_services=True,doc-per-day=20,cancel_timer=True,handler_code=cancel_timer,GROUP=cancel_timer,pause_resume=True
|
Job : http://qa.sc.couchbase.com/job/test_suite_executor/607740/consoleFull
This is one of the first functional test runs we had on trinity. Marking this as "No" for regression.
Attachments
Issue Links
- is caused by
-
MB-56067 Delete bucket task can block NonIO tasks and deadlock if any are needed for bucket shutdown
- Closed