Details
-
Bug
-
Status: Closed
-
Critical
-
Resolution: Fixed
-
Cheshire-Cat
-
Enterprise Edition 7.0.0 build 4554
-
Untriaged
-
Centos 64-bit
-
1
-
Yes
-
KV-Engine 2021-March
Description
Build: 7.0.0 4554
Test case:
guides/gradlew --refresh-dependencies testrunner -P jython=/opt/jython/bin/jython -P 'args=-i /tmp/testexec.97440.ini bucket_type=ephemeral,num_items=50000,durability=MAJORITY,sdk_timeout=60,num_items=500000,GROUP=P0,get-cbcollect-info=True,upgrade_version=7.0.0-4554 -t bucket_param_tests.bucket_param_update.BucketParamTest.test_replica_update,bucket_type=ephemeral,upgrade_version=7.0.0-4554,GROUP=P0,doc_ops=update;create;delete,sdk_timeout=60,get-cbcollect-info=False,replicas=0,durability=MAJORITY,log_level=error,nodes_init=5,num_items=500000,infra_log_level=critical'
|
Scenario: (all doc_ops with durability=MAJORITY)
- 5 node cluster, 1 ephemeral bucket with replica=0
- Load an initial set of 500K items into the bucket
- Do incremental replica_update from 0 -> 1 -> 2 -> 3 with doc_ops in parallel
Observation:
Rebalance failed when updating replica from 2 -> 3 with following logs.
Rebalance exited with reason
|
{mover_crashed,
|
{unexpected_exit, {'EXIT',<0.18221.4>,
|
{{bulk_set_vbucket_state_failed,
|
[{'ns_1@172.23.100.21', {'EXIT', {{{{badmatch,{error,timeout}},
|
[{mc_client_binary,cmd_vocal_recv,5, [{file,"src/mc_client_binary.erl"}, {line,158}]},
|
{mc_client_binary,select_bucket,2, [{file,"src/mc_client_binary.erl"}, {line,350}]},
|
{ns_memcached_sockets_pool, take_socket,2, [{file, "src/ns_memcached_sockets_pool.erl"}, {line,46}]},
|
{ns_memcached_sockets_pool, '-executing_on_socket/3-fun-0-',3, [{file, "src/ns_memcached_sockets_pool.erl"}, {line,88}]},
|
{async,'-async_init/4-fun-1-',3, [{file,"src/async.erl"}, {line,197}]}]},
|
{gen_server,call,
|
['ns_memcached-default',
|
{get_dcp_docs_estimate,540, "replication:ns_1@172.23.100.21->ns_1@172.23.100.10:default"}, 180000]}},
|
{gen_server,call,
|
[{'janitor_agent-default', 'ns_1@172.23.100.21'},
|
{if_rebalance,<0.32015.3>,
|
{update_vbucket_state,974,replica, undefined,'ns_1@172.23.100.22'}}, infinity]}}}}]},
|
[{janitor_agent,bulk_set_vbucket_state,4, [{file,"src/janitor_agent.erl"}, {line,403}]},
|
{ns_single_vbucket_mover, update_replication_post_move,5, [{file,"src/ns_single_vbucket_mover.erl"}, {line,530}]},
|
{ns_single_vbucket_mover,on_move_done_body, 6,
|
[{file,"src/ns_single_vbucket_mover.erl"}, {line,556}]},
|
{proc_lib,init_p,3, [{file,"proc_lib.erl"},{line,234}]}]}}}}.
|
|
Rebalance Operation Id = 7c081452ddf5be8bff6e738fb44fe809
|
|
Worker <0.17405.4> (for action {move,{974,
|
['ns_1@172.23.100.21', 'ns_1@172.23.100.20', 'ns_1@172.23.100.22'], ['ns_1@172.23.100.22', 'ns_1@172.23.100.20', 'ns_1@172.23.100.21', 'ns_1@172.23.100.10'],
|
[]}}) exited with reason {unexpected_exit,
|
{'EXIT', <0.18221.4>,
|
{{bulk_set_vbucket_state_failed,
|
[{'ns_1@172.23.100.21', {'EXIT', {{{{badmatch, {error, timeout}},
|
[{mc_client_binary, cmd_vocal_recv, 5, [{file, "src/mc_client_binary.erl"}, {line, 158}]},
|
{mc_client_binary, select_bucket, 2, [{file, "src/mc_client_binary.erl"}, {line, 350}]},
|
{ns_memcached_sockets_pool, take_socket, 2, [{file, "src/ns_memcached_sockets_pool.erl"}, {line, 46}]},
|
{ns_memcached_sockets_pool, '-executing_on_socket/3-fun-0-', 3, [{file, "src/ns_memcached_sockets_pool.erl"}, {line, 88}]},
|
{async, '-async_init/4-fun-1-', 3, [{file, "src/async.erl"}, {line, 197}]}]},
|
{gen_server, call,
|
['ns_memcached-default',
|
{get_dcp_docs_estimate, 540, "replication:ns_1@172.23.100.21->ns_1@172.23.100.10:default"}, 180000]}},
|
{gen_server, call,
|
[{'janitor_agent-default', 'ns_1@172.23.100.21'},
|
{if_rebalance, <0.32015.3>,
|
{update_vbucket_state, 974, replica, undefined, 'ns_1@172.23.100.22'}}, infinity]}}}}]},
|
[{janitor_agent,
|
bulk_set_vbucket_state, 4, [{file, "src/janitor_agent.erl"}, {line, 403}]},
|
{ns_single_vbucket_mover, update_replication_post_move, 5, [{file, "src/ns_single_vbucket_mover.erl"}, {line, 530}]},
|
{ns_single_vbucket_mover, on_move_done_body, 6, [{file, "src/ns_single_vbucket_mover.erl"}, {line, 556}]},
|
{proc_lib, init_p,3, [{file, "proc_lib.erl"}, {line, 234}]}]}}}
|
Jenkins log: http://qa.sc.couchbase.com/job/test_suite_executor-TAF/96702/console
Looking at the backtrace of thread 28 we can see that the exception that was thrown is: Monotonic<l> (ActiveDM(vb:758)::State::highCompletedSeqno) invariant failed: new value (1831) breaks invariant on current value (5887)\nActiveDurabilityMonitor[0x7fa0006eb260].
Looking up the stack we see that the SyncWrite is ToCommit
(gdb) f 17
#17 0x00007fa0846e7f43 in ActiveDurabilityMonitor::processCompletedSyncWriteQueue (this=0x7fa0006eb260) at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/durability/active_durability_monitor.cc:465
465 /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/durability/active_durability_monitor.cc: No such file or directory.
(gdb) info locals
sw = {
storage_ = {
{
emptyState = 0 '\000',
value = {
<DurabilityMonitor::SyncWrite> = {
item = {
value = 0x7fa015a2a300
},
startTime = {
__d = {
__r = 6536535862151361
}
},
status = ToCommit
},
And it has a seqno of 1831, which correlates with our exception.
(gdb) f
#16 0x00007fa0846e7a6f in ActiveDurabilityMonitor::commit (this=this@entry=0x7fa0006eb260, sw=...) at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/durability/active_durability_monitor.cc:882
882 in /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/durability/active_durability_monitor.cc
(gdb) p (*(Item*) 0x7fa015a2a300).bySeqno
$5 = {
<std::__atomic_base<long>> = {
static _S_alignment = 8,
_M_i = 1831
},
members of std::atomic<long>:
static is_always_lock_free = true
}
Going down a frame we gain more information about the ADM's seqno states, with the lastCommittedSeqno being set at frame 16 before updateHighCompletedSeqno() was called.
(gdb) f
#15 0x00007fa0846e768d in ActiveDurabilityMonitor::State::updateHighCompletedSeqno (this=0x7fa06c3d6c80) at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/durability/active_durability_monitor.cc:1562
1562 in /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/durability/active_durability_monitor.cc
(gdb) p lastTrackedSeqno.val
$15 = 5887
(gdb) p lastCommittedSeqno.val
$16 = 1831
(gdb) p lastAbortedSeqno.val
$17 = 0
(gdb) p highPreparedSeqno.val
$18 = 5888
(gdb) p highCompletedSeqno.val
$19 = 5887
(gdb) p totalAccepted
$20 = 0
(gdb) p totalCommitted
$21 = 0
(gdb) p totalAborted
$22 = 0