Uploaded image for project: 'Couchbase Server'
  1. Couchbase Server
  2. MB-50492

[NexusKVStore] Memcached aborted in NexusKVStore::scan(BySeqnoScanContext&) const ()

    XMLWordPrintable

Details

    • Bug
    • Resolution: Cannot Reproduce
    • Critical
    • None
    • 7.1.0
    • couchbase-bucket
    • 7.1.0-2111

    Description

      Steps to repro:

      1. Create a 2-node cluster (172.23.122.245, 172.23.122.246) with bucket_ram_quota = 2056 MB/node and replicas=0
      2. Create 10 scopes
      3. Create 10 collections in each of the scopes
      4. Start loading 300k docs into each of the non-default collections
      5. Change replicas to 1 and rebalance in node 172.23.122.247
      6. Trigger full compaction
      7. Rebalance was successful
      8. Change replicas to 2, trigger rebalance
      9. Drop a few scopes, then recreate a few scopes
      10. Observe that memcached aborted in NexusKVStore::scan(BySeqnoScanContext&) const ()

      The core below was found on node 172.23.122.247 (e04e26ac-c7e2-4a09-d925aeac-f79adb11.dmp)
      BackTrace:

      (gdb) bt full
      #0  0x00007f3c41cf9337 in raise () from /lib64/libc.so.6
      No symbol table info available.
      #1  0x00007f3c41cfaa28 in abort () from /lib64/libc.so.6
      No symbol table info available.
      #2  0x00000000006d4312 in cb::handleError(spdlog::logger&, spdlog::level::level_enum, std::basic_string_view<char, std::char_traits<char> >, cb::ErrorHandlingMethod) () at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/error_handler.cc:43
      No locals.
      #3  0x00000000008efac0 in handleError (msg=..., this=0x7f3c3c158860)
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/kvstore/nexus-kvstore/nexus-kvstore-config.h:39
      No locals.
      #4  NexusKVStore::scan(BySeqnoScanContext&) const ()
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/kvstore/nexus-kvstore/nexus-kvstore.cc:2381
              msg = {static npos = 18446744073709551615,
                _M_dataplus = {<std::allocator<char>> = {<__gnu_cxx::new_allocator<char>> = {<No data fields>}, <No data fields>},
                  _M_p = 0x7f3b6cc61570 <Address 0x7f3b6cc61570 out of bounds>}, _M_string_length = 99, {
                  _M_local_buf = "c\000\000\000\000\000\000\000\001\000\000\000\000\000\000", _M_allocated_capacity = 99}}
              nexusCtx = @0x7f3aedbc2240: <error reading variable>
              primaryCtx = @0x7f3b08ab4de0: <error reading variable>
              secondaryCtx = <optimized out>
              primaryScanResult = scan_again
              secondaryScanResult = <optimized out>
              primaryScanCallback = <optimized out>
              primaryCacheLookup = @0x7f3b833964a0: <error reading variable>
      #5  0x00000000008d229d in DCPBackfillBySeqnoDisk::scan() ()
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/dcp/backfill_by_seqno_disk.cc:173
              stream = {<std::__shared_ptr<ActiveStream, (__gnu_cxx::_Lock_policy)2>> = {<std::__shared_ptr_access<ActiveStream, (__gnu_cxx::_Lock_policy)2, false, false>> = {<No data fields>}, _M_ptr = <optimized out>, _M_refcount = {_M_pi = 0x7f3b818b3d00}}, <No data fields>}
              kvstore = 0x7f3c3c158860
              bySeqnoCtx = @0x7f3aedbc2240: <error reading variable>
      #6  0x00000000008d5c52 in DCPBackfillDisk::run() ()
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/dcp/backfill_disk.cc:131
              lh = {_M_device = @0x7f3b0c92f888}
              runtimeGuard = {<folly::detail::ScopeGuardImplBase> = {dismissed_ = false}, function_ = {__start = {__d = {__r = 8067972280088974}},
                  __this = 0x7f3b0c92f880}}
      #7  0x00000000008d7671 in BackfillManager::backfill() ()
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/dcp/backfill-manager.cc:344
              lh = {_M_device = <optimized out>, _M_owns = false}
              backfill = {_M_t = {<std::__uniq_ptr_impl<DCPBackfillIface, std::default_delete<DCPBackfillIface> >> = {
                    _M_t = {<std::_Tuple_impl<0, DCPBackfillIface*, std::default_delete<DCPBackfillIface> >> = {<std::_Tuple_impl<1, std::default_delete<DCPBackfillIface> >> = {<std::_Head_base<1, std::default_delete<DCPBackfillIface>, true>> = {<std::default_delete<DCPBackfillIface>> = {<No data fields>}, <No data fields>}, <No data fields>}, <std::_Head_base<0, DCPBackfillIface*, false>> = {
                          _M_head_impl = 0x7f3b0c92f8d8}, <No data fields>}, <No data fields>}}, <No data fields>}}
              source = BackfillManager::Active
              status = <optimized out>
      #8  0x00000000008d7e14 in BackfillManagerTask::run() ()
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/engines/ep/src/dcp/backfill-manager.cc:78
              phosphor_internal_category_enabled_67 = {_M_b = {_M_p = 0x0}, static is_always_lock_free = <optimized out>}
              phosphor_internal_category_enabled_temp_67 = <optimized out>
              phosphor_internal_tpi_67 = {category = 0x0, name = 0x0, type = phosphor::AsyncStart, argument_names = {_M_elems = {0x0, 0x0}},
                argument_types = {_M_elems = {phosphor::is_bool, phosphor::is_bool}}}
              phosphor_internal_guard_67 = {tpi = 0x106a360 <BackfillManagerTask::run()::phosphor_internal_tpi_67>, enabled = true,
                arg1 = {<No data fields>}, arg2 = {<No data fields>}, start = {__d = {__r = 8067972280086509}}}
              manager = {<std::__shared_ptr<BackfillManager, (__gnu_cxx::_Lock_policy)2>> = {<std::__shared_ptr_access<BackfillManager, (__gnu_cxx::_Lock_policy)2, false, false>> = {<No data fields>}, _M_ptr = <optimized out>, _M_refcount = {_M_pi = 0x7f3b91809800}}, <No data fields>}
              status = <optimized out>
      #9  0x0000000000aaa622 in GlobalTask::execute() () at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/executor/globaltask.cc:68
       
       
       
              guard = {previous = 0x0}
       
      #10 0x0000000000aa7725 in FollyExecutorPool::TaskProxy::scheduleViaCPUPool()::{lambda()#2}::operator()() const (__closure=0x7f3c05fe9840)
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/executor/folly_executorpool.cc:189
              scheduleOverhead = <optimized out>
              start = {__d = {__r = 8067972280085022}}
              runAgain = false
              proxy = @0x7f3c2436f910: <error reading variable>
      #11 0x0000000000bfbbd0 in operator() (this=0x7f3c05fe9840)
          at /home/couchbase/jenkins/workspace/cbdeps-platform-build-old/deps/packages/build/folly/folly-prefix/src/folly/folly/Function.h:416
              fn = @0x7f3c05fe9840: {<folly::detail::function::FunctionTraits<void()>> = {<No data fields>}, data_ = {big = 0x7f3c2436f910, tiny = {
                    __data = "\020\371\066$<\177\000\000\000\000\000\000\000\000\000\000\260\200}@<\177\000\000\314.\000\000\000\000\000\000\030\000\000\000\000\000\000\000\020\231\376\005<\177\000", __align = {<No data fields>}}},
                call_ = 0xaa7de0 <folly::detail::function::FunctionTraits<void ()>::callSmall<FollyExecutorPool::TaskProxy::scheduleViaCPUPool()::{lambda()#2}>(folly::detail::function::Data&)>,
                exec_ = 0xaa6530 <folly::detail::function::execSmall<FollyExecutorPool::TaskProxy::scheduleViaCPUPool()::{lambda()#2}>(folly::detail::function::Op, folly::detail::function::Data*, folly::detail::function::Data)>}
      #12 folly::ThreadPoolExecutor::runTask(std::shared_ptr<folly::ThreadPoolExecutor::Thread> const&, folly::ThreadPoolExecutor::Task&&) (
          this=this@entry=0x7f3c40751000, thread=...,
          task=task@entry=<unknown type in /usr/lib/debug/opt/couchbase/bin/memcached-7.1.0-2111.x86_64.debug, CU 0xa3160b8, DIE 0xa399fed>)
          at /home/couchbase/jenkins/workspace/cbdeps-platform-build-old/deps/packages/build/folly/folly-prefix/src/folly/folly/executors/ThreadPoolExecutor.cpp:97
              rctx = {
      ---Type <return> to continue, or q <return> to quit---
                prev_ = {<std::__shared_ptr<folly::RequestContext, (__gnu_cxx::_Lock_policy)2>> = {<std::__shared_ptr_access<folly::RequestContext, (__gnu_cxx::_Lock_policy)2, false, false>> = {<No data fields>}, _M_ptr = 0x0, _M_refcount = {_M_pi = 0x0}}, <No data fields>}}
              startTime = {__d = {__r = 8067972280081333}}
              stats = {expired = false, waitTime = {__r = 6127}, runTime = {__r = 0}, enqueueTime = {__d = {__r = 8067972280075206}}, requestId = 0}
      #13 0x0000000000be650a in folly::CPUThreadPoolExecutor::threadRun (this=0x7f3c40751000, thread=...)
          at /home/couchbase/jenkins/workspace/cbdeps-platform-build-old/deps/packages/build/folly/folly-prefix/src/folly/folly/executors/CPUThreadPoolExecutor.cpp:265
              task = {storage_ = {{emptyState = 16 '\020', value = {<folly::ThreadPoolExecutor::Task> = {
                        func_ = {<folly::detail::function::FunctionTraits<void()>> = {<No data fields>}, data_ = {big = 0x7f3c2436f910, tiny = {
                              __data = "\020\371\066$<\177\000\000\000\000\000\000\000\000\000\000\260\200}@<\177\000\000\314.\000\000\000\000\000\000\030\000\000\000\000\000\000\000\020\231\376\005<\177\000", __align = {<No data fields>}}},
                          call_ = 0xaa7de0 <folly::detail::function::FunctionTraits<void ()>::callSmall<FollyExecutorPool::TaskProxy::scheduleViaCPUPool()::{lambda()#2}>(folly::detail::function::Data&)>,
                          exec_ = 0xaa6530 <folly::detail::function::execSmall<FollyExecutorPool::TaskProxy::scheduleViaCPUPool()::{lambda()#2}>(folly::detail::function::Op, folly::detail::function::Data*, folly::detail::function::Data)>}, enqueueTime_ = {__d = {__r = 8067972280075206}}, expiration_ = {
                          __r = 0}, expireCallback_ = {<folly::detail::function::FunctionTraits<void()>> = {<No data fields>}, data_ = {big = 0x2ecc,
                            tiny = {
                              __data = "\314.\000\000\000\000\000\000\vq\243", '\000' <repeats 13 times>, "_\036\213D<\177\000\000p\312.?<\177\000\000@\266\253D<\177\000", __align = {<No data fields>}}}, call_ = 0x4662f9
           <folly::detail::function::FunctionTraits<void ()>::uninitCall(folly::detail::function::Data&)>, exec_ = 0x0},
                        context_ = {<std::__shared_ptr<folly::RequestContext, (__gnu_cxx::_Lock_policy)2>> = {<std::__shared_ptr_access<folly::RequestContext, (__gnu_cxx::_Lock_policy)2, false, false>> = {<No data fields>}, _M_ptr = 0x0, _M_refcount = {_M_pi = 0x0}}, <No data fields>}}, poison = false,
                      priority_ = 0 '\000', queueObserverPayload_ = 139896775787984}}, hasValue = true}}
              guard = {list_ = {forbid = true, prev = 0x0, curr = {name = {static npos = <optimized out>, b_ = 0xccd46b "CPUThreadPoolExecutor",
                      e_ = 0xccd480 ""}}}}
      #14 0x0000000000bfeb89 in __invoke_impl<void, void (folly::ThreadPoolExecutor::*&)(std::shared_ptr<folly::ThreadPoolExecutor::Thread>), folly::ThreadPoolExecutor*&, std::shared_ptr<folly::ThreadPoolExecutor::Thread>&> (__t=<optimized out>, __f=<optimized out>)
          at /usr/local/include/c++/7.3.0/bits/invoke.h:73
      No locals.
      #15 __invoke<void (folly::ThreadPoolExecutor::*&)(std::shared_ptr<folly::ThreadPoolExecutor::Thread>), folly::ThreadPoolExecutor*&, std::shared_ptr<folly::ThreadPoolExecutor::Thread>&> (__fn=<optimized out>) at /usr/local/include/c++/7.3.0/bits/invoke.h:95
      No locals.
      #16 __call<void, 0, 1> (__args=<optimized out>, this=<optimized out>) at /usr/local/include/c++/7.3.0/functional:467
      No locals.
      #17 operator()<> (this=<optimized out>) at /usr/local/include/c++/7.3.0/functional:551
      No locals.
      #18 folly::detail::function::FunctionTraits<void ()>::callBig<std::_Bind<void (folly::ThreadPoolExecutor::*(folly::ThreadPoolExecutor*, std::shared_ptr<folly::ThreadPoolExecutor::Thread>))(std::shared_ptr<folly::ThreadPoolExecutor::Thread>)> >(folly::detail::function::Data&) (p=...)
          at /home/couchbase/jenkins/workspace/cbdeps-platform-build-old/deps/packages/build/folly/folly-prefix/src/folly/folly/Function.h:401
      ---Type <return> to continue, or q <return> to quit---
              fn = <optimized out>
      #19 0x0000000000aa73b4 in operator() (this=0x7f3c40bd9c00)
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/executor/folly_executorpool.cc:47
              fn = @0x7f3c40bd9c00: <error reading variable>
      #20 operator() (__closure=0x7f3c40bd9c00) at /home/couchbase/jenkins/workspace/couchbase-server-unix/kv_engine/executor/folly_executorpool.cc:47
              threadNameOpt = {storage_ = {{emptyState = -96 '\240', value = {static npos = 18446744073709551615,
                      _M_dataplus = {<std::allocator<char>> = {<__gnu_cxx::new_allocator<char>> = {<No data fields>}, <No data fields>},
                        _M_p = 0x7f3c05fe99a0 "AuxIoPool0"}, _M_string_length = 10, {_M_local_buf = "AuxIoPool0\000\000\000\000\000",
                        _M_allocated_capacity = 8029725099529106753}}}, hasValue = true}}
              func = <error reading variable func (Cannot access memory at address 0x7f3c40bd9c00)>
      #21 folly::detail::function::FunctionTraits<void ()>::callBig<CBRegisteredThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}>(folly::detail::function::Data&) (p=...)
          at /home/couchbase/jenkins/workspace/couchbase-server-unix/server_build/tlm/deps/folly.exploded/include/folly/Function.h:401
              fn = @0x7f3c40bd9c00: <error reading variable>
      #22 0x00007f3c42678d40 in execute_native_thread_routine () from /opt/couchbase/bin/../lib/libstdc++.so.6
      No symbol table info available.
      #23 0x00007f3c44480e65 in start_thread () from /lib64/libpthread.so.0
      No symbol table info available.
      #24 0x00007f3c41dc188d in clone () from /lib64/libc.so.6
      No symbol table info available.
      

      Rebalance Failure:

      Rebalance exited with reason {mover_crashed,
      {unexpected_exit,
      {'EXIT',<0.24217.9>,
      {{{{nocatch,{error,closed}},
      [{mc_binary,recv_with_data,4,
      [{file,"src/mc_binary.erl"},{line,41}]},
      {mc_binary,quick_active_recv,3,
      [{file,"src/mc_binary.erl"},{line,48}]},
      {mc_binary,quick_stats_loop_enter,5,
      [{file,"src/mc_binary.erl"},{line,100}]},
      {mc_binary,quick_stats,5,
      [{file,"src/mc_binary.erl"},{line,85}]},
      {mc_client_binary,get_dcp_docs_estimate,
      3,
      [{file,"src/mc_client_binary.erl"},
      {line,754}]},
      {ns_memcached,do_handle_call,3,
      [{file,"src/ns_memcached.erl"},
      {line,617}]},
      {ns_memcached,worker_loop,3,
      [{file,"src/ns_memcached.erl"},
      {line,226}]},
      {proc_lib,init_p_do_apply,3,
      [{file,"proc_lib.erl"},{line,226}]}]},
      {gen_server,call,
      ['ns_memcached-default',
      {get_dcp_docs_estimate,785,
      "replication:ns_1@172.23.122.247->ns_1@172.23.122.245:default"},
      300000]}},
      {gen_server,call,
      [{'janitor_agent-default',
      'ns_1@172.23.122.247'},
      {if_rebalance,<0.21558.8>,
      {wait_dcp_data_move,
      ['ns_1@172.23.122.246',
      'ns_1@172.23.122.245'],
      785}},
      infinity]}}}}}.
      Rebalance Operation Id = 61bac83551af21cdc012bcb2fd13d3e2
      

      QE-TEST:

      git fetch "https://review.couchbase.org/TAF" refs/changes/88/166488/1 && git checkout FETCH_HEAD
       
      guides/gradlew --refresh-dependencies testrunner -P jython=/opt/jython/bin/jython -P 'args=-i /tmp/qe_r.ini -p bucket_storage=couchstore,rerun=false,bucket_eviction_policy=fullEviction,init_loading=False -t storage.magma.magma_rebalance.MagmaRebalance.test_data_load_collections_with_rebalance_out,num_items=300000,doc_size=28,nodes_init=2,nodes_out=1,standard_buckets=1,magma_buckets=0,bucket_storage=couchstore,data_load_stage=during,sdk_timeout=60,vbuckets=1024,key_size=22,replicas=0,infra_log_level=debug,log_level=debug,skip_cleaup=True,randomize_value=True,bucket_eviction_policy=fullEviction,infra_log_level=debug,log_level=debug,init_loading=False,fragmentation=50,skip_cleanup=True,autoCompactionDefined=true,iterations=1,enable_dp=True,num_collections=101,num_scopes=1,bucket_ram_quota=2056,skip_cleanup=True,sdk_client_pool=False,ops_rate=30000,doc_ops=create,create_perc=0,delete_perc=50,update_perc=50,num_collections_to_drop=50,get-cbcollect-info=True -m rest'
      

      Attachments

        Issue Links

          For Gerrit Dashboard: MB-50492
          # Subject Branch Project Status CR V

          Activity

            People

              ankush.sharma Ankush Sharma
              ankush.sharma Ankush Sharma
              Votes:
              0 Vote for this issue
              Watchers:
              4 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                PagerDuty