Uploaded image for project: 'Couchbase Server'
  1. Couchbase Server
  2. MB-6020

log why ns_server is shutting down ep-engine ( which operation failed ) + why ns_server shuts down ep-engine anyways

    XMLWordPrintable

Details

    Description

      Install couchbase server 2.0.0-1492 on 12 nodes cluster.
      Load 30 million items to bucket and let load running.
      Remove 2 node 26 and 28.
      Do swap rebalance (add back node 26, 28 and remove node 24, 25)
      When rebalance finish, reboot node 14. Node 14 join back to cluster and done warm up
      Let the load running about an hour without doing rebalance or query view.
      Observer node 20 with high disk write queue, cpu usage 20% and low drain rate.
      Then node 20 down due to memcached crashed

      Core was generated by `/opt/couchbase/bin/memcached -X /opt/couchbase/lib/memcached/stdin_term_handler'.
      Program terminated with signal 6, Aborted.
      #0 0x0000003971e32885 in raise () from /lib64/libc.so.6

      Thread 13 (Thread 0x7f51a6575700 (LWP 24860)):
      #0 0x0000003971ee62c3 in epoll_wait () from /lib64/libc.so.6
      #1 0x00007f51a9e06576 in epoll_dispatch (base=0x592e780, tv=<value optimized out>) at epoll.c:404
      #2 0x00007f51a9df1e44 in event_base_loop (base=0x592e780, flags=<value optimized out>) at event.c:1558
      #3 0x00000000004144c4 in worker_libevent (arg=0x14ce8e0) at daemon/thread.c:301
      #4 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #5 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 12 (Thread 0x7f51a9986700 (LWP 24855)):
      #0 0x0000003971ed89cd in read () from /lib64/libc.so.6
      #1 0x0000003971e71128 in _IO_new_file_underflow () from /lib64/libc.so.6
      #2 0x0000003971e72c2e in _IO_default_uflow_internal () from /lib64/libc.so.6
      #3 0x0000003971e6e11b in getc () from /lib64/libc.so.6
      #4 0x00007f51a9987879 in check_stdin_thread (arg=0x403420) at extensions/daemon/stdin_check.c:19
      #5 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #6 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 11 (Thread 0x7f51a1733700 (LWP 9440)):
      #0 0x0000003971eab15d in nanosleep () from /lib64/libc.so.6
      #1 0x0000003971edf124 in usleep () from /lib64/libc.so.6
      #2 0x00007f51a50121ea in Flusher::wait (this=0x59b0120) at flusher.cc:35
      #3 0x00007f51a4fe7601 in EventuallyPersistentStore::~EventuallyPersistentStore (this=0x59b6000, __in_chrg=<value optimized out>) at ep.cc:524
      #4 0x00007f51a500f353 in EventuallyPersistentEngine::~EventuallyPersistentEngine() () from /opt/couchbase/lib/memcached/ep.so
      #5 0x00007f51a4ffd275 in EvpDestroy (handle=<value optimized out>, force=false) at ep_engine.cc:125
      #6 0x00007f51a8d7d616 in engine_shutdown_thread (arg=0x59342a0) at bucket_engine.c:1386
      #7 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #8 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 10 (Thread 0x7f51a8d79700 (LWP 24856)):
      #0 0x0000003971ee62c3 in epoll_wait () from /lib64/libc.so.6
      #1 0x00007f51a9e06576 in epoll_dispatch (base=0x592e500, tv=<value optimized out>) at epoll.c:404
      #2 0x00007f51a9df1e44 in event_base_loop (base=0x592e500, flags=<value optimized out>) at event.c:1558
      #3 0x00000000004144c4 in worker_libevent (arg=0x14ce500) at daemon/thread.c:301
      #4 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #5 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 9 (Thread 0x7f51a7977700 (LWP 24858)):
      #0 0x0000003971ee62c3 in epoll_wait () from /lib64/libc.so.6
      #1 0x00007f51a9e06576 in epoll_dispatch (base=0x592ec80, tv=<value optimized out>) at epoll.c:404
      #2 0x00007f51a9df1e44 in event_base_loop (base=0x592ec80, flags=<value optimized out>) at event.c:1558
      #3 0x00000000004144c4 in worker_libevent (arg=0x14ce6f0) at daemon/thread.c:301
      #4 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #5 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 8 (Thread 0x7f51a9b89720 (LWP 24846)):
      #0 0x0000003971ee62c3 in epoll_wait () from /lib64/libc.so.6
      #1 0x00007f51a9e06576 in epoll_dispatch (base=0x592e000, tv=<value optimized out>) at epoll.c:404
      #2 0x00007f51a9df1e44 in event_base_loop (base=0x592e000, flags=<value optimized out>) at event.c:1558
      #3 0x0000000000409746 in main (argc=<value optimized out>, argv=<value optimized out>) at daemon/memcached.c:7920

      Thread 7 (Thread 0x7f51a3536700 (LWP 24874)):
      #0 0x000000397260b3dc in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
      #1 0x00007f51a4fdb167 in wait (this=0x596b880) at syncobject.hh:36
      #2 Dispatcher::run (this=0x596b880) at dispatcher.cc:135
      #3 0x00007f51a4fdb74b in launch_dispatcher_thread (arg=0x596b880) at dispatcher.cc:28
      #4 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #5 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 6 (Thread 0x7f51a6f76700 (LWP 24859)):
      #0 0x0000003971ee62c3 in epoll_wait () from /lib64/libc.so.6
      #1 0x00007f51a9e06576 in epoll_dispatch (base=0x592ea00, tv=<value optimized out>) at epoll.c:404
      #2 0x00007f51a9df1e44 in event_base_loop (base=0x592ea00, flags=<value optimized out>) at event.c:1558
      #3 0x00000000004144c4 in worker_libevent (arg=0x14ce7e8) at daemon/thread.c:301
      #4 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #5 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 5 (Thread 0x7f51a3f37700 (LWP 24873)):
      #0 0x000000397260b75b in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
      #1 0x00007f51a4fd88d0 in wait (this=0x5994100, d=...) at syncobject.hh:47
      #2 IdleTask::run (this=0x5994100, d=...) at dispatcher.cc:341
      #3 0x00007f51a4fdaf34 in Dispatcher::run (this=0x596aa80) at dispatcher.cc:169
      #4 0x00007f51a4fdb74b in launch_dispatcher_thread (arg=0x596aa80) at dispatcher.cc:28
      #5 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #6 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 4 (Thread 0x7f51a4938700 (LWP 24872)):
      #0 0x000000397260edf3 in pread64 () from /lib64/libpthread.so.0
      #1 0x00007f51a4d64e20 in couch_pread (handle=<value optimized out>, buf=0x5d9a431, nbyte=8192, offset=14540800) at src/os.c:75
      #2 0x00007f51a4d63b6b in load_buffer_from (handle=0xa785a960, buf=<value optimized out>, nbyte=8, offset=<value optimized out>) at src/iobuffer.c:153
      #3 buffered_pread (handle=0xa785a960, buf=<value optimized out>, nbyte=8, offset=<value optimized out>) at src/iobuffer.c:291
      #4 0x00007f51a4d6218c in read_skipping_prefixes (db=0x6294a10, pos=14546986, ret_ptr=0x7f51a4936ff8, header=0) at src/couch_file_read.c:27
      #5 pread_bin_internal (db=0x6294a10, pos=14546986, ret_ptr=0x7f51a4936ff8, header=0) at src/couch_file_read.c:53
      #6 0x00007f51a4d62397 in pread_compressed (db=<value optimized out>, pos=<value optimized out>, ret_ptr=0x7f51a4937098) at src/couch_file_read.c:92
      #7 0x00007f51a4d5e62a in modify_node (rq=0x7f51a49374e0, nptr=0xdcc32a24, start=2172, end=2176, dst=0xdcc328ec) at src/btree_modify.c:321
      #8 0x00007f51a4d5ef45 in modify_node (rq=0x7f51a49374e0, nptr=0xdcc328bc, start=2172, end=2176, dst=0xdcc32804) at src/btree_modify.c:432
      #9 0x00007f51a4d5ef45 in modify_node (rq=0x7f51a49374e0, nptr=0xdcc327d4, start=2172, end=2176, dst=0xdcc27588) at src/btree_modify.c:432
      #10 0x00007f51a4d5e8b5 in modify_node (rq=0x7f51a49374e0, nptr=0xdcc27558, start=2172, end=2242, dst=0xdcf221b0) at src/btree_modify.c:468
      #11 0x00007f51a4d5e8b5 in modify_node (rq=0x7f51a49374e0, nptr=0xdcf22180, start=2040, end=2982, dst=0xdcf220c8) at src/btree_modify.c:468
      #12 0x00007f51a4d5e8b5 in modify_node (rq=0x7f51a49374e0, nptr=0x4a6c6e80, start=0, end=6394, dst=0xdcf22010) at src/btree_modify.c:468
      #13 0x00007f51a4d5f1f7 in modify_btree (rq=0x7f51a49374e0, root=0x4a6c6e80, errcode=0x7f51a49375ec) at src/btree_modify.c:620
      #14 0x00007f51a4d617c5 in update_indexes (db=0x6294a10, seqs=0xdca60010, seqvals=0xdca78fb0, ids=<value optimized out>, idvals=0xdca85780, numdocs=3197) at src
      /couch_save.c:198
      #15 0x00007f51a4d61ed1 in couchstore_save_documents (db=0x6294a10, docs=0xdce3c800, infos=0xdce36000, numdocs=3197, options=0) at src/couch_save.c:354
      #16 0x00007f51a5063b6b in CouchKVStore::saveDocs (this=0x5988680, vbid=505, rev=257, docs=0xdce3c800, docinfos=0xdce36000, docCount=3197) at couch-kvstore/couc
      h-kvstore.cc:1548
      #17 0x00007f51a5064237 in CouchKVStore::commit2couchstore (this=0x5988680) at couch-kvstore/couch-kvstore.cc:1479
      #18 0x00007f51a5064348 in CouchKVStore::commit (this=0x5988680) at couch-kvstore/couch-kvstore.cc:868
      #19 0x00007f51a4fe7ff5 in TransactionContext::commit (this=0x59b6328) at ep.cc:2571
      #20 0x00007f51a4fe903c in EventuallyPersistentStore::flushSome (this=0x59b6000, q=0x59b62c0, rejectQueue=0x57501f40) at ep.cc:1858
      #21 0x00007f51a5011dec in Flusher::doFlush (this=0x59b0120) at flusher.cc:244
      #22 0x00007f51a50127b5 in Flusher::step (this=0x59b0120, d=..., tid=std::tr1::shared_ptr (count 4) 0x5994300) at flusher.cc:158
      #23 0x00007f51a5012e30 in FlusherStepper::callback (this=<value optimized out>, d=<value optimized out>, t=<value optimized out>) at flusher.cc:23
      #24 0x00007f51a4fdc00f in Task::run (this=<value optimized out>, d=<value optimized out>, t=<value optimized out>) at dispatcher.hh:142
      #25 0x00007f51a4fdaf34 in Dispatcher::run (this=0x596ac40) at dispatcher.cc:169
      #26 0x00007f51a4fdb74b in launch_dispatcher_thread (arg=0x596ac40) at dispatcher.cc:28
      #27 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #28 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 3 (Thread 0x7f51a8378700 (LWP 24857)):
      #0 0x0000003971ee62c3 in epoll_wait () from /lib64/libc.so.6
      #1 0x00007f51a9e06576 in epoll_dispatch (base=0x592e280, tv=<value optimized out>) at epoll.c:404
      #2 0x00007f51a9df1e44 in event_base_loop (base=0x592e280, flags=<value optimized out>) at event.c:1558
      #3 0x00000000004144c4 in worker_libevent (arg=0x14ce5f8) at daemon/thread.c:301
      #4 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #5 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 2 (Thread 0x7f51a5b74700 (LWP 24871)):
      #0 0x0000003971eab15d in nanosleep () from /lib64/libc.so.6
      #1 0x0000003971edf124 in usleep () from /lib64/libc.so.6
      #2 0x00007f51a5017c72 in updateStatsThread (arg=0x14cc4c0) at memory_tracker.cc:31
      #3 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #4 0x0000003971ee5ccd in clone () from /lib64/libc.so.6

      Thread 1 (Thread 0x7f51a2b35700 (LWP 24875)):
      #0 0x0000003971e32885 in raise () from /lib64/libc.so.6
      #1 0x0000003971e34065 in abort () from /lib64/libc.so.6
      #2 0x00007f51a5018c00 in Mutex::acquire (this=0x5e0ac50) at mutex.cc:73
      #3 0x00007f51a502de75 in lock (this=0x5e0ac00, value=false) at locks.hh:48
      #4 LockHolder (this=0x5e0ac00, value=false) at locks.hh:26
      #5 TapProducer::setSuspended (this=0x5e0ac00, value=false) at tapconnection.cc:819
      #6 0x00007f51a503335f in TapResumeCallback::callback(Dispatcher&, std::tr1::shared_ptr<Task>) () from /opt/couchbase/lib/memcached/ep.so
      #7 0x00007f51a4fdc00f in Task::run (this=<value optimized out>, d=<value optimized out>, t=<value optimized out>) at dispatcher.hh:142
      #8 0x00007f51a4fdaf34 in Dispatcher::run (this=0x596b6c0) at dispatcher.cc:169
      #9 0x00007f51a4fdb74b in launch_dispatcher_thread (arg=0x596b6c0) at dispatcher.cc:28
      #10 0x00000039726077f1 in start_thread () from /lib64/libpthread.so.0
      #11 0x0000003971ee5ccd in clone () from /lib64/libc.so.6
      --------------------------------------------------------------------------------

      Diags for all nodes in the following link
      https://s3.amazonaws.com/packages.couchbase/diag-logs/large_cluster_2_0/10-node-tapproducer-suppended.tgz

      Attachments

        No reviews matched the request. Check your Options in the drop-down menu of this sections header.

        Activity

          People

            deepkaran.salooja Deepkaran Salooja
            thuan Thuan Nguyen
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved:

              Gerrit Reviews

                There are no open Gerrit changes

                PagerDuty