Details
-
Bug
-
Resolution: Fixed
-
Blocker
-
2.0
-
Security Level: Public
-
None
-
Centos
1024 vbuckets.
Set up unidirectional replication on a 2:2 node cluster
Description
Setup:
--------------
1. Set up a 2:2 node cluster with unidirectional replication from cluster1 to cluster2.
2. Load 12M items on the source cluster; expect all 12M items to be replicated to the destination cluster.
Output
-----------
1. Memcached connections are lost intermittently on the destination cluster.
2. Replication failed after 1.5M items had been replicated.
3. Source node (31) crashed.
- Note: The healthy source node shows 0 active items and 12M replica items, which is unexpected behaviour.
However, it looks like the active item distribution was already uneven before the crash, i.e.:
Source Node 1: Active items (0), Replica items (12M)
Source Node 2: Active items (12M), Replica items (0)
Web Logs
----------
Control connection to memcached on 'ns_1@10.5.2.31' disconnected: {badmatch,
{error,
timeout}}
Stack Trace from crashed node
------------------------------------------
Thread 10 (Thread 0x2ad68db07220 (LWP 9128)):
#0 0x0000003f142d48a8 in epoll_wait () from /lib64/libc.so.6
#1 0x00002ad68d685576 in epoll_dispatch (base=0x174b0000, tv=<value optimized out>) at epoll.c:404
#2 0x00002ad68d670e44 in event_base_loop (base=0x174b0000, flags=<value optimized out>) at event.c:1558
#3 0x0000000000409746 in main (argc=<value optimized out>, argv=<value optimized out>) at daemon/memcached.c:7920
Thread 9 (Thread 9139):
#0 0x0000003f142c678b in read () from /lib64/libc.so.6
#1 0x0000003f1426cd57 in _IO_new_file_underflow () from /lib64/libc.so.6
#2 0x0000003f1426d71e in _IO_default_uflow_internal () from /lib64/libc.so.6
#3 0x0000003f14268fdb in getc () from /lib64/libc.so.6
#4 0x00002ad68db08879 in check_stdin_thread (arg=0x403420) at extensions/daemon/stdin_check.c:19
#5 0x0000003f14e0673d in start_thread () from /lib64/libpthread.so.0
#6 0x0000003f142d44bd in clone () from /lib64/libc.so.6
Thread 8 (Thread 9140):
#0 0x0000003f142d48a8 in epoll_wait () from /lib64/libc.so.6
#1 0x00002ad68d685576 in epoll_dispatch (base=0x174b0500, tv=<value optimized out>) at epoll.c:404
#2 0x00002ad68d670e44 in event_base_loop (base=0x174b0500, flags=<value optimized out>) at event.c:1558
#3 0x00000000004144c4 in worker_libevent (arg=0x13050500) at daemon/thread.c:301
#4 0x0000003f14e0673d in start_thread () from /lib64/libpthread.so.0
#5 0x0000003f142d44bd in clone () from /lib64/libc.so.6
Thread 7 (Thread 9141):
#0 0x0000003f142d48a8 in epoll_wait () from /lib64/libc.so.6
#1 0x00002ad68d685576 in epoll_dispatch (base=0x174b0280, tv=<value optimized out>) at epoll.c:404
#2 0x00002ad68d670e44 in event_base_loop (base=0x174b0280, flags=<value optimized out>) at event.c:1558
#3 0x00000000004144c4 in worker_libevent (arg=0x130505f8) at daemon/thread.c:301
#4 0x0000003f14e0673d in start_thread () from /lib64/libpthread.so.0
#5 0x0000003f142d44bd in clone () from /lib64/libc.so.6
Thread 6 (Thread 9142):
#0 0x0000003f142d48a8 in epoll_wait () from /lib64/libc.so.6
#1 0x00002ad68d685576 in epoll_dispatch (base=0x174b0c80, tv=<value optimized out>) at epoll.c:404
#2 0x00002ad68d670e44 in event_base_loop (base=0x174b0c80, flags=<value optimized out>) at event.c:1558
#3 0x00000000004144c4 in worker_libevent (arg=0x130506f0) at daemon/thread.c:301
#4 0x0000003f14e0673d in start_thread () from /lib64/libpthread.so.0
#5 0x0000003f142d44bd in clone () from /lib64/libc.so.6
Thread 5 (Thread 9143):
#0 0x0000003f142d48a8 in epoll_wait () from /lib64/libc.so.6
#1 0x00002ad68d685576 in epoll_dispatch (base=0x174b0a00, tv=<value optimized out>) at epoll.c:404
#2 0x00002ad68d670e44 in event_base_loop (base=0x174b0a00, flags=<value optimized out>) at event.c:1558
#3 0x00000000004144c4 in worker_libevent (arg=0x130507e8) at daemon/thread.c:301
#4 0x0000003f14e0673d in start_thread () from /lib64/libpthread.so.0
#5 0x0000003f142d44bd in clone () from /lib64/libc.so.6
Thread 4 (Thread 9144):
#0 0x0000003f142d48a8 in epoll_wait () from /lib64/libc.so.6
#1 0x00002ad68d685576 in epoll_dispatch (base=0x174b0780, tv=<value optimized out>) at epoll.c:404
#2 0x00002ad68d670e44 in event_base_loop (base=0x174b0780, flags=<value optimized out>) at event.c:1558
#3 0x00000000004144c4 in worker_libevent (arg=0x130508e0) at daemon/thread.c:301
#4 0x0000003f14e0673d in start_thread () from /lib64/libpthread.so.0
#5 0x0000003f142d44bd in clone () from /lib64/libc.so.6
Thread 3 (Thread 9163):
#0 0x0000003f1429a541 in nanosleep () from /lib64/libc.so.6
#1 0x0000003f142cded4 in usleep () from /lib64/libc.so.6
#2 0x00002aaaaad67282 in MemoryTracker::MemoryTracker (this=0x1304e4c0) at memory_tracker.cc:79
#3 0x0000000000000000 in ?? ()
Thread 2 (Thread 24365):
#0 0x0000003f14e07b35 in pthread_join () from /lib64/libpthread.so.0
#1 0x00002aaaaad27082 in Dispatcher::stop (this=0x475399d0, force=false) at dispatcher.cc:201
#2 0x00002ad68d8ea900 in tc_delete () from /opt/couchbase/lib/libtcmalloc_minimal.so.4
#3 0x0000000000000000 in ?? ()
Thread 1 (Thread 0x47539940 (LWP 9167)):
#0 0x0000003f14230265 in raise () from /lib64/libc.so.6
#1 0x0000003f14231d10 in abort () from /lib64/libc.so.6
#2 0x00002aaaaad68210 in _S_compare (this=0x18ed4c58, __x=<value optimized out>, __p=0x3f14e0bd20, __v=...)
at /usr/lib/gcc/x86_64-redhat-linux6E/4.4.6/../../../../include/c++/4.4.6/bits/basic_string.h:398
#3 compare (this=0x18ed4c58, __x=<value optimized out>, __p=0x3f14e0bd20, __v=...)
at /usr/lib/gcc/x86_64-redhat-linux6E/4.4.6/../../../../include/c++/4.4.6/bits/basic_string.h:2027
#4 operator< <char, std::char_traits<char>, std::allocator<char> > (this=0x18ed4c58, __x=<value optimized out>,
__p=0x3f14e0bd20, __v=...) at /usr/lib/gcc/x86_64-redhat-linux6E/4.4.6/../../../../include/c++/4.4.6/bits/basic_string.h:2317
#5 operator() (this=0x18ed4c58, __x=<value optimized out>, __p=0x3f14e0bd20, __v=...)
at /usr/lib/gcc/x86_64-redhat-linux6E/4.4.6/../../../../include/c++/4.4.6/bits/stl_function.h:230
#6 std::_Rb_tree<std::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, unsigned long>, std::_Select1st<std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, unsigned long> >, std::less<std::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, unsigned long> > >::_M_insert (this=0x18ed4c58, __x=<value optimized out>, __p=0x3f14e0bd20, __v=...)
at /usr/lib/gcc/x86_64-redhat-linux6E/4.4.6/../../../../include/c++/4.4.6/bits/stl_tree.h:879
#7 0x00002ad68d8d7c00 in tcmalloc::SLL_Push(void**, void*) () from /opt/couchbase/lib/libtcmalloc_minimal.so.4
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
Ns_server logs attached.
https://s3.amazonaws.com/bugdb/jira/source_cluster/source.tar