Details
-
Bug
-
Resolution: Fixed
-
Critical
-
1.8.1
-
Security Level: Public
-
14 node cluster in DGM [10.3.121.206: master node]
1 bucket (default), 1024 vbuckets
Build : 1.8.1-831-rel
Description
Setup
1. Setup a 14 node cluster with 1024 vbuckets for a single bucket 'default'
2. Load ~26M items on the cluster, now in DGM with resident ratios at 84% and 80%
3. Enable Auto-failover
4. Reboot non-master node (228)
Output
1. Node 228 is rebooted
2. Node 228 is failed over
3. Node 206 (the master node, which was not rebooted) is also failed over.
4. 1% of data is lost from the bucket 'default'
The cluster can be accessed at http://10.3.121.206:8091/index.html#sec=monitor_servers
Error Messages from UI Log
Node ('ns_1@10.3.121.206') was automatically failovered.
[{last_heard,{1337,663695,108570}},
,
{ready_buckets,[]},
{replication,[
]},
{memory,[
,
{processes,150436832},
{processes_used,150315096},
{system,39957008},
{atom,1189649},
{atom_used,1164734},
{binary,2200296},
{code,11682249},
{ets,13939048}]},
{system_stats,[
,
{swap_total,0},
{swap_used,0}]},
{interesting_stats,[]},
{cluster_compatibility_version,1},
{version,[
,
{os_mon,"2.2.6"},
{mnesia,"4.4.19"},
{inets,"5.6"},
{kernel,"2.14.4"},
{sasl,"2.1.9.4"},
{ns_server,"1.8.1-831-rel-enterprise"},
{stdlib,"1.17.4"}]},
{system_arch,"x86_64-unknown-linux-gnu"},
{wall_clock,43524},
{memory_data,{8253337600,7577849856,
}},
{disk_data,[
,
{"/dev/shm",4029948,0},
{"/boot",495844,8},
{"/data",206420664,3}]},
{meminfo,<<"MemTotal: 8059900 kB\nMemFree: 660832 kB\nBuffers: 249580 kB\nCached: 2225680 kB\nSwapCached: 8040 kB\nActive: 5168632 kB\nInactive: 1904272 kB\nActive(anon): 3728872 kB\nInactive(anon): 868772 kB\nActive(file): 1439760 kB\nInactive(file): 1035500 kB\nUnevictable: 0 kB\nMlocked: 0 kB\nSwapTotal: 10289144 kB\nSwapFree: 10229784 kB\nDirty: 1464 kB\nWriteback: 1052 kB\nAnonPages: 4591012 kB\nMapped: 10124 kB\nShmem: 0 kB\nSlab: 204216 kB\nSReclaimable: 151764 kB\nSUnreclaim: 52452 kB\nKernelStack: 2080 kB\nPageTables: 13320 kB\nNFS_Unstable: 0 kB\nBounce: 0 kB\nWritebackTmp: 0 kB\nCommitLimit: 14319092 kB\nCommitted_AS: 5119228 kB\nVmallocTotal: 34359738367 kB\nVmallocUsed: 28396 kB\nVmallocChunk: 34359702008 kB\nHardwareCorrupted: 0 kB\nAnonHugePages: 4286464 kB\nHugePages_Total: 0\nHugePages_Free: 0\nHugePages_Rsvd: 0\nHugePages_Surp: 0\nHugepagesize: 2048 kB\nDirectMap4k: 8180 kB\nDirectMap2M: 8380416 kB\n">>},
{system_memory_data,[
,
{free_swap,10475298816},
{total_swap,10536083456},
{cached_memory,2279096320},
{buffered_memory,255569920},
{free_memory,676691968},
{total_memory,8253337600}]},
{statistics,[{wall_clock,{43509203,2}},
{context_switches,{24564723,0}},
{garbage_collection,{5606266,82441780469,0}},
{io,{{input,28824817921},
}},
{reductions,{21800023662,874154}},
,
{runtime,{4585380,250}}]}]
Data has been lost for 1% of vbuckets in bucket "default".
Failed over 'ns_1@10.3.121.228': ok
The logs from node 206 (master) are attached below.
Output from 206
[root@rvm-0102 ~]# ps -e -o f,s,pid,uid,ppid,pgid,sid,size,stackp,sz,vsz,rss,maj_flt,psr,time,args --forest
F S PID UID PPID PGID SID SZ STACKP SZ VSZ RSS MAJFL PSR TIME COMMAND
1 S 2 0 0 0 0 0 00000000 0 0 0 0 2 00:00:00 [kthreadd]
1 S 3 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [migration/0]
1 S 4 0 2 0 0 0 00000000 0 0 0 0 0 00:00:03 _ [ksoftirqd/0]
1 S 5 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [migration/0]
5 S 6 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [watchdog/0]
1 S 7 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [migration/1]
1 S 8 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [migration/1]
1 S 9 0 2 0 0 0 00000000 0 0 0 0 1 00:00:03 _ [ksoftirqd/1]
5 S 10 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [watchdog/1]
1 S 11 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [migration/2]
1 S 12 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [migration/2]
1 S 13 0 2 0 0 0 00000000 0 0 0 0 2 00:00:02 _ [ksoftirqd/2]
5 S 14 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [watchdog/2]
1 S 15 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [migration/3]
1 S 16 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [migration/3]
1 S 17 0 2 0 0 0 00000000 0 0 0 0 3 00:00:01 _ [ksoftirqd/3]
5 S 18 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [watchdog/3]
5 S 19 0 2 0 0 0 00000000 0 0 0 0 0 00:00:01 _ [events/0]
1 S 20 0 2 0 0 0 00000000 0 0 0 0 1 00:07:26 _ [events/1]
1 S 21 0 2 0 0 0 00000000 0 0 0 0 2 00:00:23 _ [events/2]
1 S 22 0 2 0 0 0 00000000 0 0 0 0 3 00:00:05 _ [events/3]
1 S 23 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [cpuset]
1 S 24 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [khelper]
1 S 25 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [netns]
1 S 26 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [async/mgr]
1 S 27 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [pm]
1 S 28 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [sync_supers]
1 S 29 0 2 0 0 0 00000000 0 0 0 0 3 00:00:10 _ [bdi-default]
1 S 30 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [kintegrityd/0]
1 S 31 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [kintegrityd/1]
1 S 32 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [kintegrityd/2]
1 S 33 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [kintegrityd/3]
1 S 34 0 2 0 0 0 00000000 0 0 0 0 0 00:00:18 _ [kblockd/0]
1 S 35 0 2 0 0 0 00000000 0 0 0 0 1 00:00:03 _ [kblockd/1]
1 S 36 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [kblockd/2]
1 S 37 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [kblockd/3]
1 S 38 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [kacpid]
1 S 39 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [kacpi_notify]
1 S 40 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [kacpi_hotplug]
1 S 41 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [ata/0]
1 S 42 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [ata/1]
1 S 43 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [ata/2]
1 S 44 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [ata/3]
1 S 45 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [ata_aux]
1 S 46 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [ksuspend_usbd]
1 S 47 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [khubd]
5 S 48 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [kseriod]
1 S 49 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [md/0]
1 S 50 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [md/1]
1 S 51 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [md/2]
1 S 52 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [md/3]
1 S 53 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [md_misc/0]
1 S 54 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [md_misc/1]
1 S 55 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [md_misc/2]
1 S 56 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [md_misc/3]
1 S 57 0 2 0 0 0 00000000 0 0 0 0 1 00:00:01 _ [khungtaskd]
1 S 58 0 2 0 0 0 00000000 0 0 0 0 1 00:00:41 _ [kswapd0]
1 S 59 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [ksmd]
1 S 60 0 2 0 0 0 00000000 0 0 0 0 0 00:01:30 _ [khugepaged]
1 S 61 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [aio/0]
1 S 62 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [aio/1]
1 S 63 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [aio/2]
1 S 64 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [aio/3]
1 S 65 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [crypto/0]
1 S 66 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [crypto/1]
1 S 67 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [crypto/2]
1 S 68 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [crypto/3]
1 S 73 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [kthrotld/0]
1 S 74 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [kthrotld/1]
1 S 75 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [kthrotld/2]
1 S 76 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [kthrotld/3]
1 S 78 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [kpsmoused]
1 S 79 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [usbhid_resumer]
1 S 110 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [kstriped]
1 S 229 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [scsi_eh_0]
1 S 233 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [scsi_eh_1]
1 S 373 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [virtio-blk]
1 S 419 0 2 0 0 0 00000000 0 0 0 0 1 00:00:09 _ [kdmflush]
1 S 421 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [kdmflush]
1 S 440 0 2 0 0 0 00000000 0 0 0 0 1 00:02:28 _ [kjournald]
1 S 927 0 2 0 0 0 00000000 0 0 0 0 0 00:00:09 _ [kdmflush]
1 S 961 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [kjournald]
1 S 962 0 2 0 0 0 00000000 0 0 0 0 0 00:02:20 _ [kjournald]
1 S 992 0 2 0 0 0 00000000 0 0 0 0 0 00:00:50 _ [flush-253:0]
1 S 1011 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [kauditd]
1 S 1383 0 2 0 0 0 00000000 0 0 0 0 0 00:00:00 _ [rpciod/0]
1 S 1384 0 2 0 0 0 00000000 0 0 0 0 1 00:00:00 _ [rpciod/1]
1 S 1385 0 2 0 0 0 00000000 0 0 0 0 2 00:00:00 _ [rpciod/2]
1 S 1386 0 2 0 0 0 00000000 0 0 0 0 3 00:00:00 _ [rpciod/3]
4 S 1 0 0 1 1 420 6a937580 4851 19404 1124 143 0 00:00:01 /sbin/init
5 S 532 0 1 532 532 804 f133e290 2780 11120 252 1 0 00:00:00 /sbin/udevd -d
5 S 1827 0 532 532 532 2092 f133e290 3102 12408 348 0 0 00:00:00 _ /sbin/udevd -d
5 S 1828 0 532 532 532 2092 f133e290 3102 12408 348 0 2 00:00:00 _ /sbin/udevd -d
1 S 1225 0 1 1225 1225 604 761bd710 2293 9172 620 102 0 00:00:00 /sbin/dhclient -1 -q -l
5 S 1269 0 1 1269 1269 10552 297f58d0 6922 27688 628 33 3 00:00:03 auditd
5 S 1294 0 1 1291 1012 227824 4b99ffa0 63771 255084 1012 85 0 00:00:01 /sbin/rsyslogd -i /var/
5 S 1323 0 1 1323 1323 304 388a1a90 2301 9204 484 0 1 00:01:52 irqbalance
5 S 1337 32 1 1337 1337 304 e0662ff0 4756 19024 532 1 0 00:00:01 rpcbind
5 S 1355 29 1 1355 1355 328 5db10d80 5800 23200 672 1 1 00:00:00 rpc.statd
1 S 1390 0 1 1390 1390 316 4a0ab1f0 7377 29508 236 1 1 00:00:00 rpc.idmapd
5 S 1481 81 1 1481 1481 76200 d1babe10 24336 97344 1092 140 1 00:00:00 dbus-daemon --system
4 S 1493 0 1 1493 1493 712 8f852d90 47286 189144 896 1 0 00:00:00 cupsd -C /etc/cups/cups
1 S 1518 0 1 1518 1518 268 9a65f890 1033 4132 456 1 0 00:00:00 /usr/sbin/acpid
5 S 1527 68 1 1527 1527 804 333c8190 6295 25180 1512 56 1 00:00:06 hald
0 S 1528 0 1527 1527 1527 296 8c2257b0 4540 18160 636 1 0 00:00:00 _ hald-runner
0 S 1556 0 1528 1527 1527 292 a26ce100 5069 20276 592 3 1 00:00:00 _ hald-addon-inpu
4 S 1582 68 1528 1527 1527 296 a960c330 4465 17860 680 3 1 00:00:00 _ hald-addon-acpi
5 S 1587 0 1 1587 1587 350116 95a693b0 96438 385752 872 1 0 00:00:06 automount --pid-file /v
1 S 1603 0 1 1603 1603 784 c59fc540 1704 6816 272 1 1 00:00:00 /usr/sbin/mcelog --daem
4 S 1690 0 1 1690 1690 596 1a8a5ee0 19669 78676 1036 7 0 00:00:15 /usr/libexec/postfix/ma
4 S 1707 89 1690 1690 1690 704 fca32220 19732 78928 996 0 1 00:00:01 _ qmgr -l -t fifo -u
4 S 30013 89 1690 1690 1690 600 a91bfb10 19689 78756 3212 0 3 00:00:00 _ pickup -l -t fifo -
5 S 1714 0 1 1714 1714 292 620d57b0 29710 118840 728 15 1 00:00:00 /usr/sbin/abrtd
0 S 1722 0 1 1722 1722 268 3f71fb10 2304 9216 564 32 1 00:00:00 abrt-dump-oops -d /var/
1 S 1733 498 1 1733 1733 379996 bb7e8d50 121031 484124 1724 10 0 00:02:21 /usr/sbin/qpidd -data
1 S 1768 0 1 1768 1768 1428 2317df30 29313 117252 832 0 1 00:00:08 crond
5 S 1779 0 1 1779 1779 480 abb95130 5373 21492 304 9 2 00:00:00 /usr/sbin/atd
1 S 1795 0 1 1795 1795 268 2e9885c0 1028 4112 280 10 0 00:00:00 /usr/bin/rhsmcertd 240
1 S 1796 0 1795 1795 1795 268 2e9885c0 1028 4112 276 17 0 00:00:00 _ /usr/bin/rhsmcertd
4 S 1814 0 1 1814 1814 268 be37b670 1029 4116 448 0 1 00:00:00 /sbin/mingetty /dev/tty
4 S 1816 0 1 1816 1816 268 5cef2cc0 1029 4116 448 1 1 00:00:00 /sbin/mingetty /dev/tty
4 S 1818 0 1 1818 1818 268 76b17230 1029 4116 448 1 1 00:00:00 /sbin/mingetty /dev/tty
4 S 1820 0 1 1820 1820 268 aab82040 1029 4116 448 1 1 00:00:00 /sbin/mingetty /dev/tty
4 S 1822 0 1 1822 1822 268 85db8060 1029 4116 448 1 2 00:00:00 /sbin/mingetty /dev/tty
4 S 15600 0 1 1481 1481 4078148 20b12270 1028479 4113916 1384 159 2 00:00:00 /usr/sbin/console-kit-d
4 S 15825 0 1 15825 15825 268 e533f260 1029 4116 448 1 1 00:00:00 /sbin/mingetty /dev/tty
5 S 20972 0 1 20972 20972 608 bebbf120 16017 64068 508 80 1 00:00:00 /usr/sbin/sshd
4 S 30344 0 20972 30344 30344 792 2b561a80 24454 97816 3892 0 0 00:00:00 _ sshd: root@pts/0
4 S 30350 0 30344 30350 30350 424 115337a0 27098 108392 1784 0 1 00:00:00 _ -bash
4 R 30367 0 30350 30367 30350 1192 ff35e010 27073 108292 1028 2 0 00:00:00 _ ps -e -o f,
1 S 24930 497 1 24929 24929 300 fee85420 2720 10880 544 3 1 00:00:00 /opt/couchbase/lib/erla
0 S 24945 497 1 24944 24944 1587308 8216c240 402176 1608704 161872 149 1 02:21:02 /opt/couchbase/lib/erla
0 S 24972 497 24945 24972 24972 292 52646f80 26539 106156 1232 0 0 00:00:00 _ sh -s disksup
0 S 24974 497 24945 24974 24974 264 db473280 1027 4108 544 0 2 00:00:01 _ /opt/couchbase/lib/
0 S 24975 497 24945 24975 24975 264 70e4f170 1026 4104 384 0 1 00:00:00 _ /opt/couchbase/lib/
0 S 24993 497 24945 24993 24993 268 b5906790 2711 10844 480 0 0 00:00:03 _ inet_gethost 4
1 S 24994 497 24993 24993 24993 268 b5906790 2711 10844 360 0 1 00:00:02 | _ inet_gethost 4
1 S 25198 497 24993 24993 24993 268 b5906790 2711 10844 324 0 2 00:00:00 | _ inet_gethost 4
0 S 27579 497 24945 27579 27579 139148 e98ff4c0 37643 150572 2824 1 2 00:00:12 _ /opt/couchbase/bin/
0 S 27580 497 24945 27580 27580 143528 7642b730 43076 172304 72312 7 0 00:00:04 _ /opt/couchbase/bin/
0 S 27581 497 24945 27581 27581 272 b196e5a0 1046 4184 552 1 0 00:00:33 _ portsigar for ns_1@
I am trying to reproduce this error on a smaller setup and will update this ticket as I make progress.
The system logs are attached.