Description
http://qa.hq.northscale.net/view/2.0.1/job/centos-64-2.0-rebalance-regressions/174/consoleFull
Node 10.3.121.93 had no free space left on the disk. The node also contains memcached and MnesiaCore crash files, which are most likely a consequence of the full disk.
Here are the results of my investigation.
1) Disk usage is at 99.5%
ssh root@10.3.121.93
Usage of /: 99.5% of 12.56GB Users logged in: 0
2) The core.memcached file is empty (size 0)
root@ubuntu1104-64:/tmp# ls -la
total 718384
drwxrwxrwt 3 root root 4096 2013-01-29 00:17 .
drwxr-xr-x 21 root root 4096 2012-09-14 06:41 ..
drwxr-xr-x 2 root root 4096 2013-01-26 00:56 backup
-rw------- 1 couchbase couchbase 0 2013-01-28 23:11 core.memcached.21138
-rw-r--r-- 1 root root 120658794 2013-01-26 00:37 couchbase-server-enterprise_x86_64_2.0.1-144-rel.deb
-rw------- 1 root root 9175842 2013-01-28 23:41 tmp3weBC9
-rw------- 1 root root 1506488 2013-01-28 23:41 tmp93Ggx4
-rw------- 1 root root 86510475 2013-01-28 23:41 tmp9XTa06
-rw------- 1 root root 85963020 2013-01-28 23:41 tmpAnP_Fx
-rw------- 1 root root 6706 2013-01-28 23:41 tmpEiKpfd
-rw------- 1 root root 229 2013-01-28 23:41 tmpGBeCRO
-rw------- 1 root root 207196659 2013-01-28 23:41 tmpHajLsi
-rw------- 1 root root 983241 2013-01-28 23:41 tmpJrqtvB
-rw------- 1 root root 16505 2013-01-28 23:41 tmpLi_iyA
-rw------- 1 root root 199220455 2013-01-28 23:41 tmpm9GRiZ
-rw------- 1 root root 24171250 2013-01-28 23:41 tmpMwNzhf
-rw------- 1 root root 213 2013-01-28 23:41 tmpR05wqo
-rw------- 1 root root 223 2013-01-28 23:41 tmpTFQmyf
3) The MnesiaCore file is also empty (size 0)
root@ubuntu1104-64:/opt/couchbase/var/lib/couchbase# ls -la
total 48
drwxr-xr-x 7 couchbase couchbase 4096 2013-01-28 23:12 .
drwxr-xr-x 5 couchbase couchbase 4096 2013-01-28 13:57 ..
drwxr-xr-x 2 couchbase couchbase 4096 2013-01-28 23:11 config
-rw-r--r-- 1 couchbase couchbase 17 2013-01-28 23:07 couchbase-server.cookie
-rw-r--r-- 1 couchbase couchbase 17 2013-01-28 13:58 couchbase-server.node
-rw-r--r-- 1 couchbase root 6 2013-01-28 22:30 couchbase-server.pid
drwxr-xr-x 6 couchbase couchbase 4096 2013-01-28 22:31 data
-rw-r--r-- 1 couchbase couchbase 11 2013-01-28 13:58 ip
-rw-r--r-- 1 couchbase couchbase 0 2013-01-26 00:30 ip_start
drwxr-xr-x 2 couchbase couchbase 4096 2013-01-28 23:11 logs
drwxr-xr-x 2 couchbase couchbase 4096 2013-01-28 23:11 mnesia
-rw-r--r-- 1 couchbase couchbase 0 2013-01-28 23:12 MnesiaCore.ns_1@10.3.121.93_1359_443570_644400
-rw-r--r-- 1 couchbase couchbase 344 2013-01-28 23:10 remote_clusters_cache
drwxr-xr-x 2 couchbase couchbase 4096 2013-01-28 13:58 tmp
4) The server status is reported as running, but I can't use the Web console
root@ubuntu1104-64:/opt/couchbase/var/lib/couchbase# /etc/init.d/couchbase-server status
- couchbase-server is running
root@ubuntu1104-64:/opt/couchbase/var/lib/couchbase# top
top - 02:10:03 up 133 days, 3:41, 1 user, load average: 1.22, 1.46, 1.32
Tasks: 94 total, 1 running, 93 sleeping, 0 stopped, 0 zombie
Cpu(s): 1.1%us, 27.3%sy, 0.0%ni, 71.5%id, 0.0%wa, 0.0%hi, 0.1%si, 0.0%st
Mem: 4058664k total, 2337404k used, 1721260k free, 152988k buffers
Swap: 1048572k total, 5592k used, 1042980k free, 1399428k cached
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
1171 couchbas 20 0 242m 133m 2788 S 98 3.4 149:42.27 memcached
3 root 20 0 0 0 0 S 1 0.0 23:33.84 ksoftirqd/0
1922 root 20 0 0 0 0 S 0 0.0 0:00.93 kworker/u:1
1976 root 20 0 19368 1296 956 R 0 0.0 0:00.03 top
16429 couchbas 20 0 532m 429m 16m S 0 10.8 665:51.71 beam.smp
1 root 20 0 23880 780 352 S 0 0.0 0:07.26 init
2 root 20 0 0 0 0 S 0 0.0 0:02.27 kthreadd
5) Removed old core files to free up disk space
root@ubuntu1104-64:/tmp/backup# rm -rf core.memcached.28835 core.memcached.4751 core.memcached.8944
root@ubuntu1104-64:/tmp/backup# df
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/mapper/ubuntu1104--64-root
13167336 10526264 1972196 85% /
none 2021356 172 2021184 1% /dev
none 2029332 0 2029332 0% /dev/shm
none 2029332 44 2029288 1% /var/run
none 2029332 0 2029332 0% /var/lock
/dev/sda1 233191 88417 132333 41% /boot
6) Trying to restart the server fails: "Failed to start couchbase-server: timed out" on force-reload, then "Failed to stop couchbase-server" on stop
root@ubuntu1104-64:/opt/couchbase/var/lib/couchbase# /etc/init.d/couchbase-server
Usage: /etc/init.d/couchbase-server
root@ubuntu1104-64:/opt/couchbase/var/lib/couchbase# /etc/init.d/couchbase-server status
- couchbase-server is running
root@ubuntu1104-64:/opt/couchbase/var/lib/couchbase# /etc/init.d/couchbase-server force-reload
=INFO REPORT==== 29-Jan-2013::02:21:10 ===
Initiated server shutdown** at node ns_1@10.3.121.93 **
=INFO REPORT==== 29-Jan-2013::02:21:10 ===
Stopped ns_server application** at node ns_1@10.3.121.93 **
- Failed to start couchbase-server: timed out
root@ubuntu1104-64:/opt/couchbase/var/lib/couchbase# /etc/init.d/couchbase-server stop
NOTE: shutdown failed {badrpc,nodedown} - Failed to stop couchbase-server
7) Only after a few minutes did the server return to a working condition
root@ubuntu1104-64:/tmp# /opt/couchbase/bin/cbcollect_info 10.3.121.93-diag.zip
uname (uname -a) - OK
Directory structure (ls -lR '/opt/couchbase' /opt/membase /var/membase /etc/opt/membase) - Exit code 2
Database directory structure (ls -lR /opt/couchbase/var/lib/couchbase/data) - OK
Directory structure membase - previous versions (ls -lR /opt/membase /var/membase /var/opt/membase /etc/opt/membase) - Exit code 2
Process list snapshot (top -H -n 1) - OK
Process list (ps -AwwL -o user,pid,lwp,ppid,nlwp,pcpu,pri,nice,vsize,rss,tty,stat,wchan:12,start,bsdtime,command) - OK
Swap configuration (free -t) - OK
Swap configuration (swapon -s) - OK
Kernel modules (lsmod) - OK
Distro version (cat /etc/redhat-release) - Exit code 1
Distro version (lsb_release -a) - OK
Installed software (rpm -qa) - Exit code 127
Installed software (COLUMNS=300 dpkg -l) - OK
Extended iostat (iostat -x -p ALL 1 10 || iostat -x 1 10) - Exit code 127
Process usage (export TERM=linux; top -b -n1 | egrep 'moxi|memcached|vbucketmigrator|CPU|load|Mem:|Swap:|Cpu(s)') - OK
Core dump settings (find /proc/sys/kernel -type f -name 'core' -print -exec cat '{}' ';') - OK
netstat -nap (netstat -nap) - OK
relevant lsof output (lsof -n | grep 'moxi|memcached|vbucketmigrator|beam') - OK
Network configuration (ifconfig -a) - OK
Taking sample 2 after 10.000000 seconds -
OK
Network status (netstat -an) - OK
Network routing table (netstat -rn) - OK
Arp cache (arp -na) - OK
Filesystem (df -ha) - OK
System activity reporter (sar 1 10) - Exit code 127
System paging activity (vmstat 1 10) - OK
System uptime (uptime) - OK
couchbase user definition (getent passwd couchbase) - OK
couchbase user limits (su couchbase -c "ulimit -a") - OK
membase user definition (getent passwd membase) - OK
couchbase user limits (su couchbase -c "ulimit -a") - OK
membase user limits (su membase -c "ulimit -a") - OK
Interrupt status (intrstat 1 10) - Exit code 127
Processor status (mpstat 1 10) - Exit code 127
System log (cat /var/adm/messages) - Exit code 1
System log (cat /var/log/syslog) - OK
System log (cat /var/log/messages) - Exit code 1
Version file (cat '/opt/couchbase/VERSION.txt') - OK
Manifest file (cat '/opt/couchbase/manifest.txt') - OK
Manifest file (cat '/opt/couchbase/manifest.xml') - OK
Memcached logs (cd '/opt/couchbase'/var/lib/couchbase/logs && for file in $(ls -tr memcached.log.*); do cat "$file"; done) - OK
Ini files (cd '/opt/couchbase'/etc && for file in $(find . -type f -name '*.ini'); do echo -e "
File: ${file}
";cat "$file"; done) - OK
Kernel log buffer (dmesg) - OK
couchbase config ('/opt/couchbase/bin'/escript '/opt/couchbase/bin'/cbdump-config '/opt/couchbase/var/lib/couchbase/config/config.dat') - OK
couchbase logs (debug) (cbbrowse_logs) - OK
couchbase logs (info) (cbbrowse_logs info) - OK
couchbase logs (error) (cbbrowse_logs error) - OK
couchbase logs (couchdb) (cbbrowse_logs couchdb) - OK
couchbase logs (xdcr) (cbbrowse_logs xdcr) - OK
couchbase logs (xdcr_errors) (cbbrowse_logs xdcr_errors) - OK
couchbase logs (views) (cbbrowse_logs views) - OK
couchbase logs (mapreduce errors) (cbbrowse_logs mapreduce_errors) - OK
couchbase logs (stats) (cbbrowse_logs stats) - OK
memcached stats all (cbstats -a 127.0.0.1:11210 all -b _admin -p _admin) - OK
memcached stats checkpoint (cbstats -a 127.0.0.1:11210 checkpoint -b _admin -p _admin) - OK
memcached stats config (cbstats -a 127.0.0.1:11210 config -b _admin -p _admin) - OK
memcached stats dispatcher (cbstats -a 127.0.0.1:11210 dispatcher logs -b _admin -p _admin) - OK
memcached stats hash (cbstats -a 127.0.0.1:11210 hash detail -b _admin -p _admin) - OK
memcached stats klog (cbstats -a 127.0.0.1:11210 klog -b _admin -p _admin) - OK
memcached stats kvstore (cbstats -a 127.0.0.1:11210 kvstore -b _admin -p _admin) - OK
memcached stats kvtimings (cbstats -a 127.0.0.1:11210 kvtimings -b _admin -p _admin) - OK
memcached stats tap (cbstats -a 127.0.0.1:11210 tap -b _admin -p _admin) - OK
memcached stats tapagg (cbstats -a 127.0.0.1:11210 tapagg -b _admin -p _admin) - OK
memcached stats timings (cbstats -a 127.0.0.1:11210 timings -b _admin -p _admin) - OK
memcached memory stats (cbstats -a 127.0.0.1:11210 raw memory -b _admin -p _admin) - OK
memcached allocator stats (cbstats -a 127.0.0.1:11210 raw allocator -b _admin -p _admin) - OK
memcached stats prev-vbucket (cbstats -a 127.0.0.1:11210 prev-vbucket -b _admin -p _admin) - OK
memcached stats vbucket (cbstats -a 127.0.0.1:11210 vbucket -b _admin -p _admin) - OK
memcached stats vbucket details (cbstats -a 127.0.0.1:11210 vbucket-details -b _admin -p _admin) - OK
memcached stats warmup (cbstats -a 127.0.0.1:11210 warmup -b _admin -p _admin) - OK
couchbase diags (curl -sS "http://Administrator:password@127.0.0.1:8091/diag?noLogs=1") - OK
couchbase design docs - bucket:bucket-1 (couch_dbdump "/opt/couchbase/var/lib/couchbase/data/bucket-1/master.couch.8") - Exit code 1
couchbase design docs - bucket:bucket-2 (couch_dbdump "/opt/couchbase/var/lib/couchbase/data/bucket-2/master.couch.12") - OK
couchbase design docs - bucket:bucket-0 (couch_dbdump "/opt/couchbase/var/lib/couchbase/data/bucket-0/master.couch.7") - Exit code 1
root@ubuntu1104-64:/tmp# ls -la
total 164408
drwxrwxrwt 3 root root 4096 2013-01-29 02:44 .
drwxr-xr-x 21 root root 4096 2012-09-14 06:41 ..
-rw-r--r-- 1 root root 47670910 2013-01-29 02:44 10.3.121.98-1292013-154-diag.zip
drwxr-xr-x 2 root root 4096 2013-01-29 02:17 backup
-rw------- 1 couchbase couchbase 0 2013-01-28 23:11 core.memcached.21138
-rw-r--r-- 1 root root 120658794 2013-01-26 00:37 couchbase-server-enterprise_x86_64_2.0.1-144-rel.deb
root@ubuntu1104-64:/tmp# df
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/mapper/ubuntu1104--64-root
13167336 6669480 5828980 54% /
none 2021356 172 2021184 1% /dev
none 2029332 0 2029332 0% /dev/shm
none 2029332 44 2029288 1% /var/run
none 2029332 0 2029332 0% /var/lock
/dev/sda1 233191 88417 132333 41% /boot
Farshid, I think our tests should cover scenarios like this (a node running out of disk space), at least superficially.