Couchbase Server — MB-7762

[XDCR + views] we should not try and failover a node when rebalance is already in progress

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: 2.0.1
    • Fix Version/s: 2.1.0
    • Component/s: couchbase-bucket, ns_server
    • Security Level: Public
    • Labels:
      None
    • Environment:
      2.0.1-156-rel
      CentOS
      7 : 5 (source : destination nodes)
      Source: 7 nodes, all 4-core, 30G SSDs
      Destination: 5 nodes
      3 × 4-core, 30G SSDs
      2 × 8-core, 30G SSDs
      default: source --> destination
      saslbucket: source --> destination

      Description

      Current status on buckets:
      default ~80M items
      saslbucket ~65M items

      • 1 ongoing replication for each of the buckets
      • Mixed load (creates-updates-deletes-expirations) running on both nodes
      • 1 Production view under a design doc for default on the source cluster
      • Continuously running queries on the view as well

      Live cluster: (Uptime >24 hours)
      Source: http://10.6.2.37:8091
      Destination: http://10.6.2.45:8091

      • Rebalance operation running:
        Starting rebalance, KeepNodes = ['ns_1@10.6.2.37','ns_1@10.6.2.38',
        'ns_1@10.6.2.39','ns_1@10.6.2.40',
        'ns_1@10.6.2.42'], EjectNodes = ['ns_1@10.6.2.43',
        'ns_1@10.6.2.44']
      • Rebalance operation very slow because:
        Indexing default/_design/d1
        Compacting bucket default
        Compacting bucket saslbucket
        Compacting bucket saslbucket

      While rebalance is running, saw the following messages at least a couple of times:
      "Could not auto-failover node ('ns_1@10.6.2.38'). There was at least another node down."
      "Could not automatically failover node 'ns_1@10.6.2.38' because I think rebalance is running"

      And then:

      • Couchbase server on Node 10.6.2.39 goes down
      • Rebalance exits

      Rebalance exited with reason {bulk_set_vbucket_state_failed,
        [{'ns_1@10.6.2.39',
          {'EXIT',
           {{nodedown,'ns_1@10.6.2.39'},
            {gen_server,call,
             [{'janitor_agent-saslbucket','ns_1@10.6.2.39'},
              {if_rebalance,<0.7863.100>,
               {update_vbucket_state,918,replica,undefined,undefined}},
              infinity]}}}}]}

      • Node 10.6.2.39 gets auto-failed-over, with the following log entry:

      Node ('ns_1@10.6.2.39') was automatically failovered.
      [down,stale,
       {last_heard,{1360,973486,296813}},
       {outgoing_replications_safeness_level,
        [{"saslbucket",stale},{"default",green}]},
       {incoming_replications_conf_hashes,
        [{"saslbucket",
          [{'ns_1@10.6.2.37',7942524},
           {'ns_1@10.6.2.38',130938511},
           {'ns_1@10.6.2.40',131674787},
           {'ns_1@10.6.2.42',99738451},
           {'ns_1@10.6.2.43',85152916},
           {'ns_1@10.6.2.44',119351330}]},
         {"default",
          [{'ns_1@10.6.2.37',80548424},
           {'ns_1@10.6.2.38',99434638},
           {'ns_1@10.6.2.40',41181054},
           {'ns_1@10.6.2.42',75330863},
           {'ns_1@10.6.2.43',104165652},
           {'ns_1@10.6.2.44',55133429}]}]},
       {active_buckets,["saslbucket","default"]},
       {ready_buckets,["saslbucket","default"]},
       {local_tasks,
        [[{pid,<<"<0.15978.88>">>},
          {changes_done,519741},
          {design_documents,[<<"_design/d1">>]},
          {indexer_type,replica},
          {initial_build,false},
          {progress,100},
          {set,<<"default">>},
          {signature,<<"cf1ae2783bd44c07b46c3cac242842a5">>},
          {started_on,1360953239},
          {total_changes,519741},
          {type,indexer},
          {updated_on,1360972465}],
         [{pid,<<"<0.21626.100>">>},
          {changes_done,0},
          {design_documents,[<<"_design/d1">>]},
          {indexer_type,main},
          {initial_build,false},
          {progress,0},
          {set,<<"default">>},
          {signature,<<"cf1ae2783bd44c07b46c3cac242842a5">>},
          {started_on,1360966193},
          {total_changes,5784547},
          {type,indexer},
          {updated_on,1360966193}],
         [{pid,<<"<0.20399.103>">>},
          {bucket,<<"default">>},
          {original_target,{[{type,bucket}]}},
          {progress,52},
          {started_on,1360973268},
          {total_vbuckets,294},
          {trigger_type,scheduled},
          {type,bucket_compaction},
          {updated_on,1360973367},
          {vbuckets_done,153}],
         [{type,xdcr},
          {id,<<"e106ec063395f02c97dbba63a247cfad/saslbucket/saslbucket">>},
          {errors,[]},
          {changes_left,7131130},
          {docs_checked,24571336},
          {docs_written,19190871},
          {data_replicated,14022432192},
          {active_vbreps,32},
          {waiting_vbreps,144},
          {time_working,2828014},
          {time_committing,4754},
          {num_checkpoints,2417},
          {num_failedckpts,62},
          {docs_rep_queue,90071},
          {size_rep_queue,9075615}],
         [{type,xdcr},
          {id,<<"e106ec063395f02c97dbba63a247cfad/default/default">>},
          {errors,[]},
          {changes_left,3699342},
          {docs_checked,26077879},
          {docs_written,22400220},
          {data_replicated,16439871268},
          {active_vbreps,32},
          {waiting_vbreps,114},
          {time_working,3222330},
          {time_committing,5285},
          {num_checkpoints,2875},
          {num_failedckpts,75},
          {docs_rep_queue,38011},
          {size_rep_queue,3849309}]]},
       {memory,
        [{total,1467390560},
         {processes,1242860128},
         {processes_used,1242396472},
         {system,224530432},
         {atom,1506713},
         {atom_used,1501102},
         {binary,137844480},
         {code,15673585},
         {ets,57752440}]},
       {system_memory_data,
        [{system_total_memory,32745521152},
         {free_swap,2119372800},
         {total_swap,5368700928},
         {cached_memory,13826760704},
         {buffered_memory,14364672},
         {free_memory,331780096},
         {total_memory,32745521152}]},
       {node_storage_conf,
        [{db_path,"/data"},
         {index_path,"/index"}]},
       {statistics,
        [{wall_clock,{169038234,13826}},
         {context_switches,{1855663220,0}},
         {garbage_collection,{176172403,3879508106118,0}},
         {io,{{input,2517880361702},{output,239493307889}}},
         {reductions,{558661606055,22017720}},
         {run_queue,118},
         {runtime,{149380120,7780}}]},
       {system_stats,
        [{cpu_utilization_rate,50.67178502879079},
         {swap_total,5368700928},
         {swap_used,3250393088}]},
       {interesting_stats,
        [{couch_docs_actual_disk_size,70063578847},
         {couch_docs_data_size,28333503657},
         {couch_views_actual_disk_size,2013908118},
         {couch_views_data_size,535661359},
         {curr_items,23294623},
         {curr_items_tot,46790346},
         {mem_used,13864234792},
         {vb_replica_curr_items,23495723}]},
       {cluster_compatibility_version,131072},
       {version,
        [{public_key,"0.13"},
         {lhttpc,"1.3.0"},
         {ale,"8cffe61"},
         {os_mon,"2.2.7"},
         {couch_set_view,"1.2.0a-c6e7157-git"},
         {mnesia,"4.5"},
         {inets,"5.7.1"},
         {couch,"1.2.0a-c6e7157-git"},
         {mapreduce,"1.0.0"},
         {couch_index_merger,"1.2.0a-c6e7157-git"},
         {kernel,"2.14.5"},
         {crypto,"2.0.4"},
         {ssl,"4.1.6"},
         {sasl,"2.1.10"},
         {couch_view_parser,"1.0.0"},
         {ns_server,"2.0.1-156-rel-enterprise"},
         {mochiweb,"1.4.1"},
         {oauth,"7d85d3ef"},
         {stdlib,"1.17.5"}]},
       {supported_compat_version,[2,0]},
       {system_arch,"x86_64-unknown-linux-gnu"},
       {wall_clock,169038},
       {memory_data,{32745521152,32589651968,{<18650.21626.100>,30409072}}},
       {disk_data,
        [{"/",49064776,16},
         {"/dev/shm",15989024,0},
         {"/boot",495844,7},
         {"/data",243588516,30},
         {"/index",123860788,2}]},
       {meminfo, <<"MemTotal: 31978048 kB\nMemFree: 152876 kB\nBuffers: 14232 kB\nCached: 13468780 kB\nSwapCached: 284200 kB\nActive: 20652152 kB\nInactive: 7798348 kB\nActive(anon): 13935296 kB\nInactive(anon): 1074432 kB\nActive(file): 6716856 kB\nInactive(file): 6723916 kB\nUnevictable: 2596820 kB\nMlocked: 2596912 kB\nSwapTotal: 5242872 kB\nSwapFree: 2068752 kB\nDirty: 4864 kB\nWriteback: 0 kB\nAnonPages: 17289872 kB\nMapped: 45316 kB\nShmem: 0 kB\nSlab: 398332 kB\nSReclaimable: 343232 kB\nSUnreclaim: 55100 kB\nKernelStack: 1560 kB\nPageTables: 43464 kB\nNFS_Unstable: 0 kB\nBounce: 0 kB\nWritebackTmp: 0 kB\nCommitLimit: 21231896 kB\nCommitted_AS: 20903176 kB\nVmallocTotal: 34359738367 kB\nVmallocUsed: 64624 kB\nVmallocChunk: 34359662900 kB\nHardwareCorrupted: 0 kB\nAnonHugePages: 0 kB\nHugePages_Total: 0\nHugePages_Free: 0\nHugePages_Rsvd: 0\nHugePages_Surp: 0\nHugepagesize: 2048 kB\nDirectMap4k: 32768000 kB\nDirectMap2M: 0 kB\n">>}]

      • No cores on 10.6.2.39

      18716 couchbas 20 0 4367m 1.9g 40m S 99.7 6.3 4314:33 beam.smp
      19115 couchbas 20 0 17.4g 14g 2616 S 12.6 47.0 714:26.47 memcached

      • Unable to generate diagnostic report on the source cluster.

        Activity

        chiyoung Chiyoung Seo added a comment -

        Just for the bug distributions in the engine team
        mikew Mike Wiederhold added a comment -

        Abhinav,

        This ticket has no cores and no logs. Without this information there's not much I can do. Let's talk tomorrow if you think there's an issue here and don't have any logs or cores.
        ketaki Ketaki Gangal added a comment -

        Hi Abhinav,

        Can you run cbcollect_info on the source/destination clusters?

        I noticed that you've mentioned "Unable to generate diagnostic report on the source cluster."

        -Ketaki
        abhinav Abhinav Dangeti added a comment -

        cbcollect_info:
        source master (10.6.2.37):
        https://s3.amazonaws.com/bugdb/MB-7762/source.zip
        destination master (10.6.2.45):
        https://s3.amazonaws.com/bugdb/MB-7762/destination.zip

        Noticed that the node that went down came back up, but it is unable to access the bucket information (as it was failed over).
        cbcollect_info:
        downed node on source cluster (10.6.2.39):
        https://s3.amazonaws.com/bugdb/MB-7762/downed_node.zip
        abhinav Abhinav Dangeti added a comment -

        Regarding this:
        {
        While rebalance is running, saw the following messages at least a couple of times:
        "Could not auto-failover node ('ns_1@10.6.2.38'). There was at least another node down."
        "Could not automatically failover node 'ns_1@10.6.2.38' because I think rebalance is running"
        }

          • autofailover shouldn't even try failing over a node when rebalance is running.
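        The ordering change being asked for above can be sketched as follows. This is a minimal illustration only, in Python rather than the actual ns_server Erlang code; the names `AutoFailover`, `ClusterState`, and `tick` are hypothetical and not part of Couchbase Server:

        ```python
        # Sketch: evaluate the rebalance guard *before* any per-node failover
        # checks, so the monitor never even attempts (or logs an attempt at)
        # auto-failover while a rebalance is in progress.
        class ClusterState:
            def __init__(self, rebalance_running, down_nodes):
                self.rebalance_running = rebalance_running
                self.down_nodes = down_nodes

        class AutoFailover:
            def tick(self, state):
                """Return (node_to_failover, reason). node is None if no action."""
                # Guard first: rebalance in progress short-circuits everything.
                if state.rebalance_running:
                    return None, "rebalance in progress; auto-failover skipped"
                if len(state.down_nodes) > 1:
                    return None, "more than one node down; not safe to auto-failover"
                if len(state.down_nodes) == 1:
                    return state.down_nodes[0], "auto-failover candidate"
                return None, "all nodes healthy"

        # In the scenario from this ticket, the guard fires and no
        # per-node "could not failover" message is produced:
        af = AutoFailover()
        node, reason = af.tick(ClusterState(True, ["ns_1@10.6.2.38"]))
        # node is None; reason mentions the running rebalance
        ```

        The point of the sketch is purely the check ordering: the pre-fix behaviour corresponds to evaluating the down-node conditions first and only then discovering the rebalance, which is what produced the confusing pair of messages above.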
        alkondratenko Aleksey Kondratenko (Inactive) added a comment -

        We already fixed it. Should be part of 2.1.0.

          People

          • Assignee:
            alkondratenko Aleksey Kondratenko (Inactive)
          • Reporter:
            abhinav Abhinav Dangeti
          • Votes: 0
          • Watchers: 5

            Dates

            • Created:
              Updated:
              Resolved:

              Gerrit Reviews

              There are no open Gerrit changes