2.23.105.211'], EjectNodes = [], Failed over and being ejected nodes = []; no delta recovery nodes; Operation Id = b5d04fbb2b831e12e043cdf54a46a22f
2021-03-31T23:47:33.837-07:00, ns_orchestrator:0:critical:message(ns_1@172.23.105.155) - Rebalance exited with reason {service_rebalance_failed,inde
x,
{agent_died,<30215.5724.0>,
{noproc,
{gen_server,call,
[<30215.6088.0>,
{call,"ServiceAPI.GetTaskList",
#Fun<json_rpc_connection.0.44122352>},
60000]}}}}.
Rebalance Operation Id = b5d04fbb2b831e12e043cdf54a46a22f
Ashwin Govindarajulu, please change the test to wait for rebalance to finish before setting the storage mode as done in MB-39859.
Deepkaran Salooja
added a comment - This is the same problem as MB-39859 .
Indexer gets a storage mode setting change and restarts
2021-03-31T23:47:33.824-07:00 [Info] New settings received:
{"indexer.settings.compaction.days_of_week":"Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday","indexer.settings.rebalance.redistribute_inde
xes":false,"indexer.settings.compaction.interval":"00:00,00:00","indexer.settings.compaction.compaction_mode":"circular","indexer.settings.log_level
":"info","indexer.settings.persisted_snapshot.interval":5000,"indexer.settings.compaction.min_frag":30,"indexer.settings.inmemory_snapshot.interval"
:200,"indexer.settings.max_cpu_percent":0,"indexer.settings.storage_mode":"plasma","indexer.settings.recovery.max_rollbacks":2,"indexer.settings.num
_replica":0,"indexer.settings.memory_quota":1073741824,"indexer.settings.compaction.abort_exceed_interval":false}
2021-03-31T23:47:33.824-07:00 [Info] Setting maxcpus = 4
2021-03-31T23:47:33.824-07:00 [Info] Setting log level to Info
2021-03-31T23:47:33.828-07:00 [Info] Indexer::updateStorageMode Try setting storage mode to plasma
2021-03-31T23:47:33.828-07:00 [Info] Indexer::updateStorageMode Storage Mode Set plasma. Restarting indexer
Initializing write barrier = 8000
ns-server fails the rebalance due to agent_died/noproc
2021-03-31T23:47:27.235-07:00, ns_orchestrator:0:info:message(ns_1@172.23.105.155) - Starting rebalance, KeepNodes = ['ns_1@172.23.105.155','ns_1@17
2.23.105.211'], EjectNodes = [], Failed over and being ejected nodes = []; no delta recovery nodes; Operation Id = b5d04fbb2b831e12e043cdf54a46a22f
2021-03-31T23:47:33.837-07:00, ns_orchestrator:0:critical:message(ns_1@172.23.105.155) - Rebalance exited with reason {service_rebalance_failed,inde
x,
{agent_died,<30215.5724.0>,
{noproc,
{gen_server,call,
[<30215.6088.0>,
{call,"ServiceAPI.GetTaskList",
#Fun<json_rpc_connection.0.44122352>},
60000]}}}}.
Rebalance Operation Id = b5d04fbb2b831e12e043cdf54a46a22f
Ashwin Govindarajulu , please change the test to wait for rebalance to finish before setting the storage mode as done in MB-39859 .
Ashwin Govindarajulu, please collect the logs and file a new ticket. It could be a different issue. Lot of issues can manifest as a rebalance failure.
Deepkaran Salooja
added a comment - Ashwin Govindarajulu , please collect the logs and file a new ticket. It could be a different issue. Lot of issues can manifest as a rebalance failure.
This scenario was hit purely during manual setup of cluster. Updated the same is the description.
Reopening the ticket based on the discussion with Deepkaran Salooja.
Ashwin Govindarajulu
added a comment - This scenario was hit purely during manual setup of cluster. Updated the same is the description.
Reopening the ticket based on the discussion with Deepkaran Salooja .
2. As part of warmup, indexer tries to connect to projector, but it gets "connection refused"
2021-03-31T23:47:28.773-07:00 [Error] KVSender::closeMutationStream, MAINT_STREAM Error from Projector Post http://172.23.105.211:9999/adminport/shutdownTopicRequest: dial tcp 172.23.105.211:9999: connect: connection refused
3. Indexer waits for 5 seconds for retry
2021-03-31T23:47:28.773-07:00 [Warn] Indexer::closeAllStreams Stream MAINT_STREAM Projector health check needed, indexer can not proceed, Error received Post http://172.23.105.211:9999/adminport/shutdownTopicRequest: dial tcp 172.23.105.211:9999: connect: connection refused. Retrying (1).
4. The storage mode setting change notification comes in during this time, but it gets dropped as Indexer is yet to reach Active state
2021-03-31T23:47:33.719-07:00 [Info] SettingsMgr:: Dropped request /indexing/settings/config {"indexer.settings.compaction.days_of_week":"Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday","indexer.settings.rebalance.redistribute_indexes":false,"indexer.settings.compaction.interval":"00:00,00:00","indexer.settings.compaction.compaction_mode":"circular","indexer.settings.log_level":"info","indexer.settings.persisted_snapshot.interval":5000,"indexer.settings.compaction.min_frag":30,"indexer.settings.inmemory_snapshot.interval":200,"indexer.settings.max_cpu_percent":0,"indexer.settings.storage_mode":"","indexer.settings.recovery.max_rollbacks":2,"indexer.settings.num_replica":0,"indexer.settings.memory_quota":1073741824,"indexer.settings.compaction.abort_exceed_interval":false}. Any setting change will get applied once Indexer is ready.
2021-03-31T23:47:33.719-07:00 [Info] SettingsMgr:: Dropped request /indexing/settings/config {"indexer.settings.rebalance.redistribute_indexes":false,"indexer.settings.compaction.days_of_week":"Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday","indexer.settings.compaction.interval":"00:00,00:00","indexer.settings.persisted_snapshot.interval":5000,"indexer.settings.log_level":"info","indexer.settings.compaction.compaction_mode":"circular","indexer.settings.compaction.min_frag":30,"indexer.settings.inmemory_snapshot.interval":200,"indexer.settings.max_cpu_percent":0,"indexer.settings.storage_mode":"plasma","indexer.settings.recovery.max_rollbacks":2,"indexer.settings.memory_quota":1073741824,"indexer.settings.num_replica":0,"indexer.settings.compaction.abort_exceed_interval":false}. Any setting change will get applied once Indexer is ready.
2021-03-31T23:47:33.723-07:00 [Info] SettingsMgr:: Dropped request /indexing/settings/config {"indexer.settings.compaction.days_of_week":"Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday","indexer.settings.rebalance.redistribute_indexes":false,"indexer.settings.compaction.interval":"00:00,00:00","indexer.settings.compaction.compaction_mode":"circular","indexer.settings.log_level":"info","indexer.settings.persisted_snapshot.interval":5000,"indexer.settings.compaction.min_frag":30,"indexer.settings.inmemory_snapshot.interval":200,"indexer.settings.max_cpu_percent":0,"indexer.settings.storage_mode":"plasma","indexer.settings.recovery.max_rollbacks":2,"indexer.settings.num_replica":0,"indexer.settings.memory_quota":1073741824,"indexer.settings.compaction.abort_exceed_interval":false}. Any setting change will get applied once Indexer is ready.
5. Retry with projector after 5 seconds works fine and indexer proceeds with rest of the warmup. This includes registering with ns-server
Projector might have taken 1sec before it was ready to accept TCP connections, leading to "connection refused".
On co-located setups, with kv+index service together on a node, this failure is possible.
Deepkaran Salooja
added a comment - 1. Indexer on node 172.23.105.211 gets started
2021-03-31T23:47:27.601-07:00 [Info] Indexer started with command line: [/opt/couchbase/bin/indexer -adminPort=9100 -scanPort=9101 -httpPort=9102 -streamInitPort=9103 -streamCatchupPort=9104 -streamMaintPort=9105 --httpsPort=19102 --certFile=/opt/couchbase/var/lib/couchbase/config/memcached-cert.pem --keyFile=/opt/couchbase/var/lib/couchbase/config/memcached-key.pem -vbuckets=1024 -cluster=127.0.0.1:8091 -storageDir=/opt/couchbase/var/lib/couchbase/data/@2i -diagDir=/opt/couchbase/var/lib/couchbase/crash -logDir=/opt/couchbase/var/lib/couchbase/logs -nodeUUID=945900b90b8db3c17cb59af21f15545b -ipv6=false -isEnterprise=true]
2. As part of warmup, indexer tries to connect to projector, but it gets "connection refused"
2021-03-31T23:47:28.773-07:00 [Error] KVSender::closeMutationStream, MAINT_STREAM Error from Projector Post http://172.23.105.211:9999/adminport/shutdownTopicRequest: dial tcp 172.23.105.211:9999: connect: connection refused
3. Indexer waits for 5 seconds for retry
2021-03-31T23:47:28.773-07:00 [Warn] Indexer::closeAllStreams Stream MAINT_STREAM Projector health check needed, indexer can not proceed, Error received Post http://172.23.105.211:9999/adminport/shutdownTopicRequest: dial tcp 172.23.105.211:9999: connect: connection refused. Retrying (1).
4. The storage mode setting change notification comes in during this time, but it gets dropped as Indexer is yet to reach Active state
2021-03-31T23:47:33.719-07:00 [Info] SettingsMgr:: Dropped request /indexing/settings/config {"indexer.settings.compaction.days_of_week":"Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday","indexer.settings.rebalance.redistribute_indexes":false,"indexer.settings.compaction.interval":"00:00,00:00","indexer.settings.compaction.compaction_mode":"circular","indexer.settings.log_level":"info","indexer.settings.persisted_snapshot.interval":5000,"indexer.settings.compaction.min_frag":30,"indexer.settings.inmemory_snapshot.interval":200,"indexer.settings.max_cpu_percent":0,"indexer.settings.storage_mode":"","indexer.settings.recovery.max_rollbacks":2,"indexer.settings.num_replica":0,"indexer.settings.memory_quota":1073741824,"indexer.settings.compaction.abort_exceed_interval":false}. Any setting change will get applied once Indexer is ready.
2021-03-31T23:47:33.719-07:00 [Info] SettingsMgr:: Dropped request /indexing/settings/config {"indexer.settings.rebalance.redistribute_indexes":false,"indexer.settings.compaction.days_of_week":"Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday","indexer.settings.compaction.interval":"00:00,00:00","indexer.settings.persisted_snapshot.interval":5000,"indexer.settings.log_level":"info","indexer.settings.compaction.compaction_mode":"circular","indexer.settings.compaction.min_frag":30,"indexer.settings.inmemory_snapshot.interval":200,"indexer.settings.max_cpu_percent":0,"indexer.settings.storage_mode":"plasma","indexer.settings.recovery.max_rollbacks":2,"indexer.settings.memory_quota":1073741824,"indexer.settings.num_replica":0,"indexer.settings.compaction.abort_exceed_interval":false}. Any setting change will get applied once Indexer is ready.
2021-03-31T23:47:33.723-07:00 [Info] SettingsMgr:: Dropped request /indexing/settings/config {"indexer.settings.compaction.days_of_week":"Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday","indexer.settings.rebalance.redistribute_indexes":false,"indexer.settings.compaction.interval":"00:00,00:00","indexer.settings.compaction.compaction_mode":"circular","indexer.settings.log_level":"info","indexer.settings.persisted_snapshot.interval":5000,"indexer.settings.compaction.min_frag":30,"indexer.settings.inmemory_snapshot.interval":200,"indexer.settings.max_cpu_percent":0,"indexer.settings.storage_mode":"plasma","indexer.settings.recovery.max_rollbacks":2,"indexer.settings.num_replica":0,"indexer.settings.memory_quota":1073741824,"indexer.settings.compaction.abort_exceed_interval":false}. Any setting change will get applied once Indexer is ready.
5. Retry with projector after 5 seconds works fine and indexer proceeds with rest of the warmup. This includes registering with ns-server
2021-03-31T23:47:33.814-07:00 [Info] RebalanceMgr::NewRebalanceMgr false <nil>
2021-03-31T23:47:33.814-07:00 [Info] RebalanceMgr::initService Init
2021-03-31T23:47:33.816-07:00 [Info] ServiceMgr::registerWithServer nodeuuid 945900b90b8db3c17cb59af21f15545b
2021-03-31T23:47:33.817-07:00 [Info] Indexer::NewIndexer Status Active
6. Step5 happens concurrently with applying the pending settings change
2021-03-31T23:47:33.815-07:00 [Info] SettingsMgr::handleIndexerReady apply pending setting changes
2021-03-31T23:47:33.824-07:00 [Info] New settings received:
{"indexer.settings.compaction.days_of_week":"Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday","indexer.settings.rebalance.redistribute_indexes":false,"indexer.settings.compaction.interval":"00:00,00:00","indexer.settings.compaction.compaction_mode":"circular","indexer.settings.log_level":"info","indexer.settings.persisted_snapshot.interval":5000,"indexer.settings.compaction.min_frag":30,"indexer.settings.inmemory_snapshot.interval":200,"indexer.settings.max_cpu_percent":0,"indexer.settings.storage_mode":"plasma","indexer.settings.recovery.max_rollbacks":2,"indexer.settings.num_replica":0,"indexer.settings.memory_quota":1073741824,"indexer.settings.compaction.abort_exceed_interval":false}
7. Once the storage mode setting gets applied, indexer gets restarted
2021-03-31T23:47:33.828-07:00 [Info] Indexer::updateStorageMode Try setting storage mode to plasma
2021-03-31T23:47:33.828-07:00 [Info] Indexer::updateStorageMode Storage Mode Set plasma. Restarting indexer
8. Projector got started at 2021-03-31T23:47:27.714
2021-03-31T23:47:27.714-07:00 [Info] Projector started with command line:
9. Indexer sends the request roughly at the same time 2021-03-31T23:47:27.748
2021-03-31T23:47:27.748-07:00 [Info] KVSender::sendShutdownTopic Projector 172.23.105.211:9999 Topic MAINT_STREAM_TOPIC_945900b90b8db3c17cb59af21f15545b
Projector might have taken 1sec before it was ready to accept TCP connections, leading to "connection refused".
On co-located setups, with kv+index service together on a node, this failure is possible.
This race condition has always been there in the product. Not specific to 7.0. This is the first time it is reported. Potentially deferrable as changing the bootstrapping sequence might be a bit risky at this point.
Jeelan Poola
added a comment - This race condition has always been there in the product. Not specific to 7.0. This is the first time it is reported. Potentially deferrable as changing the bootstrapping sequence might be a bit risky at this point.
Build couchbase-server-7.1.0-1156 contains indexing commit 048f2f1 with commit message: MB-45390 skip restart for plasma/memdb on storage mode change
Couchbase Build Team
added a comment - Build couchbase-server-7.1.0-1156 contains indexing commit 048f2f1 with commit message:
MB-45390 skip restart for plasma/memdb on storage mode change
Could be a duplicate