Uploaded image for project: 'Couchbase Server'
  1. Couchbase Server
  2. MB-39444

FTS: Rebalance failure : service_rebalance_failed while ejecting nodes

    XMLWordPrintable

Details

    • Untriaged
    • 1
    • Unknown

    Description

      Build: 7.0.0-2083
      Test suite: centos-fts_stabletopologyP0-scorch
      Test failed: http://qa.sc.couchbase.com/job/test_suite_executor/211936/consoleText

      ./testrunner -i test.ini -p get-cbcollect-info=True,get-logs=False,stop-on-failure=False,index_type=scorch,fts_quota=750 -t fts.stable_topology_fts.StableTopFTS.create_simple_default_index,items=1000,cluster=D:F,standard_buckets=3,sasl_buckets=3,index_per_bucket=3,GROUP=P0,disable_HTP=True,get-cbcollect-info=True,index_type=scorch,fts_quota=750,cluster=D:F,disable_HTP=True,get-cbcollect-info=True,GROUP=P0,index_type=scorch,fts_quota=750

      Steps in the test:

      • Create a cluster with n1:fts+kv+index+n1ql and n2:fts
      • Create default, sasl_bucket_1, sasl_bucket_2, sasl_bucket_3, standard_bucket_1, standard_bucket_2, standard_bucket_3
      • Create fts indexes : default_index_1, default_index_2, default_index_3, sasl_bucket_1_index_1, sasl_bucket_1_index_2, sasl_bucket_1_index_3, sasl_bucket_2_index_1, sasl_bucket_2_index_2, sasl_bucket_2_index_3, sasl_bucket_3_index_1, sasl_bucket_3_index_2, sasl_bucket_3_index_3,standard_bucket_1_index_1,standard_bucket_1_index_2,standard_bucket_1_index_3,standard_bucket_2_index_1,standard_bucket_2_index_2,standard_bucket_2_index_3,standard_bucket_3_index_1,standard_bucket_3_index_2,standard_bucket_3_index_3

        {"type": "fulltext-index", "name": "default_index_1", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "default", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "default_index_2", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "default", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "default_index_3", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "default", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_1_index_1", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_1", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_1_index_2", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_1", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_1_index_3", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_1", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_2_index_1", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_2", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_2_index_2", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_2", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_2_index_3", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_2", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_3_index_1", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_3", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_3_index_2", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_3", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "sasl_bucket_3_index_3", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "sasl_bucket_3", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_1_index_1", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_1", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_1_index_2", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_1", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_1_index_3", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_1", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_2_index_1", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_2", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_2_index_2", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_2", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_2_index_3", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_2", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_3_index_1", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_3", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_3_index_2", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_3", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
         
        {"type": "fulltext-index", "name": "standard_bucket_3_index_3", "uuid": "", "params": {"store": {"kvStoreName": "mossStore", "mossStoreOptions": {}, "indexType": "scorch"}}, "sourceType": "couchbase", "sourceName": "standard_bucket_3", "sourceUUID": "", "planParams": {"numReplicas": 0, "maxPartitionsPerPIndex": 171}, "sourceParams": {}}
        

      • Load all the buckets with 1000 docs and wait for all the indexes to complete
      • delete all the indexes one after the other and wait for index delete to complete
      • delete all the buckets created and wait for delete bucket to complete
      • rebalancing all nodes in order to remove nodes. We see below error:

        2020-05-18 12:56:27 | INFO | MainProcess | test_thread | [rest_client.rebalance] rebalance params : {'knownNodes': 'ns_1@172.23.121.65,ns_1@172.23.121.66', 'ejectedNodes': 'ns_1@172.23.121.66', 'user': 'Administrator', 'password': 'password'}
        2020-05-18 12:56:27 | INFO | MainProcess | test_thread | [rest_client.rebalance] rebalance operation started
        2020-05-18 12:56:27 | INFO | MainProcess | test_thread | [rest_client._rebalance_status_and_progress] rebalance percentage : 0.00 %
        2020-05-18 12:56:37 | INFO | MainProcess | test_thread | [rest_client._rebalance_status_and_progress] rebalance percentage : 12.00 %
        2020-05-18 12:56:47 | INFO | MainProcess | test_thread | [rest_client._rebalance_status_and_progress] rebalance percentage : 12.00 %
        2020-05-18 12:56:57 | INFO | MainProcess | test_thread | [rest_client._rebalance_status_and_progress] rebalance percentage : 12.00 %
        2020-05-18 12:57:07 | INFO | MainProcess | test_thread | [rest_client._rebalance_status_and_progress] rebalance percentage : 12.00 %
        2020-05-18 12:57:17 | INFO | MainProcess | test_thread | [rest_client._rebalance_status_and_progress] rebalance percentage : 12.00 %
        2020-05-18 12:57:27 | ERROR | MainProcess | test_thread | [rest_client._rebalance_status_and_progress] {'status': 'none', 'errorMessage': 'Rebalance failed. See logs for detailed reason. You can try again.'} - rebalance failed
        2020-05-18 12:57:28 | INFO | MainProcess | test_thread | [rest_client.print_UI_logs] Latest logs from UI on 172.23.121.65:
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'critical', 'code': 0, 'module': 'ns_orchestrator', 'tstamp': 1589831847648, 'shortText': 'message', 'text': 'Rebalance exited with reason {service_rebalance_failed,fts,\n                              {agent_died,<0.8356.12>,\n                               {linked_process_died,<0.5949.15>,\n                                {timeout,\n                                 {gen_server,call,\n                                  [<0.8522.12>,\n                                   {call,"ServiceAPI.StartTopologyChange",\n                                    #Fun<json_rpc_connection.0.102434519>},\n                                   60000]}}}}}.\nRebalance Operation Id = ba2bf82f2a86c9e88b88245efe042717', 'serverTime': '2020-05-18T12:57:27.648Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'info', 'code': 0, 'module': 'ns_orchestrator', 'tstamp': 1589831787536, 'shortText': 'message', 'text': "Starting rebalance, KeepNodes = ['ns_1@172.23.121.65'], EjectNodes = ['ns_1@172.23.121.66'], Failed over and being ejected nodes = []; no delta recovery nodes; Operation Id = ba2bf82f2a86c9e88b88245efe042717", 'serverTime': '2020-05-18T12:56:27.536Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'warning', 'code': 102, 'module': 'menelaus_web', 'tstamp': 1589831787483, 'shortText': 'client-side error report', 'text': 'Client-side error-report for user "Administrator" on node \'ns_1@172.23.121.65\':\nUser-Agent:Python-httplib2/0.13.1 (gzip)\nStarting rebalance from test, ejected nodes [\'ns_1@172.23.121.66\']', 'serverTime': '2020-05-18T12:56:27.483Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'info', 'code': 11, 'module': 'menelaus_web', 'tstamp': 1589831787156, 'shortText': 'message', 'text': 'Deleted bucket "standard_bucket_3"\n', 'serverTime': '2020-05-18T12:56:27.156Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'info', 'code': 0, 'module': 'ns_memcached', 'tstamp': 1589831786400, 'shortText': 'message', 'text': 'Shutting down bucket "standard_bucket_3" on \'ns_1@172.23.121.65\' for deletion', 'serverTime': '2020-05-18T12:56:26.400Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'warning', 'code': 0, 'module': 'compaction_daemon', 'tstamp': 1589831786312, 'shortText': 'message', 'text': 'Compactor for database `standard_bucket_3` (pid [{type,database},\n                                                 {important,true},\n                                                 {name,\n                                                  <<"standard_bucket_3">>},\n                                                 {fa,\n                                                  {#Fun<compaction_daemon.4.9063353>,\n                                                   [<<"standard_bucket_3">>,\n                                                    {config,\n                                                     {30,undefined},\n                                                     {30,undefined},\n                                                     undefined,false,false,\n                                                     {daemon_config,30,\n                                                      131072,20971520}},\n                                                    false,\n                                                    {[{type,bucket}]}]}}]) terminated unexpectedly: {bucket_not_present,\n                                                                                                     <<"standard_bucket_3">>}', 'serverTime': '2020-05-18T12:56:26.312Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'info', 'code': 11, 'module': 'menelaus_web', 'tstamp': 1589831786192, 'shortText': 'message', 'text': 'Deleted bucket "standard_bucket_2"\n', 'serverTime': '2020-05-18T12:56:26.192Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'info', 'code': 0, 'module': 'ns_memcached', 'tstamp': 1589831785378, 'shortText': 'message', 'text': 'Shutting down bucket "standard_bucket_2" on \'ns_1@172.23.121.65\' for deletion', 'serverTime': '2020-05-18T12:56:25.378Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'info', 'code': 11, 'module': 'menelaus_web', 'tstamp': 1589831785120, 'shortText': 'message', 'text': 'Deleted bucket "standard_bucket_1"\n', 'serverTime': '2020-05-18T12:56:25.120Z'}
        2020-05-18 12:57:28 | ERROR | MainProcess | test_thread | [rest_client.print_UI_logs] {'node': 'ns_1@172.23.121.65', 'type': 'info', 'code': 0, 'module': 'ns_memcached', 'tstamp': 1589831784191, 'shortText': 'message', 'text': 'Shutting down bucket "standard_bucket_1" on \'ns_1@172.23.121.65\' for deletion', 'serverTime': '2020-05-18T12:56:24.191Z'}
        

      Log snippet from logs:

       
      2020-05-18T12:56:27.536-07:00, ns_orchestrator:0:info:message(ns_1@172.23.121.65) - Starting rebalance, KeepNodes = ['ns_1@172.23.121.65'], EjectNodes = ['ns_1@172.23.121.66'], Failed over and being ejected nodes = []; no delta recovery nodes; Operation Id = ba2bf82f2a86c9e88b88245efe042717
      2020-05-18T12:57:27.648-07:00, ns_orchestrator:0:critical:message(ns_1@172.23.121.65) - Rebalance exited with reason {service_rebalance_failed,fts,
                                    {agent_died,<0.8356.12>,
                                     {linked_process_died,<0.5949.15>,
                                      {timeout,
                                       {gen_server,call,
                                        [<0.8522.12>,
                                         {call,"ServiceAPI.StartTopologyChange",
                                          #Fun<json_rpc_connection.0.102434519>},
                                         60000]}}}}}.
      Rebalance Operation Id = ba2bf82f2a86c9e88b88245efe042717
      -------------------------------
       
       
      per_node_processes('ns_1@172.23.121.65') =
           {<0.9576.15>,
            [{backtrace,
                 [<<"Program counter: 0x00007f1cb7fbeb80 (diag_handler:'-collect_diag_per_node/1-fun-1-'/2 + 136)">>,
                  <<"CP: 0x0000000000000000 (invalid)">>,<<"arity = 0">>,<<>>,
                  <<"0x00007f1c660dcf20 Return addr 0x00007f1d164a9ea0 (proc_lib:init_p/3 + 288)">>,
                  <<"y(0)     <0.9575.15>">>,<<>>,
                  <<"0x00007f1c660dcf30 Return addr 0x0000000000942608 (<terminate process normally>)">>,
                  <<"y(0)     Catch 0x00007f1d164a9ec0 (proc_lib:init_p/3 + 320)">>,
                  <<"y(1)     []">>,<<>>]},
             {messages,[]},
             {dictionary,
                 [{'$initial_call',
                      {diag_handler,'-collect_diag_per_node/1-fun-1-',0}},
                  {'$ancestors',[<0.9575.15>]}]},
             {registered_name,[]},
             {status,waiting},
             {initial_call,{proc_lib,init_p,3}},
             {error_handler,error_handler},
             {garbage_collection,
                 [{max_heap_size,#{error_logger => true,kill => true,size => 0}},
                  {min_bin_vheap_size,46422},
                  {min_heap_size,233},
                  {fullsweep_after,512},
                  {minor_gcs,0}]},
             {garbage_collection_info,
                 [{old_heap_block_size,0},
                  {heap_block_size,233},
                  {mbuf_size,0},
                  {recent_size,0},
                  {stack_size,5},
                  {old_heap_size,0},
                  {heap_size,32},
                  {bin_vheap_size,0},
                  {bin_vheap_block_size,46422},
                  {bin_old_vheap_size,0},
                  {bin_old_vheap_block_size,46422}]},
             {links,[<0.9575.15>]},
             {monitors,[{process,<0.290.0>},{process,<0.9575.15>}]},
             {monitored_by,[]},
             {memory,2888},
             {message_queue_len,0},
             {reductions,9},
             {trap_exit,false},
             {current_location,
                 {diag_handler,'-collect_diag_per_node/1-fun-1-',2,
                     [{file,"src/diag_handler.erl"},{line,238}]}}]}
           {<0.9575.15>,
            [{backtrace,
                 [<<"Program counter: 0x00007f1d0ecc4ee0 (unknown function)">>,
                  <<"CP: 0x00007f1cb7fb67f0 (diag_handler:grab_process_info/1 + 72)">>,
                  <<>>,
                  <<"0x00007f1c8673e598 Return addr 0x00007f1cb7fb8100 (diag_handler:grab_process_infos_loop/2 + 120)">>,
                  <<"y(0)     []">>,<<"y(1)     []">>,<<"y(2)     []">>,<<>>,
                  <<"0x00007f1c8673e5b8 Return addr 0x00007f1cb7fb7810 (diag_handler:collect_diag_per_node_body/1 + 216)">>,
                  <<"(0)     [{<0.9574.15>,[{backtrace,[<<\"Program counter: 0x00007f1d165351a0 (gen:do_call/4 + 392)\">>,<<\"CP: 0x000000000000">>,
                  <<"y(1)     [<0.9576.15>]">>,<<"y(2)     <0.9575.15>">>,
                  <<"y(3)     Catch 0x00007f1cb7fb8100 (diag_handler:grab_process_infos_loop/2 + 120)">>,
                  <<>>,
                  <<"0x00007f1c8673e5e0 Return addr 0x00007f1cb7fbea08 (diag_handler:'-collect_diag_per_node/1-fun-2-'/2 + 224)">>,
                  <<"y(0)     []">>,<<"y(1)     []">>,<<"y(2)     []">>,
                  <<"y(3)     #Fun<diag_handler.36.12421120>">>,<<>>,
                  <<"0x00007f1c8673e608 Return addr 0x0000000000942608 (<terminate process normally>)">>,
                  <<"y(0)     []">>,
                  <<"y(1)     Catch 0x00007f1cb7fbea28 (diag_handler:'-collect_diag_per_node/1-fun-2-'/2 + 256)">>,
                  <<"y(2)     #Fun<diag_handler.36.12421120>">>,<<>>]},
             {messages,[]},
             {dictionary,[]},
      
      

      Attachments

        Issue Links

          Activity

            People

              abhinav Abhi Dangeti
              girish.benakappa Girish Benakappa
              Votes:
              0 Vote for this issue
              Watchers:
              6 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                PagerDuty