Uploaded image for project: 'Couchbase Server'
  1. Couchbase Server
  2. MB-45973

Upgrade 6.6.1-> 7.0.0 : Timer not firing after upgrade, worker count change and service crash

    XMLWordPrintable

Details

    Description

      Online Upgrade from 6.6.1-9123 -> 7.0.0-5046

      ./testrunner.py -i /tmp/win10-bucket-ops.ini -t eventing.eventing_upgrade.EventingUpgrade.test_online_upgrade_with_regular_rebalance_with_eventing,nodes_init=4,services_init=kv-eventing-index-n1ql,dataset=default,groups=simple,upgrade_test=True,initial_version=6.6.1-9213,doc-per-day=10,upgrade_version=7.0.0-5046 
       
      Current : 19581 Expected : 20160

      All timers created successfully 

      [
          {
              "dcp_feed_boundary": "everything",
              "event_processing_stats": {
                  "adhoc_timer_response_received": 2,
                  "agg_messages_sent_to_worker": 21625,
                  "agg_queue_memory": 0,
                  "agg_queue_memory_cap": 53477376,
                  "agg_queue_size": 0,
                  "agg_queue_size_cap": 300000,
                  "agg_timer_feedback_queue_cap": 1500,
                  "dcp_mutation": 20160,
                  "dcp_mutation_sent_to_worker": 20160,
                  "dcp_snapshot": 1024,
                  "dcp_stream_req_counter": 1024,
                  "dcp_streamreq": 1024,
                  "execution_stats": 84,
                  "failure_stats": 84,
                  "latency_stats": 84,
                  "lcb_exception_stats": 84,
                  "log_level": 3,
                  "num_processed_events": 20160,
                  "processed_events_size": 13934248,
                  "thr_count": 3,
                  "thr_map": 3,
                  "timer_responses_received": 20160,
                  "v8_init": 3,
                  "v8_load": 3
              },
              "events_remaining": {
                  "dcp_backlog": 0
              },
              "execution_stats": {
                  "agg_queue_memory": 0,
                  "agg_queue_size": 0,
                  "curl": {
                      "delete": 0,
                      "get": 0,
                      "head": 0,
                      "post": 0,
                      "put": 0
                  },
                  "dcp_delete_msg_counter": 0,
                  "dcp_delete_parse_failure": 0,
                  "dcp_mutation_msg_counter": 20160,
                  "dcp_mutation_parse_failure": 0,
                  "enqueued_dcp_delete_msg_counter": 0,
                  "enqueued_dcp_mutation_msg_counter": 20160,
                  "enqueued_timer_msg_counter": 0,
                  "feedback_queue_size": 0,
                  "filtered_dcp_delete_counter": 0,
                  "filtered_dcp_mutation_counter": 0,
                  "lcb_retry_failure": 0,
                  "messages_parsed": 21613,
                  "num_processed_events": 20160,
                  "on_delete_failure": 0,
                  "on_delete_success": 0,
                  "on_update_failure": 0,
                  "on_update_success": 20160,
                  "processed_events_size": 13934248,
                  "timer_cancel_counter": 0,
                  "timer_create_counter": 20160,
                  "timer_create_failure": 0,
                  "timer_msg_counter": 0,
                  "timer_responses_sent": 0,
                  "timestamp": {
                      "118126": "2021-04-28T09:14:00Z",
                      "118131": "2021-04-28T09:14:00Z",
                      "118142": "2021-04-28T09:14:00Z"
                  },
                  "uv_msg_parse_failure": 0,
                  "uv_try_write_failure_counter": 0
              },
              "failure_stats": {
                  "app_worker_setting_events_lost": 0,
                  "bkt_ops_cas_mismatch_count": 0,
                  "bucket_op_exception_count": 0,
                  "checkpoint_failure_count": 0,
                  "curl_non_200_response": 0,
                  "dcp_events_lost": 0,
                  "debugger_events_lost": 0,
                  "delete_events_lost": 0,
                  "mutation_events_lost": 0,
                  "n1ql_op_exception_count": 0,
                  "timeout_count": 0,
                  "timer_callback_missing_counter": 0,
                  "timer_context_size_exceeded_counter": 0,
                  "timer_events_lost": 0,
                  "timestamp": {
                      "118126": "2021-04-28T09:14:00Z",
                      "118131": "2021-04-28T09:14:00Z",
                      "118142": "2021-04-28T09:14:00Z"
                  },
                  "v8worker_events_lost": 0
              },
              "function_id": 2335275409,
              "function_name": "timers",
              "gocb_creds_request_counter": 8,
              "internal_vb_distribution_stats": {
                  "worker_timers_0": "[0-341]",
                  "worker_timers_1": "[342-682]",
                  "worker_timers_2": "[683-1023]"
              },
              "latency_percentile_stats": {
                  "100": 22300,
                  "50": 900,
                  "80": 1200,
                  "90": 1400,
                  "95": 1700,
                  "99": 3500
              },
              "lcb_creds_request_counter": 24,
              "lcb_exception_stats": {},
              "metastore_stats": {
                  "metastore_delete_err": 0,
                  "metastore_deletes": 0,
                  "metastore_not_found": 0,
                  "metastore_scan": 0,
                  "metastore_scan_due": 0,
                  "metastore_scan_err": 0,
                  "metastore_set": 0,
                  "metastore_set_err": 0
              },
              "planner_stats": [
                  {
                      "host_name": "172.23.106.67:8096",
                      "start_vb": 0,
                      "vb_count": 1024
                  }
              ],
              "vb_distribution_stats_from_metadata": {},
              "worker_pids": {
                  "worker_timers_0": 118126,
                  "worker_timers_1": 118131,
                  "worker_timers_2": 118142
              }
          }
      ]  

      Rebalanc in 7.0.0 and out all 6.6.1 nodes. 

      Stats after upgrade

      [
          {
              "dcp_feed_boundary": "everything",
              "event_processing_stats": {
                  "agg_messages_sent_to_worker": 4327,
                  "agg_queue_memory": 0,
                  "agg_queue_memory_cap": 53477376,
                  "agg_queue_size": 0,
                  "agg_queue_size_cap": 100000,
                  "agg_timer_feedback_queue_cap": 500,
                  "dcp_stream_req_counter": 1024,
                  "dcp_streamreq": 1024,
                  "execution_stats": 659,
                  "failure_stats": 659,
                  "latency_stats": 659,
                  "lcb_exception_stats": 659,
                  "log_level": 1,
                  "num_processed_events": 0,
                  "processed_events_size": 0,
                  "thr_count": 1,
                  "thr_map": 1,
                  "v8_init": 1,
                  "v8_load": 1
              },
              "events_remaining": {
                  "dcp_backlog": 0
              },
              "execution_stats": {
                  "agg_queue_memory": 0,
                  "agg_queue_size": 0,
                  "curl": {
                      "delete": 0,
                      "get": 0,
                      "head": 0,
                      "post": 0,
                      "put": 0
                  },
                  "curl_success_count": 0,
                  "dcp_delete_msg_counter": 0,
                  "dcp_delete_parse_failure": 0,
                  "dcp_mutation_msg_counter": 0,
                  "dcp_mutation_parse_failure": 0,
                  "enqueued_dcp_delete_msg_counter": 0,
                  "enqueued_dcp_mutation_msg_counter": 0,
                  "enqueued_timer_msg_counter": 0,
                  "feedback_queue_size": 0,
                  "filtered_dcp_delete_counter": 0,
                  "filtered_dcp_mutation_counter": 0,
                  "lcb_retry_failure": 0,
                  "messages_parsed": 4323,
                  "no_op_counter": 0,
                  "num_processed_events": 0,
                  "on_delete_failure": 0,
                  "on_delete_success": 0,
                  "on_update_failure": 0,
                  "on_update_success": 0,
                  "processed_events_size": 0,
                  "timer_callback_failure": 0,
                  "timer_callback_success": 0,
                  "timer_cancel_counter": 0,
                  "timer_create_counter": 0,
                  "timer_create_failure": 0,
                  "timer_msg_counter": 0,
                  "timer_responses_sent": 0,
                  "timestamp": {
                      "29123": "2021-04-28T09:40:03Z"
                  },
                  "uv_msg_parse_failure": 0,
                  "uv_try_write_failure_counter": 0
              },
              "failure_stats": {
                  "app_worker_setting_events_lost": 0,
                  "bkt_ops_cas_mismatch_count": 0,
                  "bucket_op_exception_count": 0,
                  "checkpoint_failure_count": 0,
                  "curl_failure_count": 0,
                  "curl_max_resp_size_exceeded": 0,
                  "curl_non_200_response": 0,
                  "curl_timeout_count": 0,
                  "dcp_events_lost": 0,
                  "debugger_events_lost": 0,
                  "delete_events_lost": 0,
                  "mutation_events_lost": 0,
                  "n1ql_op_exception_count": 0,
                  "timeout_count": 0,
                  "timer_callback_missing_counter": 0,
                  "timer_context_size_exceeded_counter": 0,
                  "timer_events_lost": 0,
                  "timestamp": {
                      "29123": "2021-04-28T09:40:03Z"
                  },
                  "v8worker_events_lost": 0
              },
              "function_name": "timers",
              "gocb_creds_request_counter": 5,
              "function_id": 2335275409,
              "internal_vb_distribution_stats": {
                  "worker_timers_0": "[0-1023]"
              },
              "latency_percentile_stats": {
                  "50": 0,
                  "80": 0,
                  "90": 0,
                  "95": 0,
                  "99": 0,
                  "100": 0
              },
              "lcb_creds_request_counter": 8,
              "lcb_exception_stats": {},
              "planner_stats": [
                  {
                      "host_name": "172.23.106.76:8096",
                      "start_vb": 0,
                      "vb_count": 1024
                  }
              ],
              "metastore_stats": {
                  "metastore_delete_err": 0,
                  "metastore_deletes": 0,
                  "metastore_not_found": 0,
                  "metastore_scan": 0,
                  "metastore_scan_due": 0,
                  "metastore_scan_err": 0,
                  "metastore_set": 0,
                  "metastore_set_err": 0
              },
              "vb_distribution_stats_from_metadata": {
                  "172.23.106.76:8096": {
                      "worker_timers_0": "[0-1023]"
                  }
              },
              "worker_pids": {
                  "worker_timers_0": 29123
              }
          }
      ] 

      The eventing service restarted with 1 worker instead of 3 as we deployed it with 3 workers 

      2021-04-28T02:28:54.198-07:00, ns_log:0:info:message(ns_1@172.23.106.76) - Service 'eventing' exited with status 2. Restarting. Messages:
      goroutine 16 [running]:
      github.com/couchbase/eventing/supervisor.(*bucketWatchStruct).RefreshBucketManifestOnUIDChange(0xc0001c63c0, 0x2001d70, 0x1, 0x7ffef5100933, 0x4, 0xc0001a5101, 0xc0001a51b8, 0xc000917f78)
      	/home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/eventing/supervisor/util.go:524 +0x176
      github.com/couchbase/eventing/supervisor.(*SuperSupervisor).FetchBucketManifestInfo(0xc000114800, 0xc0008f21f0, 0x8, 0x2001d70, 0x1, 0x0, 0x0, 0x0)
      	/home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/eventing/supervisor/util.go:272 +0x11a
      github.com/couchbase/eventing/supervisor.(*SuperSupervisor).watchBucketChanges(0xc000114800)
      	/home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/eventing/supervisor/util.go:395 +0x612
      created by github.com/couchbase/eventing/supervisor.NewSuperSupervisor
      	/home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/eventing/supervisor/super_supervisor.go:90 +0xc5b 

       

      Attachments

        Issue Links

          No reviews matched the request. Check your Options in the drop-down menu of this sections header.

          Activity

            ankit.prabhu Ankit Prabhu added a comment -

            Here eventing won't write settings to metakv which are not provided by user and which won't affect normal functionality of the service(we write num_timer_partitions which are essential for timer firing). So in this case test is not giving worker_count when it stores function settings which will leave this field empty and this field will take the default value of the eventing server version(for 7.x its 1).
            For UI there was one minor bug during the display of readonly page of the setting which will be addressed by http://review.couchbase.org/c/eventing/+/152778

            ankit.prabhu Ankit Prabhu added a comment - Here eventing won't write settings to metakv which are not provided by user and which won't affect normal functionality of the service(we write num_timer_partitions which are essential for timer firing). So in this case test is not giving worker_count when it stores function settings which will leave this field empty and this field will take the default value of the eventing server version(for 7.x its 1). For UI there was one minor bug during the display of readonly page of the setting which will be addressed by http://review.couchbase.org/c/eventing/+/152778

            Release note - If there a 1 KV node in the cluster, user should not swap rebalance with just 1 kv node.
            2nd part - Ankit is working on the timer count fix. The fix will need to go to 6.6.x version.
            – Corner case with API, if the value is explicitly set by API, this issue with be seen. Also for 6.6, API are not GA.
            – If the user does the setting via UI, then all the values are consistent.

            ritam.sharma Ritam Sharma added a comment - Release note - If there a 1 KV node in the cluster, user should not swap rebalance with just 1 kv node. 2nd part - Ankit is working on the timer count fix. The fix will need to go to 6.6.x version. – Corner case with API, if the value is explicitly set by API, this issue with be seen. Also for 6.6, API are not GA. – If the user does the setting via UI, then all the values are consistent.

            Build couchbase-server-7.0.0-5169 contains eventing commit 112120f with commit message:
            MB-45973: Fix default worker count on ui

            build-team Couchbase Build Team added a comment - Build couchbase-server-7.0.0-5169 contains eventing commit 112120f with commit message: MB-45973 : Fix default worker count on ui
            sujay.gad Sujay Gad added a comment -

            Worker count issue seen for offline upgrade as well while verifying MB-46092

            sujay.gad Sujay Gad added a comment - Worker count issue seen for offline upgrade as well while verifying MB-46092
            sujay.gad Sujay Gad added a comment -

            Test is passing on upgrade from 6.6.1-9123 to 7.1.0-2223. Closing.

            sujay.gad Sujay Gad added a comment - Test is passing on upgrade from 6.6.1-9123 to 7.1.0-2223. Closing.

            People

              ankit.prabhu Ankit Prabhu
              vikas.chaudhary Vikas Chaudhary
              Votes:
              0 Vote for this issue
              Watchers:
              6 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Gerrit Reviews

                  There are no open Gerrit changes

                  PagerDuty