Couchbase Server / MB-44328

Eventing: N1ql queries failing with CAS mismatch (LCB 209)


Details


    Description

      build: 7.0.0-4454

      Test

       ./testrunner -i /tmp/testexec.120861.ini -p get-cbcollect-info=True,GROUP=bucket_op,skip_log_scan=False,bucket_storage=couchstore -t eventing.eventing_rebalance_collection.EventingRebalanceCollection.test_kv_rebalance_in_when_existing_eventing_node_is_processing_mutations,nodes_init=6,services_init=kv-kv-eventing-eventing-eventing-index:n1ql,dataset=default,groups=simple,reset_services=True,doc-per-day=20,GROUP=bucket_op;bucket_op_windows,java_sdk_client=True
       
      Exception: missing data in destination bucket. Current : 40318 Expected : 40320  dcp_backlog : 0  TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.122.117': {}, '172.23.122.90': {}, '172.23.122.97': {}}

      Stats

       {
              "dcp_feed_boundary": "everything",
              "event_processing_stats": {
                  "adhoc_timer_response_received": 2,
                  "agg_messages_sent_to_worker": 17766,
                  "agg_queue_memory": 0,
                  "agg_queue_memory_cap": 20971520,
                  "agg_queue_size": 0,
                  "agg_queue_size_cap": 100000,
                  "agg_timer_feedback_queue_cap": 500,
                  "dcp_mutation": 54088,
                  "dcp_mutation_sent_to_worker": 13432,
                  "dcp_snapshot": 40540,
                  "dcp_stream_req_counter": 1021,
                  "dcp_stream_req_err_counter": 453,
                  "dcp_streamend": 170,
                  "dcp_streamreq": 568,
                  "dcp_system_event": 3410,
                  "execution_stats": 729,
                  "failure_stats": 729,
                  "latency_stats": 729,
                  "lcb_exception_stats": 729,
                  "log_level": 1,
                  "num_processed_events": 13432,
                  "processed_events_size": 12023960,
                  "thr_count": 1,
                  "thr_map": 1,
                  "v8_init": 1,
                  "v8_load": 1
              },
              "events_remaining": {
                  "dcp_backlog": 0
              },
              "execution_stats": {
                  "agg_queue_memory": 0,
                  "agg_queue_size": 0,
                  "curl": {
                      "delete": 0,
                      "get": 0,
                      "head": 0,
                      "post": 0,
                      "put": 0
                  },
                  "curl_success_count": 0,
                  "dcp_delete_msg_counter": 0,
                  "dcp_delete_parse_failure": 0,
                  "dcp_mutation_msg_counter": 13432,
                  "dcp_mutation_parse_failure": 0,
                  "enqueued_dcp_delete_msg_counter": 0,
                  "enqueued_dcp_mutation_msg_counter": 13432,
                  "enqueued_timer_msg_counter": 0,
                  "feedback_queue_size": 0,
                  "filtered_dcp_delete_counter": 0,
                  "filtered_dcp_mutation_counter": 0,
                  "lcb_retry_failure": 0,
                  "messages_parsed": 17762,
                  "num_processed_events": 13432,
                  "on_delete_failure": 0,
                  "on_delete_success": 0,
                  "on_update_failure": 1,
                  "on_update_success": 13431,
                  "processed_events_size": 12023960,
                  "timer_callback_failure": 0,
                  "timer_callback_success": 0,
                  "timer_cancel_counter": 0,
                  "timer_create_counter": 0,
                  "timer_create_failure": 0,
                  "timer_msg_counter": 0,
                  "timer_responses_sent": 0,
                  "timestamp": {
                      "47458": "2021-02-13T05:24:38Z"
                  },
                  "uv_msg_parse_failure": 0,
                  "uv_try_write_failure_counter": 0
              },
              "failure_stats": {
                  "app_worker_setting_events_lost": 0,
                  "bkt_ops_cas_mismatch_count": 0,
                  "bucket_op_exception_count": 0,
                  "checkpoint_failure_count": 0,
                  "curl_failure_count": 0,
                  "curl_max_resp_size_exceeded": 0,
                  "curl_non_200_response": 0,
                  "curl_timeout_count": 0,
                  "dcp_events_lost": 0,
                  "debugger_events_lost": 0,
                  "delete_events_lost": 0,
                  "mutation_events_lost": 0,
                  "n1ql_op_exception_count": 1,
                  "timeout_count": 0,
                  "timer_callback_missing_counter": 0,
                  "timer_context_size_exceeded_counter": 0,
                  "timer_events_lost": 0,
                  "timestamp": {
                      "47458": "2021-02-13T05:24:38Z"
                  },
                  "v8worker_events_lost": 0
              },
              "function_id": 1890800864,
              "function_name": "n1ql",
              "gocb_creds_request_counter": 58,
              "internal_vb_distribution_stats": {
                  "worker_n1ql_0": "[342-682]"
              },
              "latency_percentile_stats": {
                  "100": 142000,
                  "50": 3400,
                  "80": 4700,
                  "90": 5800,
                  "95": 7100,
                  "99": 11800
              },
              "lcb_creds_request_counter": 26931,
              "lcb_exception_stats": {
                  "12009": 1,
                  "209": 1
              },
              "metastore_stats": {},
              "planner_stats": [
                  {
                      "host_name": "172.23.122.117:8096",
                      "start_vb": 0,
                      "vb_count": 342
                  },
                  {
                      "host_name": "172.23.122.90:8096",
                      "start_vb": 342,
                      "vb_count": 341
                  },
                  {
                      "host_name": "172.23.122.97:8096",
                      "start_vb": 683,
                      "vb_count": 341
                  }
              ],
              "vb_distribution_stats_from_metadata": {
                  "172.23.122.117:8096": {
                      "worker_n1ql_0": "[0-341]"
                  },
                  "172.23.122.90:8096": {
                      "worker_n1ql_0": "[342-682]"
                  },
                  "172.23.122.97:8096": {
                      "worker_n1ql_0": "[683-1023]"
                  }
              },
              "worker_pids": {
                  "worker_n1ql_0": 47458
              }
          }

      Other tests with similar failures

      ./testrunner -i /tmp/testexec.120861.ini -p get-cbcollect-info=True,GROUP=bucket_op,skip_log_scan=False,bucket_storage=couchstore -t eventing.eventing_rebalance_collection.EventingRebalanceCollection.test_kv_rebalance_out_when_existing_eventing_node_is_processing_mutations,nodes_init=7,services_init=kv-kv-kv-eventing-eventing-eventing-index:n1ql,dataset=default,groups=simple,reset_services=True,doc-per-day=20,GROUP=bucket_op;bucket_op_windows,java_sdk_client=True
       
      Exception: missing data in destination bucket. Current : 40311 Expected : 40320  dcp_backlog : 0  TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.122.117': {}, '172.23.122.82': {}, '172.23.122.97': {}} 

      {
              "dcp_feed_boundary": "everything",
              "event_processing_stats": {
                  "adhoc_timer_response_received": 3,
                  "agg_messages_sent_to_worker": 18480,
                  "agg_queue_memory": 0,
                  "agg_queue_memory_cap": 20971520,
                  "agg_queue_size": 0,
                  "agg_queue_size_cap": 100000,
                  "agg_timer_feedback_queue_cap": 500,
                  "dcp_mutation": 66943,
                  "dcp_mutation_sent_to_worker": 13389,
                  "dcp_snapshot": 40639,
                  "dcp_stream_req_counter": 5805,
                  "dcp_stream_req_err_counter": 4820,
                  "dcp_streamend": 351,
                  "dcp_streamreq": 985,
                  "dcp_system_event": 3410,
                  "execution_stats": 810,
                  "failure_stats": 810,
                  "latency_stats": 810,
                  "lcb_exception_stats": 810,
                  "log_level": 1,
                  "num_processed_events": 13389,
                  "processed_events_size": 11986244,
                  "thr_count": 1,
                  "thr_map": 1,
                  "v8_init": 1,
                  "v8_load": 1
              },
              "events_remaining": {
                  "dcp_backlog": 0
              },
              "execution_stats": {
                  "agg_queue_memory": 0,
                  "agg_queue_size": 0,
                  "curl": {
                      "delete": 0,
                      "get": 0,
                      "head": 0,
                      "post": 0,
                      "put": 0
                  },
                  "curl_success_count": 0,
                  "dcp_delete_msg_counter": 0,
                  "dcp_delete_parse_failure": 0,
                  "dcp_mutation_msg_counter": 13389,
                  "dcp_mutation_parse_failure": 0,
                  "enqueued_dcp_delete_msg_counter": 0,
                  "enqueued_dcp_mutation_msg_counter": 13389,
                  "enqueued_timer_msg_counter": 0,
                  "feedback_queue_size": 0,
                  "filtered_dcp_delete_counter": 0,
                  "filtered_dcp_mutation_counter": 0,
                  "lcb_retry_failure": 0,
                  "messages_parsed": 18476,
                  "num_processed_events": 13389,
                  "on_delete_failure": 0,
                  "on_delete_success": 0,
                  "on_update_failure": 9,
                  "on_update_success": 13380,
                  "processed_events_size": 11986244,
                  "timer_callback_failure": 0,
                  "timer_callback_success": 0,
                  "timer_cancel_counter": 0,
                  "timer_create_counter": 0,
                  "timer_create_failure": 0,
                  "timer_msg_counter": 0,
                  "timer_responses_sent": 0,
                  "timestamp": {
                      "50026": "2021-02-13T05:49:05Z"
                  },
                  "uv_msg_parse_failure": 0,
                  "uv_try_write_failure_counter": 0
              },
              "failure_stats": {
                  "app_worker_setting_events_lost": 0,
                  "bkt_ops_cas_mismatch_count": 0,
                  "bucket_op_exception_count": 0,
                  "checkpoint_failure_count": 0,
                  "curl_failure_count": 0,
                  "curl_max_resp_size_exceeded": 0,
                  "curl_non_200_response": 0,
                  "curl_timeout_count": 0,
                  "dcp_events_lost": 0,
                  "debugger_events_lost": 0,
                  "delete_events_lost": 0,
                  "mutation_events_lost": 0,
                  "n1ql_op_exception_count": 9,
                  "timeout_count": 0,
                  "timer_callback_missing_counter": 0,
                  "timer_context_size_exceeded_counter": 0,
                  "timer_events_lost": 0,
                  "timestamp": {
                      "50026": "2021-02-13T05:49:05Z"
                  },
                  "v8worker_events_lost": 0
              },
              "function_id": 1994061678,
              "function_name": "n1ql",
              "gocb_creds_request_counter": 88,
              "internal_vb_distribution_stats": {
                  "worker_n1ql_0": "[683-1023]"
              },
              "latency_percentile_stats": {
                  "100": 107900,
                  "50": 3400,
                  "80": 4700,
                  "90": 6000,
                  "95": 7600,
                  "99": 13800
              },
              "lcb_creds_request_counter": 26866,
              "lcb_exception_stats": {
                  "12009": 9,
                  "209": 9
              },
              "metastore_stats": {},
              "planner_stats": [
                  {
                      "host_name": "172.23.122.117:8096",
                      "start_vb": 0,
                      "vb_count": 342
                  },
                  {
                      "host_name": "172.23.122.82:8096",
                      "start_vb": 342,
                      "vb_count": 341
                  },
                  {
                      "host_name": "172.23.122.97:8096",
                      "start_vb": 683,
                      "vb_count": 341
                  }
              ],
              "vb_distribution_stats_from_metadata": {
                  "172.23.122.117:8096": {
                      "worker_n1ql_0": "[0-341]"
                  },
                  "172.23.122.82:8096": {
                      "worker_n1ql_0": "[342-682]"
                  },
                  "172.23.122.97:8096": {
                      "worker_n1ql_0": "[683-1023]"
                  }
              },
              "worker_pids": {
                  "worker_n1ql_0": 50026
              }
          } 

      ./testrunner -i /tmp/testexec.120861.ini -p get-cbcollect-info=True,GROUP=bucket_op,skip_log_scan=False,bucket_storage=couchstore -t eventing.eventing_rebalance_collection.EventingRebalanceCollection.test_kv_swap_rebalance_when_existing_eventing_node_is_processing_mutations,nodes_init=6,services_init=kv-kv-eventing-eventing-eventing-index:n1ql,dataset=default,groups=simple,reset_services=True,doc-per-day=20,GROUP=bucket_op;bucket_op_windows,java_sdk_client=True
       
      Exception: missing data in destination bucket. Current : 40319 Expected : 40320  dcp_backlog : 0  TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.122.117': {}, '172.23.122.90': {}, '172.23.122.97': {}} 

      Attachments

        1. 172.23.104.97-20210227-1719-diag.zip
          23.83 MB
        2. 172.23.105.238-20210227-1719-diag.zip
          19.47 MB
        3. 172.23.105.240-20210227-1719-diag.zip
          20.54 MB
        4. 172.23.105.247-20210227-1719-diag.zip
          22.99 MB
        5. 172.23.105.249-20210227-1719-diag.zip
          16.33 MB
        6. 172.23.106.1-20210227-1719-diag.zip
          17.66 MB
        7. 172.23.107.65-20210227-1719-diag.zip
          15.88 MB
        8. 172.23.121.213-20210227-1719-diag.zip
          18.96 MB
        9. 172.23.122.116-20210212-2218-diag.zip
          6.24 MB
        10. 172.23.122.117-20210212-2218-diag.zip
          12.01 MB
        11. 172.23.122.118-20210212-2218-diag.zip
          17.49 MB
        12. 172.23.122.156-20210217-0043-diag.zip
          25.08 MB
        13. 172.23.122.158-20210217-0043-diag.zip
          23.10 MB
        14. 172.23.122.159-20210217-0043-diag.zip
          20.12 MB
        15. 172.23.122.160-20210217-0043-diag.zip
          14.57 MB
        16. 172.23.122.161-20210217-0043-diag.zip
          23.98 MB
        17. 172.23.122.82-20210212-2218-diag.zip
          24.42 MB
        18. 172.23.122.90-20210212-2218-diag.zip
          34.40 MB
        19. 172.23.122.91-20210217-0043-diag.zip
          32.03 MB
        20. 172.23.122.92-20210217-0043-diag.zip
          18.98 MB
        21. 172.23.122.93-20210217-0043-diag.zip
          20.00 MB
        22. 172.23.122.97-20210212-2218-diag.zip
          41.19 MB
        23. 172.23.123.61-20210212-2218-diag.zip
          26.76 MB
        24. 172.23.123.62-20210212-2218-diag.zip
          28.60 MB
        25. image-2021-03-16-09-03-02-836.png
          133 kB
        26. screenshot-1.png
          18 kB
        27. screenshot-2.png
          42 kB
        28. screenshot-3.png
          19 kB
        29. test_2_4554.log
          259 kB
        30. test_4.log.txt
          408 kB
        31. test_5.log
          412 kB
        32. test_5.log.txt
          413 kB
        33. test.log
          412 kB

        Activity

          marco.greco Marco Greco added a comment -

          On node 172.23.122.117 you have lots of messages along the lines of

          _time=2021-02-12T20:31:36.290-08:00 _level=ERROR _msg=Failed to perform INSERT on key <ud>doc_1267</ud> for Keyspace default:dst_bucket.scope_1.coll_4. Error - <ud>Duplicate Key doc_1267</ud>

          suggesting that either this wasn't a run against a blank slate, or previous OnDelete() executions didn't go through, although I find no evidence of deletes failing.
          I'd try again, making sure you don't have any data left behind.


          vikas.chaudhary Vikas Chaudhary added a comment -

          Jeelan Poola Looks like eventing is internally retrying the query.

          jeelan.poola Jeelan Poola added a comment - - edited

          Vikas Chaudhary Eventing retries only if we get TMPFAIL from the SDK. But the issue here is the presence of docs in dst_bucket. Does the test ensure an empty dst_bucket before starting? Could you please rerun the test with build couchbase-server-7.0.0-4475 or later?


          vikas.chaudhary Vikas Chaudhary added a comment -

          Jeelan Poola we create the bucket at the start of every test. Still seeing this on 7.0.0-4476.

          Node 172.23.122.156

           {
                  "dcp_feed_boundary": "everything",
                  "event_processing_stats": {
                      "adhoc_timer_response_received": 3,
                      "agg_messages_sent_to_worker": 18387,
                      "agg_queue_memory": 0,
                      "agg_queue_memory_cap": 20971520,
                      "agg_queue_size": 0,
                      "agg_queue_size_cap": 100000,
                      "agg_timer_feedback_queue_cap": 500,
                      "dcp_mutation": 67159,
                      "dcp_mutation_sent_to_worker": 13499,
                      "dcp_snapshot": 40847,
                      "dcp_stream_req_counter": 7450,
                      "dcp_stream_req_err_counter": 6515,
                      "dcp_streamend": 351,
                      "dcp_streamreq": 935,
                      "dcp_system_event": 3420,
                      "execution_stats": 769,
                      "failure_stats": 769,
                      "latency_stats": 769,
                      "lcb_exception_stats": 769,
                      "log_level": 1,
                      "num_processed_events": 13499,
                      "processed_events_size": 12082884,
                      "thr_count": 1,
                      "thr_map": 1,
                      "v8_init": 1,
                      "v8_load": 1
                  },
                  "events_remaining": {
                      "dcp_backlog": 0
                  },
                  "execution_stats": {
                      "agg_queue_memory": 0,
                      "agg_queue_size": 0,
                      "curl": {
                          "delete": 0,
                          "get": 0,
                          "head": 0,
                          "post": 0,
                          "put": 0
                      },
                      "curl_success_count": 0,
                      "dcp_delete_msg_counter": 0,
                      "dcp_delete_parse_failure": 0,
                      "dcp_mutation_msg_counter": 13499,
                      "dcp_mutation_parse_failure": 0,
                      "enqueued_dcp_delete_msg_counter": 0,
                      "enqueued_dcp_mutation_msg_counter": 13499,
                      "enqueued_timer_msg_counter": 0,
                      "feedback_queue_size": 0,
                      "filtered_dcp_delete_counter": 0,
                      "filtered_dcp_mutation_counter": 0,
                      "lcb_retry_failure": 0,
                      "messages_parsed": 18383,
                      "num_processed_events": 13499,
                      "on_delete_failure": 0,
                      "on_delete_success": 0,
                      "on_update_failure": 4,
                      "on_update_success": 13495,
                      "processed_events_size": 12082884,
                      "timer_callback_failure": 0,
                      "timer_callback_success": 0,
                      "timer_cancel_counter": 0,
                      "timer_create_counter": 0,
                      "timer_create_failure": 0,
                      "timer_msg_counter": 0,
                      "timer_responses_sent": 0,
                      "timestamp": {
                          "104867": "2021-02-17T08:39:52Z"
                      },
                      "uv_msg_parse_failure": 0,
                      "uv_try_write_failure_counter": 0
                  },
                  "failure_stats": {
                      "app_worker_setting_events_lost": 0,
                      "bkt_ops_cas_mismatch_count": 0,
                      "bucket_op_exception_count": 0,
                      "checkpoint_failure_count": 0,
                      "curl_failure_count": 0,
                      "curl_max_resp_size_exceeded": 0,
                      "curl_non_200_response": 0,
                      "curl_timeout_count": 0,
                      "dcp_events_lost": 0,
                      "debugger_events_lost": 0,
                      "delete_events_lost": 0,
                      "mutation_events_lost": 0,
                      "n1ql_op_exception_count": 4,
                      "timeout_count": 0,
                      "timer_callback_missing_counter": 0,
                      "timer_context_size_exceeded_counter": 0,
                      "timer_events_lost": 0,
                      "timestamp": {
                          "104867": "2021-02-17T08:39:52Z"
                      },
                      "v8worker_events_lost": 0
                  },
                  "function_id": 22036014,
                  "function_name": "n1ql",
                  "gocb_creds_request_counter": 105,
                  "internal_vb_distribution_stats": {
                      "worker_n1ql_0": "[0-341]"
                  },
                  "latency_percentile_stats": {
                      "100": 125600,
                      "50": 3400,
                      "80": 4700,
                      "90": 5700,
                      "95": 7100,
                      "99": 12200
                  },
                  "lcb_creds_request_counter": 27077,
                  "lcb_exception_stats": {
                      "12009": 4,
                      "209": 4
                  },
                  "metastore_stats": {},
                  "planner_stats": [
                      {
                          "host_name": "172.23.122.156:8096",
                          "start_vb": 0,
                          "vb_count": 342
                      },
                      {
                          "host_name": "172.23.122.161:8096",
                          "start_vb": 342,
                          "vb_count": 341
                      },
                      {
                          "host_name": "172.23.122.92:8096",
                          "start_vb": 683,
                          "vb_count": 341
                      }
                  ],
                  "vb_distribution_stats_from_metadata": {
                      "172.23.122.156:8096": {
                          "worker_n1ql_0": "[0-341]"
                      },
                      "172.23.122.161:8096": {
                          "worker_n1ql_0": "[342-682]"
                      },
                      "172.23.122.92:8096": {
                          "worker_n1ql_0": "[683-1023]"
                      }
                  },
                  "worker_pids": {
                      "worker_n1ql_0": 104867
                  }
              }

           


          vinayaka.kamath Vinayaka Kamath (Inactive) added a comment - - edited

          4 N1QL exceptions seen on Eventing node 172.23.122.156 - worker 0

          2021-02-17T00:27:41.723-08:00 [Info] eventing-consumer [worker_n1ql_0:/tmp/127.0.0.1:8091_0_22036014.sock:104867] {  "message": "SDK error : LCB_ERR_CAS_MISMATCH (209) Query error : {\n\"requestID\": \"7c0820b5-282e-4ef7-b664-28877867022d\",\n\"clientContextID\": \"4@n1ql.js(OnUpdate)\",\n\"signature\": null,\n\"results\": [\n],\n\"errors\": [{\"code\":12009,\"msg\":\"DML Error, possible causes include CAS mismatch or concurrent modificationFailed to perform INSERT - cause: Key doc_36144 - cause: unable to complete action after 6 attemps\"}],\n\"status\": \"errors\",\n\"metrics\": {\"elapsedTime\": \"53.812153ms\",\"executionTime\": \"53.786117ms\",\"resultCount\": 0,\"resultSize\": 0,\"serviceLoad\": 25,\"errorCount\": 1}\n}\n",  "stack": "Error\n    at OnUpdate (n1ql.js:4:17)"} Location: n1ql.js:4 Code: var query = N1QL('INSERT INTO dst_bucket.scope_1.coll_4 ( KEY, VALUE ) VALUES ( $docId ,\'Hello World\');', {'$docId':docId}, { 'consistency': 'none' }); Stack: Error    at OnUpdate (n1ql.js:4:17)
          

          At the same time, the following errors are seen in the Query log on 172.23.122.160:

          _time=2021-02-17T00:27:41.722-08:00 _level=ERROR _msg=Failed to perform INSERT on key <ud>doc_36144</ud> for Keyspace default:dst_bucket.scope_1.coll_4. Error - <ud>Key doc_36144 - cause: unable to complete action after 6 attemps</ud> 
          _time=2021-02-17T00:27:43.356-08:00 _level=ERROR _msg=Failed to perform INSERT on key <ud>doc_38129</ud> for Keyspace default:dst_bucket.scope_1.coll_4. Error - <ud>Key doc_38129 - cause: unable to complete action after 6 attemps</ud> 
          _time=2021-02-17T00:27:43.565-08:00 _level=ERROR _msg=Failed to perform INSERT on key <ud>doc_39216</ud> for Keyspace default:dst_bucket.scope_1.coll_4. Error - <ud>Key doc_39216 - cause: unable to complete action after 6 attemps</ud> 
          _time=2021-02-17T00:27:44.106-08:00 _level=ERROR _msg=Failed to perform INSERT on key <ud>doc_39148</ud> for Keyspace default:dst_bucket.scope_1.coll_4. Error - <ud>Key doc_39148 - cause: unable to complete action after 6 attemps</ud> 
          

          It seems like this could have been caused by something other than a CAS mismatch in the second run (the message differs from the one Marco mentioned). There are no errors in the memcached logs during 2021-02-17T00:27:XX. Need to investigate further. Cc Marco Greco

          jeelan.poola Jeelan Poola added a comment -

          Marco Greco Could you please look into the failures (after 6 retries) in the query logs in the latest repro? This is a standard test and eventing does not do anything specific here except fire queries through lcb. Please assign it back to eventing if some aspect needs to be analysed. Thank you!

          marco.greco Marco Greco added a comment -

          The 4 connection errors after 6 retries can come out of one of the following:

          • NOT_MY_VBUCKET
          • NOT_SUPPORTED
          • ENOMEM
          • TMPFAIL
          • connection reset
          • connection refused
          • broken pipe

          So I guess the question is - was something happening at the time like nodes being rebalanced in / out, or...?
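
          For illustration only, a minimal Go sketch of the retry classification described in the list above (the type, constant and function names are assumptions made for this sketch, not the actual go-couchbase code):

          package main

          import (
              "errors"
              "fmt"
              "syscall"
          )

          // Status stands in for the memcached response status carried by a KV error.
          // The values here are placeholders, not the real wire codes.
          type Status int

          const (
              StatusNotMyVBucket Status = iota
              StatusNotSupported
              StatusEnomem
              StatusTmpFail
              StatusOther
          )

          type KVError struct{ Status Status }

          func (e *KVError) Error() string { return fmt.Sprintf("kv status %d", e.Status) }

          // isRetryable mirrors the list above: NOT_MY_VBUCKET, NOT_SUPPORTED, ENOMEM,
          // TMPFAIL, plus connection reset, connection refused and broken pipe.
          func isRetryable(err error) bool {
              var kv *KVError
              if errors.As(err, &kv) {
                  switch kv.Status {
                  case StatusNotMyVBucket, StatusNotSupported, StatusEnomem, StatusTmpFail:
                      return true
                  }
                  return false
              }
              return errors.Is(err, syscall.ECONNRESET) ||
                  errors.Is(err, syscall.ECONNREFUSED) ||
                  errors.Is(err, syscall.EPIPE)
          }

          func main() {
              fmt.Println(isRetryable(&KVError{Status: StatusNotMyVBucket})) // true
              fmt.Println(isRetryable(errors.New("unrelated error")))        // false
          }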


          vikas.chaudhary Vikas Chaudhary added a comment -

          Ankit Prabhu can you look into these? How would a rebalance in/out cause these failures?

          jeelan.poola Jeelan Poola added a comment - - edited

          Marco Greco The test is about KV rebalance-in (test_kv_rebalance_in_when_existing_eventing_node_is_processing_mutations). Perhaps it's a good idea to check with the KV team on the conditions you listed. Cc Daniel Owen


          vikas.chaudhary Vikas Chaudhary added a comment -

          Marco Greco Here are the test steps:

          • Create a 6 node cluster: kv-kv-eventing-eventing-eventing-index:n1ql
          • Deploy 5 handlers (bucket op, n1ql, curl, sbm, timers)
          • Load data into the source buckets
          • While the load is going on, rebalance in 1 KV node / rebalance out 1 KV node / swap 1 KV node
          • Once the rebalance completes, check for the documents in the destination bucket - this is where the n1ql mutations came up short

          function OnUpdate(doc, meta) {
              var docId = meta.id;
              var query = INSERT INTO dst_bucket.scope_1.coll_4 ( KEY, VALUE ) VALUES ( $docId, 'Hello World' );
          }
          function OnDelete(meta, options) {
              var id = meta.id;
              delete from dst_bucket.scope_1.coll_4 where META().id = $id;
          }

          marco.greco Marco Greco added a comment - - edited

          Guys - I am not entirely sure what you are expecting here: during a rebalance, mutations may fail. go-couchbase will retry them, but it can't retry forever: the current limit in this setup is 6 attempts.
          I will add some more diagnostics and a bucket refresh on specific errors, but the real question is why KV keeps failing across so many retries.
          In particular, this code has not changed much since version 4, and the only changes that have occurred were to add more retry scenarios for specific corner cases. If this used to work and now it doesn't, it seems to me that the KV behaviour has changed.
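
          As a rough illustration of the behaviour described above (runWithRetry, refresh and isRetryable are made-up names for this sketch; the real go-couchbase code differs), a capped retry loop that refreshes the bucket, and hence the vbmap, before each retry:

          package main

          import (
              "errors"
              "fmt"
          )

          const maxAttempts = 6 // the retry cap mentioned above

          // runWithRetry retries do up to maxAttempts times. refresh is whatever
          // re-fetches the bucket config / vbmap; isRetryable decides which errors
          // are worth another attempt.
          func runWithRetry(do func() error, refresh func() error, isRetryable func(error) bool) error {
              var lastErr error
              for attempt := 1; attempt <= maxAttempts; attempt++ {
                  lastErr = do()
                  if lastErr == nil {
                      return nil
                  }
                  if !isRetryable(lastErr) {
                      return lastErr
                  }
                  // Pick up a fresh vbmap before the next attempt.
                  if err := refresh(); err != nil {
                      return err
                  }
              }
              return fmt.Errorf("unable to complete action after %d attempts: %w", maxAttempts, lastErr)
          }

          func main() {
              attempts := 0
              err := runWithRetry(
                  func() error { attempts++; return errors.New("NOT_MY_VBUCKET") }, // always fails
                  func() error { return nil },                                      // pretend bucket refresh
                  func(error) bool { return true },                                 // treat everything as retryable
              )
              fmt.Println(attempts, err)
          }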

          Donald.haggart Donald Haggart added a comment - - edited

          On node 160, ns_server.query.log:

          _time=2021-02-17T00:27:41.722-08:00 _level=ERROR _msg=Failed to perform INSERT on key <ud>doc_36144</ud> for Keyspace default:dst_bucket.scope_1.coll_4. Error - <ud>Key doc_36144 - cause: unable to complete action after 6 attemps</ud>

          On node 159, memcached.log:

          2021-02-17T00:27:40.614933-08:00 INFO (dst_bucket) collections: vb:169 create scope:id:0x9 name:scope_1,replica:true, seqno:2, manifest:0x2, force:false

          2021-02-17T00:27:40.615119-08:00 INFO (dst_bucket) collections: vb:169 create collection:id:0xc, name:coll_4 to scope:0x9, maxTTL:false 0, replica:true, seqno:10, manifest:0xa, force:false

          ...

          2021-02-17T00:28:15.080642-08:00 INFO (src_bucket) collections: vb:0 create collection:id:0xc, name:coll_4 to scope:0x9, maxTTL:false 0, replica:true, seqno:10, manifest:0xa, force:false

           

          So it's entirely possible the vbucket holding "doc_36144" didn't yet have the collection at the time of the N1QL statement.

          Possibly simply timing?

          marco.greco Marco Greco added a comment -

          Actually, Donald has a point here - if you are using collections and creating collections as part of the test, bear in mind that there is a three-process race condition here, whose actors are n1ql, ns_server and KV: the fact that a collection has been created via n1ql (and therefore n1ql knows about it) does not mean that the same collection is yet visible to KV. That only happens once ns_server manages to alert KV of the fact, so in the interim any gets/puts against the collection will fail.
          This is particularly relevant here, because from the logs, 159 is alerted of dst_bucket.scope_1.coll_4 existing at 00:27:40.9:

          2021-02-17T00:27:40.954005-08:00 INFO (dst_bucket) collections: vb:160 create collection:id:0xc, name:coll_4 to scope:0x9, maxTTL:false 0, replica:true, seqno:10, manifest:0xa, force:false

          and query on node 160 fails to insert less than a second later: it is quite possible that the clocks on the two nodes are not synchronised, that 160 is slightly ahead, and that it is in fact performing the insert before 159 has been made aware of coll_4.

          At any rate, I have slightly tightened the retry code and made it report the error on which n1ql fails after retrying repeatedly.
          Please rerun after I merge.

          ritam.sharma Ritam Sharma added a comment -

          Please reassign after you merge.

          marco.greco Marco Greco added a comment -

          Please don't forget to add a delay between creating collections and starting the test.
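
          Rather than a fixed sleep, one option for the test is to poll the bucket's collection manifest until the new collection shows up. A sketch only: it assumes the manifest is readable via GET /pools/default/buckets/<bucket>/scopes (the same family of endpoints the test's rest_client uses) and assumes the response shape shown; note it reflects ns_server's view, so KV propagation may still lag slightly behind it.

          package main

          import (
              "encoding/json"
              "fmt"
              "net/http"
              "time"
          )

          // manifest models just the fields we need from the scopes endpoint response.
          type manifest struct {
              Scopes []struct {
                  Name        string `json:"name"`
                  Collections []struct {
                      Name string `json:"name"`
                  } `json:"collections"`
              } `json:"scopes"`
          }

          func hasCollection(m manifest, scope, coll string) bool {
              for _, s := range m.Scopes {
                  if s.Name != scope {
                      continue
                  }
                  for _, c := range s.Collections {
                      if c.Name == coll {
                          return true
                      }
                  }
              }
              return false
          }

          // waitForCollection polls until scope.coll appears in the bucket manifest
          // or the timeout expires.
          func waitForCollection(base, user, pass, bucket, scope, coll string, timeout time.Duration) error {
              url := fmt.Sprintf("%s/pools/default/buckets/%s/scopes", base, bucket)
              deadline := time.Now().Add(timeout)
              for time.Now().Before(deadline) {
                  req, _ := http.NewRequest(http.MethodGet, url, nil)
                  req.SetBasicAuth(user, pass)
                  if resp, err := http.DefaultClient.Do(req); err == nil {
                      var m manifest
                      decodeErr := json.NewDecoder(resp.Body).Decode(&m)
                      resp.Body.Close()
                      if decodeErr == nil && hasCollection(m, scope, coll) {
                          return nil
                      }
                  }
                  time.Sleep(2 * time.Second)
              }
              return fmt.Errorf("collection %s.%s not visible within %s", scope, coll, timeout)
          }

          func main() {
              // Values borrowed from the test logs in this ticket.
              err := waitForCollection("http://172.23.122.156:8091", "Administrator", "password",
                  "dst_bucket", "scope_1", "coll_4", 2*time.Minute)
              fmt.Println(err)
          }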

          jeelan.poola Jeelan Poola added a comment -

          Marco Greco It would also be great if the error could explicitly mention that the keyspace does not exist, if possible (instead of a CAS mismatch, which is a bit misleading). Thank you!


          vikas.chaudhary Vikas Chaudhary added a comment -

          Marco Greco Donald Haggart Here is the timeline of events. The time difference between collection creation and the start of data loading (eventing will not start processing mutations before the load) is ~2 min. The rebalance then started at 2021-02-17 00:27:40, the same time the query failed.

          Collection creation
          [2021-02-17 00:25:20,345] - [rest_client] [139626970871552] - INFO - http://172.23.122.156:8091/pools/default/buckets/dst_bucket/scopes/scope_1/collections with params: name=coll_4
          [2021-02-17 00:25:20,345] - [rest_client] [139626970871552] - INFO - Collection created dst_bucket->scope_1->coll_4 manifest:{'uid': 'a'}
           
           
          Handler creation and deployment
          [2021-02-17 00:25:26,634] - [eventing_base] [139626970871552] - INFO - saving function n1ql
          [2021-02-17 00:27:08,907] - [basetestcase] [139626970871552] - INFO - sleep for 5 secs. Waiting for n1ql to deployed... ...
           
           
          Load started
          [2021-02-17 00:27:14,843] - [task] [139626962478848] - INFO - java -jar java_sdk_client/collections/target/javaclient/javaclient.jar -i 172.23.122.91 -u Administrator -p password -b src_bucket -s _default -c _default -n 40320 -pc 100 -pu 0 -pd 0 -l uniform -dsn 1 -dpx doc_ -dt Person -de 0 -ds 500 -ac False -st 0 -en 40319 -o False -sd False
          [2021-02-17 00:27:16,979] - [rest_client] [139626970871552] - INFO - rebalance reached >100% in 3.005150318145752 seconds 
          [2021-02-17 00:27:27,402] - [task] [139626962478848] - INFO - java -jar java_sdk_client/collections/target/javaclient/javaclient.jar -i 172.23.122.91 -u Administrator -p password -b src_bucket -s scope_1 -c coll_1 -n 40320 -pc 100 -pu 0 -pd 0 -l uniform -dsn 1 -dpx doc_ -dt Person -de 0 -ds 500 -ac False -st 0 -en 40319 -o False -sd False
           
           
          Rebalance started
          [2021-02-17 00:27:40,082] - [rest_client] [139626962478848] - INFO - rebalance params : {'knownNodes': 'ns_1@172.23.122.156,ns_1@172.23.122.158,ns_1@172.23.122.159,ns_1@172.23.122.160,ns_1@172.23.122.161,ns_1@172.23.122.91,ns_1@172.23.122.92', 'ejectedNodes': 'ns_1@172.23.122.158', 'user': 'Administrator', 'password': 'password'}
          [2021-02-17 00:27:40,111] - [rest_client] [139626962478848] - INFO - rebalance operation started
          [2021-02-17 00:27:40,114] - [rest_client] [139626962478848] - INFO - rebalance percentage : 0.00 %
          [2021-02-17 00:27:40,114] - [task] [139626962478848] - INFO - Rebalance - status: running, progress: 0.00%
          [2021-02-17 00:27:50,141] - [rest_client] [139626962478848] - INFO - rebalance percentage : 14.00 %
          [2021-02-17 00:27:50,141] - [task] [139626962478848] - INFO - Rebalance - status: running, progress: 14.00%

          marco.greco Marco Greco added a comment -

          Jeelan Poola the error mentioning CAS is generated by the SDK - the error coming from N1QL is just "could not perform insert after x attempts".
          I do agree it's misleading, though.

          jeelan.poola Jeelan Poola added a comment -

          Cc Matt Ingenthron Sergey Avseyev for help with error improvements in libcouchbase.

          marco.greco Marco Greco added a comment -

          Vikas Chaudhary the point still remains that at node 159, KV only becomes aware of the collection at 00:27:40

          2021-02-17T00:27:40.954005-08:00 INFO (dst_bucket) collections: vb:160 create collection:id:0xc, name:coll_4 to scope:0x9, maxTTL:false 0, replica:true, seqno:10, manifest:0xa, force:false


          vikas.chaudhary Vikas Chaudhary added a comment -

          Marco Greco Then we should also see failures for the normal bucket-op handler on 159, but all the data is there in dst_bucket.scope_1.coll_0 and dst_bucket.scope_1.coll_3, and we are not seeing any "lcb_retry_failure" in eventing for bucket ops. We definitely need someone from KV to look into this behaviour. Thank you for all the analysis, Marco.

          Daniel Owen Can you please help with why vb:160 on 159 is not active during the rebalance, given that the time between collection creation and the rebalance is ~2 min?

          marco.greco Marco Greco added a comment -

          Vikas Chaudhary let's wait for the next run with a definite error from KV before Daniel Owen gets involved.

          vikas.chaudhary Vikas Chaudhary added a comment -

          Marco Greco Sure.

          build-team Couchbase Build Team added a comment -

          Build couchbase-server-7.0.0-4536 contains go-couchbase commit 5740cd3 with commit message:
          MB-44328 tighten op retry, report error cause, go fmt


          mihir.kamdar Mihir Kamdar (Inactive) added a comment -

          Vikas Chaudhary please share the results from a newer run, preferably with a build > 4536. cc Kamini Jagtiani

          ritam.sharma Ritam Sharma added a comment -

          Lynn Straus - Removing the must-pass label for now. There are a few test cases failing, but that is okay for the beta refresh.


          vikas.chaudhary Vikas Chaudhary added a comment -

          Marco Greco Still seeing the same with the latest run on 7.0.0-4554.

          ./testrunner -i /tmp/testexec.91066.ini -p get-cbcollect-info=True,GROUP=bucket_op,skip_log_scan=False,bucket_storage=couchstore -t eventing.eventing_rebalance_collection.EventingRebalanceCollection.test_eventing_rebalance_with_multiple_kv_nodes,doc-per-day=20,dataset=default,nodes_init=5,services_init=kv-kv-kv-eventing-index:n1ql,groups=simple,reset_services=True,GROUP=bucket_op,java_sdk_client=True,get-cbcollect-info=True,skip_log_scan=False,bucket_storage=couchstor
           
          	Exception: missing data in destination bucket. Current : 40316 Expected : 40320  dcp_backlog : 0  TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.105.247': {}}
           
          "lcb_exception_stats": {
                      "12009": 4,
                      "209": 4
                  } 

          marco.greco Marco Greco added a comment -

          The failures are all down to KV responding NOT_MY_VBUCKET - this is kind of expected during a rebalance.
          We've retried 6 times - every time refreshing the bucket in order to get a new vbmap - so the question to be answered by ns_server is why the vbmap is not updated.
          N1QL is just a victim here.
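
          For the ns_server investigation, one quick way to see which node a given vbucket maps to at a point during the rebalance is to read the terse bucket config. A sketch only: the endpoint path and JSON field names are stated to the best of my knowledge rather than taken from this ticket, so verify them against your cluster version.

          package main

          import (
              "encoding/json"
              "fmt"
              "net/http"
          )

          // bucketConfig models only the vbmap portion of the terse bucket config.
          type bucketConfig struct {
              VBucketServerMap struct {
                  ServerList []string `json:"serverList"`
                  VBucketMap [][]int  `json:"vBucketMap"`
              } `json:"vBucketServerMap"`
          }

          // activeNodeFor returns the server currently listed as active for vbucket vb.
          func activeNodeFor(base, user, pass, bucket string, vb int) (string, error) {
              req, _ := http.NewRequest(http.MethodGet, base+"/pools/default/b/"+bucket, nil)
              req.SetBasicAuth(user, pass)
              resp, err := http.DefaultClient.Do(req)
              if err != nil {
                  return "", err
              }
              defer resp.Body.Close()
              var cfg bucketConfig
              if err := json.NewDecoder(resp.Body).Decode(&cfg); err != nil {
                  return "", err
              }
              vbMap := cfg.VBucketServerMap.VBucketMap
              if vb < 0 || vb >= len(vbMap) {
                  return "", fmt.Errorf("vbucket %d out of range", vb)
              }
              active := vbMap[vb][0] // first entry is the active copy
              if active < 0 {
                  return "", fmt.Errorf("vbucket %d has no active node right now", vb)
              }
              return cfg.VBucketServerMap.ServerList[active], nil
          }

          func main() {
              // vb:160 is one of the vbuckets discussed above; credentials are from the test logs.
              node, err := activeNodeFor("http://172.23.122.156:8091", "Administrator", "password", "dst_bucket", 160)
              fmt.Println(node, err)
          }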


          vikas.chaudhary Vikas Chaudhary added a comment -

          Dave Finlay can someone from the ns_server team investigate why the vbmap is not getting updated?


          build-team Couchbase Build Team added a comment -

          Build couchbase-server-7.0.0-4571 contains go-couchbase commit 5537227 with commit message:
          MB-44328, only give response Status in case of memcached errors

          dfinlay Dave Finlay added a comment -

          Marco Greco: Would you mind pointing to the logs that indicate you are getting an NMVB from KV? (There are a lot of logs attached to the ticket - can you indicate which cbcollect you are looking at?)


          Donald.haggart Donald Haggart added a comment -

          Dave Finlay, I believe cbcollect_info_ns_1@172.23.106.1_20210228-011950.zip is the one to focus on for the query-received errors in the last test.

          cbcollect_info_ns_1@172.23.106.1_20210228-011950/ns_server.query.log, lines 953-956.

          At 2021-02-27T17:03:24.507 the errors are received (noting 10 attempts); we can see vbucket movement messages at the same time in e.g. cbcollect_info_ns_1@172.23.104.97_20210228-011951.zip.

          HTH.

          dfinlay Dave Finlay added a comment -

          Hi Donald - I don't see such a log attached to this ticket. I see this one: https://issues.couchbase.com/secure/attachment/129055/172.23.106.1-20210227-1719-diag.zip, but not the one you mention.


          Donald.haggart Donald Haggart added a comment -

          Hello Dave Finlay,

          Sorry - that is the one. 172.23.106.1-20210227-1719-diag.zip contains cbcollect_info_ns_1@172.23.106.1_20210228-011950; I back-formed from my extracted data incorrectly. (I had just downloaded all 8 most recent files and extracted them all.)


          build-team Couchbase Build Team added a comment -

          Build couchbase-server-7.0.0-4603 contains go-couchbase commit 5537227 with commit message:
          MB-44328, only give response Status in case of memcached errors

          dfinlay Dave Finlay added a comment - - edited

          Hi Donald Haggart, Marco Greco

          Earlier on the ticket Marco said:

          The failures are all down to KV responding NOT_MY_VBUCKET - this is kind of expected during a rebalance.
          We've retried 6 times, refreshing the bucket each time in order to get a new vbmap, so the question for ns_server to answer is why the vbmap is not being updated.

          Can we get a log trace copied into the ticket that indicates that query has received an NMVB - it would be great if it identifies the problem vbucket. This will allow us to begin the investigation, correlate the trace with what's going on in the rebalance at that time, and reply to Marco's question.

          marco.greco Marco Greco added a comment -

          In 172.23.106.1-20210227-1719-diag.zip/cbcollect_info_ns_1@172.23.106.1_20210228-011950/ns_server.query.log we have four entries like this one, one after the other, one for each failed key:

          _time=2021-02-27T17:03:24.507-08:00 _level=ERROR _msg=Failed to perform INSERT on key <ud>doc_36457</ud> for Keyspace default:dst_bucket.scope_1.coll_4. Error - <ud>Key doc_36457 - cause: unable to complete action after 10 attemps: %!(EXTRA *gomemcached.MCResponse=MCResponse status=NOT_MY_VBUCKET, opcode=ADD, opaque=2, msg: {"rev":266,"name":"dst_bucket","nodeLocator":"vbucket","uuid":"c8bc84c2c026a8722d8b145c14cae999","uri":"/pools/default/buckets/dst_bucket?bucket_uuid=c8bc84c2c026a8722d8b145c14cae999","streamingUri":"/pools/default/bucketsStreaming/dst_bucket?bucket_uuid=c8bc84c2c026a8722d8b145c14cae999","bucketCapabilitiesVer":"","bucketCapabilities":["collections","durableWrite","tombstonedUserXAttrs","couchapi","subdoc.ReplaceBodyWithXattr","subdoc.DocumentMacroSupport","dcp","cbhello","touch","cccp","xdcrCheckpointing","nodesExt","xattr"],"collectionsManifestUid":"a","ddocs":{"uri":"/pools/default/buckets/dst_bucket/ddocs"},"vBucketServerMap":{"hashAlgorithm":"CRC","numReplicas":0,"serverList":["172.23.104.97:11210","172.23.105.238:11210","172.23.105.240:11210","172.23.107.65:11210","172.23.121.213:11210"],"vBucketMap":[[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],
[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4]
          ,[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3]],"vBucketMapForward":[[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2
],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[2],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[
          4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[4],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3],[3]]},"nodes":[{"couchApiBase":"http://172.23.104.97:8092/dst_bucket%2Bc8bc84c2c026a8722d8b145c14cae999","hostname":"172.23.104.97:8091","ports":{"direct":11210}},{"couchApiBase":"http://172.23.105.238:8092/dst_bucket%2Bc8bc84c2c026a8722d8b145c14cae999","hostname":"172.23.105.238:8091","ports":{"direct":11210}},{"couchApiBase":"http://172.23.105.240:8092/dst_bucket%2Bc8bc84c2c026a8722d8b145c14cae999","hostname":"172.23.105.240:8091","ports":{"direct":11210}},{"couchApiBase":"http://172.23.107.65:8092/dst_bucket%2Bc8bc84c2c026a8722d8b145c14cae999","hostname":"172.23.107.65:8091","ports":{"direct":11210}},{"couchApiBase":"http://172.23.121.213:8092/dst_bucket%2Bc8bc84c2c026a8722d8b145c14cae999","hostname":"172.23.121.213:8091","ports":{"direct":11210}}],"nodesExt":[{"services":{"capi":8092,"capiSSL":18092,"kv":11210,"kvSSL":11207,"mgmt":8091,"mgmtSSL":18091,"projector":9999},"hostname":"172.23.104.97"},{"services":{"capi":8092,"capiSSL":18092,"kv":11210,"kvSSL":11207,"mgmt":8091,"mgmtSSL":18091,"projector":9999},"hostname":"172.23.105.238"},{"services":{"capi":8092,"capiSSL":18092,"kv":11210,"kvSSL":11207,"mgmt":8091,"mgmtSSL":18091,"projector":9999},"hostname":"172.23.105.240"},{"services":{"capi":8092,"capiSSL":18092,"kv":11210,"kvSSL":11207,"mgmt":8091,"mgmtSSL":18091,"projector":9999},"hostname":"172.23.107.65"},{"services":{"capi":8092,"capiSSL":18092,"kv":11210,"kvSSL":11207,"mgmt":8091,"mgmtSSL":18091,"projector":9999},"thisNode":true,"hostname":"172.23.121.213"},{"services":{"eventingAdminPort":8096,"eventingDebug":9140,"eventingSSL":18096,"mgmt":8091,"mgmtSSL":18091},"hostname":"172.23.105.247"},{"services":{"indexAdmin":9100,"indexHttp":9102,"indexHttps":19102,"indexScan":9101,"indexStreamCatchup":9104,"indexStreamInit":9103,"indexStreamMaint":9105,"mgmt":8091,"mgmtSSL":18091,"n1ql":8093,"n1qlSSL":18093},"hostname":"172.23.106.1"}],"clusterCapabilitiesVer":[1,0],"clusterCapabilities":{"n1ql":["costBasedOptimizer","indexAdvisor","javaScriptFunctions","inlineFunctions","enhancedPreparedStatements"]}})</ud>
          

          Note that when we see a NOT_MY_VBUCKET, we refresh the bucket and get a new vbMap: in this case we've done it 10 times for each of the 4 failed keys (see go-couchbase/client.go).

          So ns_server must be providing us with the old vbMap.
          The last error is at _time=2021-02-27T17:03:27.991-08:00, so about 3 seconds after the first error, we finally get the correct vbMap.
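
          For illustration only, a minimal sketch of the retry pattern described above - refresh the bucket from ns_server on every NOT_MY_VBUCKET and retry immediately, with no backoff and no use of the forward map. The types and method names are assumptions, not the real go-couchbase API:

          package main

          import (
              "errors"
              "fmt"
          )

          // Hypothetical stand-ins for go-couchbase's Bucket and memcached response;
          // names and signatures are illustrative, not the real API.
          type mcResponse struct{ Status string }

          type bucket struct{ nodes int } // plus the current vbMap, omitted here

          // Refresh re-fetches the bucket config (including the vbMap) from ns_server.
          func (b *bucket) Refresh() {}

          // doAdd sends the ADD to whichever node the current vbMap points at.
          // In this sketch it always fails with NOT_MY_VBUCKET.
          func (b *bucket) doAdd(key string) (*mcResponse, error) {
              return &mcResponse{Status: "NOT_MY_VBUCKET"}, errors.New("NOT_MY_VBUCKET")
          }

          // addWithRetry mirrors the behaviour described above: up to 2 * number-of-nodes
          // attempts (10 on a 5-node server list), refreshing the vbMap after every NMVB,
          // with no backoff and no look at the forward map.
          func addWithRetry(b *bucket, key string) error {
              maxAttempts := 2 * b.nodes
              for i := 0; i < maxAttempts; i++ {
                  resp, err := b.doAdd(key)
                  if err == nil {
                      return nil
                  }
                  if resp != nil && resp.Status == "NOT_MY_VBUCKET" {
                      b.Refresh() // ask ns_server for a fresh vbMap and retry immediately
                      continue
                  }
                  return err
              }
              return fmt.Errorf("unable to complete action after %d attempts", maxAttempts)
          }

          func main() {
              fmt.Println(addWithRetry(&bucket{nodes: 5}, "doc_36457"))
          }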

          marco.greco Marco Greco added a comment -

          Dave Finlay attached.

          dfinlay Dave Finlay added a comment -

          Thanks Marco - we'll take a look.

          dfinlay Dave Finlay added a comment - - edited

          Marco Greco, Vikas Chaudhary: So, this is actually normal - I think the issue is that the go-couchbase client doesn't do a very good job of handling the NMVBs. Let's go through one of the examples.

          Rebalance begins:

          [user:info,2021-02-27T17:03:18.231-08:00,ns_1@172.23.104.97:<0.715.0>:ns_orchestrator:idle:776]Starting rebalance, KeepNodes = ['ns_1@172.23.104.97','ns_1@172.23.105.238',
                                           'ns_1@172.23.105.240','ns_1@172.23.105.247',
                                           'ns_1@172.23.106.1','ns_1@172.23.107.65',
                                           'ns_1@172.23.121.213'], EjectNodes = [], Failed over and being ejected nodes = []; no delta recovery nodes; Operation Id = 300d01a1ee7ac33e0999b6547d34ccf8
          

          The "dst_bucket" begins its rebalance and the plan for vbucket 996 is as follows:

          [user:info,2021-02-27T17:03:22.274-08:00,ns_1@172.23.104.97:<0.5238.1>:ns_vbucket_mover:init:130]Bucket "dst_bucket" rebalance does not seem to be swap rebalance
          [ns_server:debug,2021-02-27T17:03:22.290-08:00,ns_1@172.23.104.97:<0.5238.1>:vbucket_move_scheduler:prepare:228]The following moves are planned:
           {996,['ns_1@172.23.121.213'],['ns_1@172.23.107.65'],[]},
          

          That is, vbucket 996 will move from .213 to .65 as part of the rebalance. The move for vbucket 996 starts here:

          [rebalance:debug,2021-02-27T17:03:24.175-08:00,ns_1@172.23.104.97:<0.5238.1>:ns_single_vbucket_mover:spawn_mover:28]Spawned single vbucket mover: [<0.5238.1>,"dst_bucket",996,
                                         ['ns_1@172.23.121.213'],
                                         ['ns_1@172.23.107.65'],
                                         []] (<0.9547.1>)
          

          It proceeds rapidly (I guess there isn't a lot of data) and we get to the takeover point. On node .65 we see:

          [rebalance:debug,2021-02-27T17:03:24.278-08:00,ns_1@172.23.107.65:<0.18588.0>:dcp_consumer_conn:handle_cast:289]Partition 996 is about to change status to active
          

          Back on the orchestrator we then update the vbucket map:

          [ns_server:debug,2021-02-27T17:03:24.287-08:00,ns_1@172.23.104.97:<0.9809.1>:ns_single_vbucket_mover:update_vbucket_map:570]Updating vbucket map for bucket "dst_bucket", vbucket 996
          [ns_server:debug,2021-02-27T17:03:24.373-08:00,ns_1@172.23.104.97:<0.5238.1>:ns_vbucket_mover:map_sync:324]Batched 7 vbucket map syncs together.
          [ns_server:debug,2021-02-27T17:03:24.496-08:00,ns_1@172.23.104.97:<0.9809.1>:ns_single_vbucket_mover:update_vbucket_map:577]Updated vbucket map for bucket "dst_bucket", vbucket 996 in 208897 us
          

          Even though the vbucket map has been updated on a majority of nodes at this point, it may still take a few milliseconds for these updates to reach all of the nodes. This is what we see in this case. The error in query is:

          _time=2021-02-27T17:03:24.507-08:00 _level=ERROR _msg=Failed to perform INSERT on key <ud>doc_36457</ud> for Keyspace default:dst_bucket.scope_1.coll_4. Error - <ud>Key doc_36457 - cause: unable to complete action after 10 attemps: %!(EXTRA *gomemcached.MCResponse=MCResponse status=NOT_MY_VBUCKET, opcode=ADD, opaque=2, msg: {"rev":266,"name":"dst_bucket","nodeLocator":"vbucket","uuid":"c8bc84c2c026a8722d8b145c14cae999","uri":"/pools/default/buckets/dst_bucket?
          

          The key is doc_36457, which you can validate belongs to vbucket 996. This request has gone to node .213 - the old node. At this point the old node is no longer the master and correctly responds with an NMVB.
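
          For reference, a minimal sketch of that key-to-vbucket mapping - the standard CRC32 hashing Couchbase clients use (equivalent to go-couchbase's VBHash; the helper name here is illustrative):

          package main

          import (
              "fmt"
              "hash/crc32"
          )

          // vbucketForKey implements the standard Couchbase key hashing ("hashAlgorithm":"CRC"
          // in the vBucketServerMap above): CRC-32 (IEEE) of the key, take bits 16-30,
          // then mask by the number of vbuckets (a power of two, 1024 by default).
          func vbucketForKey(key string, numVBuckets uint32) uint32 {
              crc := crc32.ChecksumIEEE([]byte(key))
              return ((crc >> 16) & 0x7fff) & (numVBuckets - 1)
          }

          func main() {
              // Check which vbucket the failing key hashes to.
              fmt.Println(vbucketForKey("doc_36457", 1024))
          }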

          We can actually look at the response payload logged in query which includes the vbucket map. We can see that there are 5 servers:

          $ cat vbmap.json | jq .vBucketServerMap.serverList
          [
            "172.23.104.97:11210",
            "172.23.105.238:11210",
            "172.23.105.240:11210",
            "172.23.107.65:11210",
            "172.23.121.213:11210"
          ]
          

          And the current server that hosts 996 according to the vbucket map is "4", which is .213:

          $ cat vbmap.json | jq ".vBucketServerMap.vBucketMap[996]"
          [
            4
          ]
          

          We can look at the forward map too - it is also included in the vbucket map:

          $ cat vbmap.json | jq '.vBucketServerMap.vBucketMapForward[996]'
          [
            3
          ]
          

          Which tells us that the new home for the vbucket will be on .65.

          The fact that it took 200+ ms to update the vbucket map is, I think, something that can happen from time to time. The histogram of vbucket map update times from the 3 buckets being rebalanced in this run is as follows:

          This is in milliseconds and each bucket is 10 milliseconds wide. The orchestrator is busy, but not ridiculously so:

          There will be occasions when it'll be worse.

          Let's come now to the go-couchbase client. The client tries 2 * (number of nodes in the bucket server list) times, which in this case is 10 times, as Marco says. The issue is that there's no backoff - these 10 requests will probably execute in a handful of milliseconds, and with vbucket map updates taking 120 ms on average (with a long tail) there's a very good chance that all 10 tries will fail.

          The good news is that the vbucket forward map is present in the reply from memcached. If consulted, the go-couchbase client would see that it should try the server that will host the vbucket after it moves, and the request would have succeeded on the second try. This strategy is used by most if not all of the SDK clients. CC: Michael Nitschinger, Brett Lawson.

          Second, there may be occasions where there is no "forward map". In those instances it seems like it would be good to have some kind of backoff on retry when an NMVB is encountered. It would be good to hear from Michael or Brett on their strategies in general for dealing with NMVBs.

          In essence, if we are to try and reduce these kinds of NMVBs to a minimum we need to improve the go-couchbase client.
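
          As a rough illustration of the two suggestions above (consult vBucketMapForward when it is present, otherwise back off before retrying), here is a sketch under assumed types - not the actual go-couchbase implementation:

          package main

          import (
              "errors"
              "fmt"
              "time"
          )

          // Hypothetical stand-ins; field and method names are illustrative only.
          type mcResponse struct {
              Status      string
              ForwardNode int  // node index taken from vBucketMapForward, if present
              HasForward  bool // whether the NMVB payload carried a forward map entry
          }

          type bucket struct{}

          func (b *bucket) Refresh() {}
          func (b *bucket) sendToNode(node int, key string) (*mcResponse, error) {
              return nil, errors.New("not implemented in this sketch")
          }
          func (b *bucket) sendToCurrentMaster(vb int, key string) (*mcResponse, error) {
              return nil, errors.New("not implemented in this sketch")
          }

          // addWithForwardMap retries an ADD, preferring the node named by the forward
          // map in the NMVB response and backing off when no forward map is available.
          func addWithForwardMap(b *bucket, vb int, key string, maxAttempts int) error {
              backoff := 10 * time.Millisecond
              for i := 0; i < maxAttempts; i++ {
                  resp, err := b.sendToCurrentMaster(vb, key)
                  if err == nil {
                      return nil
                  }
                  if resp == nil || resp.Status != "NOT_MY_VBUCKET" {
                      return err
                  }
                  if resp.HasForward {
                      // The NMVB payload told us where the vbucket is moving to:
                      // go straight to that node instead of hammering the old master.
                      if _, err := b.sendToNode(resp.ForwardNode, key); err == nil {
                          return nil
                      }
                  }
                  // No forward map (or the forward node also failed): give the vbucket
                  // map a chance to propagate before the next attempt.
                  time.Sleep(backoff)
                  backoff *= 2
                  b.Refresh()
              }
              return fmt.Errorf("unable to complete action after %d attempts", maxAttempts)
          }

          func main() {
              fmt.Println(addWithForwardMap(&bucket{}, 996, "doc_36457", 10))
          }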

          marco.greco Marco Greco added a comment -

          We do have a plan to rewrite the n1ql datastore package to have primitives, and to rewrite the memcached primitives implementation to use the memcached response; in fact I have some POC code.
          This has been delayed to post-CC though.

          marco.greco Marco Greco added a comment -

          Vikas Chaudhary the state of affairs is that NMVBs may show up during rebalances.
          N1QL may handle them better in vNext, but it currently relies on ns_server to provide an updated map, and this is not always available.

          jeelan.poola Jeelan Poola added a comment -

          Marco Greco Could you please confirm whether the same problem exists in 6.6.x or whether the problem is new to 7.0? If the latter, this will be considered a regression and eventing customers upgrading to 7.x may run into it. And our important eventing customers use N1QL-in-eventing heavily. Thank you!

          marco.greco Marco Greco added a comment -

          The NMVB handling has been like that since 4.5 - it's got progressively better over the versions, but this is far from a regression.

          jeelan.poola Jeelan Poola added a comment -

          Vikas Chaudhary Could you please confirm whether the issue exists in 6.6.x/6.5.x? Could you please share cbcollects from 6.6.x/6.5.x runs? I understand it will be run with buckets and not collections, but that should not have any impact on rebalance/NMVB behaviour. Thank you!

          dfinlay Dave Finlay added a comment -

          Note: Even if fewer NMVBs show up in earlier releases, it doesn't change the fact that this is simply a race between a client performing gets and when that client receives an updated copy of the vbucket map, since the state changes in memcached first and the vbucket map is updated later. If we want this to always work during normal (i.e. non-failed, non-stopped) rebalances, the client is going to need to change in some fashion.


          vikas.chaudhary Vikas Chaudhary added a comment -

          Jeelan Poola We haven't seen this behaviour with buckets (6.6.x); I strongly believe we are going to have lots of CBSEs around this. A CAS mismatch on UPSERT will definitely raise many questions.

          marco.greco Marco Greco added a comment -

          Vikas Chaudhary the behaviour is understood and expected: N1QL relies on ns_server for the vbMap, and if this is not updated, N1QL will fail after a while.
          The only way the behaviour has changed between versions is that more checks and retry cases have been added.
          I can't tell you why you haven't seen it in 6.6, but it definitely is there.
          Also - N1QL does not return a "CAS mismatch" error, the SDK does, and I believe that the error text is misleading, since there is no CAS mismatch; part of the fix here is to have the SDKs change the error message.
          There is no plan to change how N1QL handles NMVB in Cheshire Cat: if you feel strongly that it should be changed now, then this should be agreed between QE and development management.


          vikas.chaudhary Vikas Chaudhary added a comment -

          Jeelan Poola Jon Strabala Marco Greco We should at least correct the error message so that customers are not confused by the behavior. Which team should own the error message fix?

          marco.greco Marco Greco added a comment -

          The message comes from libcouchbase, so the SDK team.

          dfinlay Dave Finlay added a comment - - edited

          I checked the vbucket map update times in the 6.6.x run - and they are shorter by a reasonable amount. I see an average vbucket map update time of about 73 ms as compared with 125 ms in 7.0. The histogram looks as follows:

          Also for comparison, the orchestrator CPU is a bit less busy than the 7.0 graph, but it's roughly similar.

          Again the width of each bucket is 10 ms. The difference in the average vbucket map update time is almost certainly due to the fact that chronicle needs to persist the configuration change. I don't believe there is much that we can do to optimize this sufficiently to gain back 50 ms on average - and in any case it's an unfair comparison: disk ops are going to be slower than in-memory ops.

          That said, even in 6.6 there are plenty of vbucket map update times in excess of 100 ms - and so if a query is received at the wrong time it will almost certainly result in an NMVB. It's slightly less likely to be hit, but the same race exists in both 6.6.x and 7.0.

          jeelan.poola Jeelan Poola added a comment - - edited

          Thanks a lot Dave Finlay for your insights on the increase in average vbucket map update time [1] in 7.0 due to chronicle!

          Marco Greco Given the above, is there anything that query can change in its retry logic to better adapt to the increase in [1] in 7.0?

          Matt Ingenthron Brett Lawson Michael Nitschinger Similarly, given the above, is there anything that SDKs can change in their retry logic to better adapt to the increase in [1] in 7.0? We also need to take care of returning the correct error message for failures due to NMVB instead of reporting CAS_MISMATCH in lcb.

          IMHO, it's definitely necessary that the affected CB subsystems take care of the change in behaviour in [1] in order to minimise the potentially perceivable impact on customer deployments / use cases when they upgrade to 7.0.

          Cc Arunkumar Senthilnathan.

          marco.greco Marco Greco added a comment -

          Jeelan Poola, as stated, query will change the way it handles retries, but that's slated for after Cheshire Cat.
          If you feel strongly that the work should be done for Cheshire Cat, this has to be discussed with Kamini / QE.

          daschl Michael Nitschinger added a comment - - edited

          (Updating this comment after getting more clarification from Marco.) Jeelan Poola from an SDK point of view there is not much we can do. The query engine returns error 12009, which we map to a CAS mismatch since it is one of the potential causes (https://github.com/couchbaselabs/sdk-rfcs/blob/master/rfc/0058-error-handling.md#9-casmismatch).

          In this case it looks like it isn't a CAS mismatch, but that is not something the SDK has any knowledge of, so it can't do anything about it.
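
          As a rough illustration of the mapping described (written in Go here; names are illustrative, not the actual SDK implementation):

          package main

          import "fmt"

          // classifyQueryError sketches the kind of mapping the SDK error-handling RFC
          // describes: query DML error 12009 is surfaced as a CasMismatch-style error,
          // even though the underlying cause in this ticket was an NMVB inside the
          // query service.
          func classifyQueryError(code int, msg string) string {
              switch code {
              case 12009:
                  return "CasMismatchException (one of several possible causes of 12009)"
              default:
                  return fmt.Sprintf("generic query error %d: %s", code, msg)
              }
          }

          func main() {
              fmt.Println(classifyQueryError(12009, "unable to complete action after 10 attempts"))
          }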

          jeelan.poola Jeelan Poola added a comment - - edited

          Marco Greco It seems like the inaccurate error reported is back in query's domain. Could you please confirm whether something can be done in query to give a better error code for NMVB-related failures, which would enable SDKs to retry with some backoff logic? Please let me know if we should hop on a call to discuss further.
          Cc Kamini Jagtiani

          marco.greco Marco Greco added a comment -

          Jeelan Poola the SDK is not supposed to retry on this kind of error; N1QL is.
          And it has - 2x the number of nodes - and it even says so in the error text!

          The issue is that if the SDK retries n times an operation that query itself retries n times, we may basically end up retrying in a cartesian manner, which probably wouldn't be a good user experience.
          It also would be a risky change to make in the SDK so late.
          Also, as part of this defect, I have augmented the error text to include the memcached status of the last error, so in the case of an NMVB the error text returned would already read something like "unable to complete action after 10 attempts: 7" (where "7" is the NMVB status).

          The way to handle this slowdown in the ns_server refresh is to have N1QL handle the retry differently, which we plan to do after Cheshire Cat.
          I don't have an issue coding this now, but we are very late in the development cycle, so the change would be kind of dangerous: you would have to get Kamini and QE to agree to that.

          jeelan.poola Jeelan Poola added a comment - - edited

          Thank you Marco Greco! It's much clearer to me now.
          I still think the main error should not state 'LCB_ERR_CAS_MISMATCH'. It's misleading.

          Kamini Jagtiani Mihir Kamdar I strongly think we should fix the retry logic in query for NMVB errors in 7.0. Otherwise, we risk customers running into this problem when (or even while) they upgrade to 7.0, which affects production readiness. The most important eventing use cases use a lot of n1ql-in-eventing and expect no event loss or failures during rebalance. We understand the same race exists in 6.6.x, but the probability of customers running into it has increased along with the average vbucket map update time due to chronicle. Other CB subsystems must take care of it as much as possible and try their best not to pass the side effects on to users.

          kamini.jagtiani Kamini Jagtiani added a comment - - edited

          Hi Jeelan Poola

          Marco Greco is trying out a fix for better error messages.

          The memcached status will now be reflected in the error message during retries, and CAS will be mentioned only for update and upsert errors.

          Hope this helps.

          A full-blown change for handling NMVB is planned post CC.
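
          As a rough illustration (not the actual query or go-couchbase change) of the two adjustments described above: the retry error carries the memcached status of the last failure, and CAS is mentioned only for UPDATE/UPSERT failures whose underlying status is KEY_EEXISTS. The status codes below are the standard memcached ones; everything else is a hypothetical sketch.

            package main

            import "fmt"

            // Standard memcached binary protocol status codes, used for illustration.
            const (
                statusKeyEexists   = 0x02 // KEY_EEXISTS, i.e. a genuine CAS mismatch
                statusNotMyVbucket = 0x07 // NOT_MY_VBUCKET
            )

            // retryError appends the memcached status of the last failure to the retry
            // message, so an NMVB no longer reads like a CAS problem.
            func retryError(attempts, lastStatus int) error {
                return fmt.Errorf("unable to complete action after %d attempts: %d", attempts, lastStatus)
            }

            // dmlErrorText mentions CAS only for UPDATE/UPSERT failures caused by
            // KEY_EEXISTS; everything else just reports the underlying cause.
            func dmlErrorText(op string, status int, cause error) string {
                if (op == "UPDATE" || op == "UPSERT") && status == statusKeyEexists {
                    return fmt.Sprintf("Failed to perform %s (CAS mismatch) - cause: %v", op, cause)
                }
                return fmt.Sprintf("Failed to perform %s - cause: %v", op, cause)
            }

            func main() {
                nmvb := retryError(12, statusNotMyVbucket)
                fmt.Println(dmlErrorText("UPSERT", statusNotMyVbucket, nmvb))

                casErr := retryError(1, statusKeyEexists)
                fmt.Println(dmlErrorText("UPDATE", statusKeyEexists, casErr))
            }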


          build-team Couchbase Build Team added a comment -

          Build couchbase-server-7.0.0-4720 contains go-couchbase commit 1e26bfa with commit message:
          MB-44328 clarify memcached status in retry error

          build-team Couchbase Build Team added a comment -

          Build couchbase-server-7.0.0-4720 contains query commit eec2cba with commit message:
          MB-44328 only mention CAS on update and upsert error with key_eexist
          jeelan.poola Jeelan Poola added a comment -

          Thank you Kamini Jagtiani! A better error which clearly states the underlying reason (failed due to NMVB after retries) for failure would be of great help.

          However, I still think we should fix the retry handling for NMVB failures in 7.0 itself. I would like to better understand the risk associated with the fix. As mentioned earlier, n1ql-in-eventing is a dominant use case, and event loss during rebalance is not expected. Thank you!

          marco.greco Marco Greco added a comment -

          The risk goes as follows: refreshing vBucket maps is central to the whole query operation. By altering the code that refreshes the maps so as to accommodate maps coming from memcached responses, we risk introducing regressions in the form of panics, hangs and other bucket-related failures, and not just during rebalance but during normal operations as well.

          jeelan.poola Jeelan Poola added a comment - - edited

          Thanks a lot Marco Greco!
          Vikas Chaudhary Could you please help trigger a test run with the Toy from Marco and share the results? Thank you!


          jeelan.poola Jeelan Poola added a comment -

          Since Vikas is out, requesting Vinayaka Kamath to trigger a run with the failing tests on dev-cluster and share the results. Thank you!
          jeelan.poola Jeelan Poola added a comment -

          Vinayaka could not schedule the run with the Toy today, as the tests seem to have been commented out for the time being by Vikas. Cc Ritam Sharma, Chanabasappa Ghali in case they can help.

          vikas.chaudhary Vikas Chaudhary added a comment - - edited

          Marco Greco Jeelan Poola Not seeing any errors with the functional tests, but seeing errors during the volume run:

          2021-03-22T05:54:26.577-07:00 [INFO] "Query failed: " {"message":"SDK error : LCB_ERR_CAS_MISMATCH (209) Query error : {\n\"requestID\": \"86a62735-f4e0-4713-bfe8-c19d608d7196\",\n\"clientContextID\": \"4@n1ql0_0.js(OnUpdate)\",\n\"signature\": null,\n\"results\": [\n],\n\"errors\": [{\"code\":12009,\"msg\":\"DML Error, possible causes include concurrent modificationFailed to perform UPSERT - cause: unable to complete action after 12 attemps: NOT_MY_VBUCKET\"}],\n\"status\": \"errors\",\n\"metrics\": {\"elapsedTime\": \"44.373269ms\",\"executionTime\": \"44.335353ms\",\"resultCount\": 0,\"resultSize\": 0,\"serviceLoad\": 40,\"errorCount\": 1}\n}\n","stack":"Error\n    at OnUpdate (n1ql0_0.js:4:21)"}
          2021-03-22T05:54:24.643-07:00 [INFO] "Query failed: " {"message":"SDK error : LCB_ERR_CAS_MISMATCH (209) Query error : {\n\"requestID\": \"fa27ed71-2b09-4647-86b1-1d47cbf150ff\",\n\"clientContextID\": \"4@n1ql0_0.js(OnUpdate)\",\n\"signature\": null,\n\"results\": [\n],\n\"errors\": [{\"code\":12009,\"msg\":\"DML Error, possible causes include concurrent modificationFailed to perform UPSERT - cause: unable to complete action after 12 attemps: NOT_MY_VBUCKET\"}],\n\"status\": \"errors\",\n\"metrics\": {\"elapsedTime\": \"33.240896ms\",\"executionTime\": \"33.206222ms\",\"resultCount\": 0,\"resultSize\": 0,\"serviceLoad\": 43,\"errorCount\": 1}\n}\n","stack":"Error\n    at OnUpdate (n1ql0_0.js:4:21)"}
          2021-03-22T05:54:24.550-07:00 [INFO] "Query failed: " {"message":"SDK error : LCB_ERR_CAS_MISMATCH (209) Query error : {\n\"requestID\": \"5b68b040-8493-416e-b23c-0af5f8411d24\",\n\"clientContextID\": \"4@n1ql0_0.js(OnUpdate)\",\n\"signature\": null,\n\"results\": [\n],\n\"errors\": [{\"code\":12009,\"msg\":\"DML Error, possible causes include concurrent modificationFailed to perform UPSERT - cause: unable to complete action after 12 attemps: NOT_MY_VBUCKET\"}],\n\"status\": \"errors\",\n\"metrics\": {\"elapsedTime\": \"52.814149ms\",\"executionTime\": \"52.786265ms\",\"resultCount\": 0,\"resultSize\": 0,\"serviceLoad\": 31,\"errorCount\": 1}\n}\n","stack":"Error\n    at OnUpdate (n1ql0_0.js:4:21)"}
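
          For reference, a short Go sketch (illustrative, not eventing or SDK code) showing that the actionable cause in these log entries is the query error embedded in the response body (code 12009 with the NOT_MY_VBUCKET cause), not the LCB_ERR_CAS_MISMATCH code the SDK wraps around it; the queryResponse type below is a hypothetical subset of the N1QL response.

            package main

            import (
                "encoding/json"
                "fmt"
            )

            // queryResponse models just the part of the N1QL response body that the
            // eventing log embeds after the LCB_ERR_CAS_MISMATCH wrapper.
            type queryResponse struct {
                Status string `json:"status"`
                Errors []struct {
                    Code int    `json:"code"`
                    Msg  string `json:"msg"`
                } `json:"errors"`
            }

            func main() {
                body := `{"status":"errors","errors":[{"code":12009,
                  "msg":"DML Error - Failed to perform UPSERT - cause: unable to complete action after 12 attempts: NOT_MY_VBUCKET"}]}`

                var resp queryResponse
                if err := json.Unmarshal([]byte(body), &resp); err != nil {
                    panic(err)
                }
                // The real cause (NOT_MY_VBUCKET after retries) lives in errors[n].msg,
                // not in the LCB code attached by the SDK.
                for _, e := range resp.Errors {
                    fmt.Printf("query error %d: %s\n", e.Code, e.Msg)
                }
            }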

          marco.greco Marco Greco added a comment -

          So we're saying it's better, but not a complete fix?
          That would tie in with the fact that the vbMap returned by memcached is a forward map, and not all vBuckets have migrated yet.
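
          A minimal sketch of one way to handle this, consistent with the later "MB-44328 backoff on NMVB" change: back off and refresh the map on NOT_MY_VBUCKET instead of retrying immediately against a forward map that is not yet active. doKVOp and refreshVBMap are hypothetical stand-ins, not the go-couchbase patch.

            package main

            import (
                "errors"
                "fmt"
                "time"
            )

            var errNotMyVbucket = errors.New("NOT_MY_VBUCKET")

            // doKVOp stands in for the KV mutation; it keeps returning NOT_MY_VBUCKET
            // until the vBucket has actually finished migrating.
            func doKVOp(attempt int) error {
                if attempt < 3 {
                    return errNotMyVbucket
                }
                return nil
            }

            // refreshVBMap stands in for re-reading the bucket's vBucket map (or
            // applying the forward map from the memcached response once it is active).
            func refreshVBMap() {}

            // retryWithBackoff waits between NMVB retries so a forward map returned
            // during rebalance has time to become the active map.
            func retryWithBackoff(maxAttempts int) error {
                backoff := 10 * time.Millisecond
                var err error
                for attempt := 1; attempt <= maxAttempts; attempt++ {
                    if err = doKVOp(attempt); err == nil {
                        return nil
                    }
                    if !errors.Is(err, errNotMyVbucket) {
                        return err
                    }
                    refreshVBMap()
                    time.Sleep(backoff)
                    backoff *= 2 // exponential backoff between NMVB retries
                }
                return fmt.Errorf("unable to complete action after %d attempts: %v", maxAttempts, err)
            }

            func main() {
                fmt.Println(retryWithBackoff(12))
            }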

          jeelan.poola Jeelan Poola added a comment -

          Vikas Chaudhary Could you please try the new Toy from Marco with functional & volume tests and share the results? Thank you!

          marco.greco Marco Greco added a comment -

          Jeelan Poola I have opened CBD-4008 re changing the SDK error code returned to eventing.
          Please follow up with the SDK team.
          Arunkumar Senthilnathan I understand that you have a script that can open SDK defects for each SDK from one CBD?
          Could I ask you to create all of the underlying SDK defects?


          arunkumar Arunkumar Senthilnathan added a comment -

          That would be Arun Vijayraghavan, Marco.

          arun.vijayraghavan Arun Vijayraghavan added a comment -

          Marco Greco I have created individual SDK tickets for the I/O-based SDKs, assuming the fix is in there.
          vikas.chaudhary Vikas Chaudhary added a comment - - edited

          Jeelan Poola Seeing the deployment hang on the new toy. Logs → http://supportal.couchbase.com/snapshot/f4855f6fb20a8c47fe39943b93afa079::0

          Functional tests passed on the toy, but the volume run is blocked because of MB-45053.

          jeelan.poola Jeelan Poola added a comment - - edited

          Kamini Jagtiani The MB-45053 blocker fixes are available in CC builds now. It would be great if someone from the query team could recreate the Toy for testing. Ankit tried to create the Toy, but the patches (http://review.couchbase.org/c/go-couchbase/+/149407 & http://review.couchbase.org/c/go-couchbase/+/148959) are running into merge conflicts. We are not sure whether both patches are needed for the latest Toy or only one of them. Thank you!

          marco.greco Marco Greco added a comment -

          Jeelan Poola I rebased the two reviews for Ankit Prabhu at the beginning of my day, and he confirmed that he would do the build himself.
          In the last 8 hours he has not been in touch to say that he had any other problems, so I have to assume everything is OK?

          jeelan.poola Jeelan Poola added a comment - - edited

          Thank you Marco Greco! Are you suggesting we only need http://review.couchbase.org/c/query/+/149406 & http://review.couchbase.org/c/go-couchbase/+/149407 (and don't need http://review.couchbase.org/c/go-couchbase/+/148959)? Ankit was under the impression that all 3 patches were needed and was thus running into merge conflicts.

          marco.greco Marco Greco added a comment -

          Jeelan Poola, no: 148959 is the patch for the memcached response retry that we had found not to work, so it is not needed.

          marco.greco Marco Greco added a comment -

          You can pick up toy 2534 when it completes.

          jeelan.poola Jeelan Poola added a comment - - edited

          Marco Greco Great, thanks!

          I triggered Toy job http://server.jenkins.couchbase.com/job/toy-unix-simple/2535/ now with http://review.couchbase.org/c/query/+/149406 & http://review.couchbase.org/c/go-couchbase/+/149407.

          Vikas Chaudhary Could you please trigger failing functional & volume tests once the build is available? Thank you!

          vikas.chaudhary Vikas Chaudhary added a comment -

          Jeelan Poola Marco Greco Ran the functional and volume tests; not seeing the CAS issue with toy build http://latestbuilds.service.couchbase.com/builds/latestbuilds/couchbase-server/toybuilds/12535/couchbase-server-enterprise-7.0.0-12535-centos7.x86_64.rpm
          jeelan.poola Jeelan Poola added a comment -

          Thank you Marco Greco Vikas Chaudhary for all your help on this one! Truly appreciate it!


          build-team Couchbase Build Team added a comment -

          Build couchbase-server-7.0.0-4818 contains go-couchbase commit 5bb57de with commit message:
          MB-44328 backoff on NMVB

          build-team Couchbase Build Team added a comment -

          Build couchbase-server-7.0.0-4818 contains query commit 7c28ead with commit message:
          MB-44328 refresh bucket on update callback change

          build-team Couchbase Build Team added a comment -

          Build couchbase-server-7.0.0-4819 contains query commit bb2bea2 with commit message:
          MB-44328 clean flags, limit system refresh to new manifest

          pierre.regazzoni Pierre Regazzoni added a comment -

          Vikas Chaudhary can you verify/close this issue if the regression job is now running fine? Thanks.

          vikas.chaudhary Vikas Chaudhary added a comment -

          Not seeing this on the 7.0.0-5017 run.

          build-team Couchbase Build Team added a comment -

          Build sync_gateway-3.0.0-223 contains go-couchbase commit 5bb57de with commit message:
          MB-44328 backoff on NMVB

          build-team Couchbase Build Team added a comment -

          Build sync_gateway-3.0.0-223 contains go-couchbase commit 1e26bfa with commit message:
          MB-44328 clarify memcached status in retry error

          build-team Couchbase Build Team added a comment -

          Build sync_gateway-3.0.0-223 contains go-couchbase commit 5537227 with commit message:
          MB-44328, only give response Status in case of memcached errors

          build-team Couchbase Build Team added a comment -

          Build sync_gateway-3.0.0-223 contains go-couchbase commit 5740cd3 with commit message:
          MB-44328 tighten op retry, report error cause, go fmt

          People

            Assignee: vikas.chaudhary Vikas Chaudhary
            Reporter: vikas.chaudhary Vikas Chaudhary
            Votes: 0
            Watchers: 19
