Details
-
Bug
-
Status: Closed
-
Critical
-
Resolution: Fixed
-
Cheshire-Cat
-
Untriaged
-
1
-
Unknown
Description
Build: 7.0.0-4454
Test: Rebalance multiple kv nodes
./testrunner -i /tmp/testexec.120861.ini -p get-cbcollect-info=True,GROUP=bucket_op,skip_log_scan=False,bucket_storage=couchstore -t eventing.eventing_rebalance_collection.EventingRebalanceCollection.test_eventing_rebalance_with_multiple_kv_nodes,doc-per-day=5,dataset=default,nodes_init=5,services_init=kv-kv-kv-eventing-index:n1ql,groups=simple,reset_services=True,GROUP=bucket_op,doc-per-day=20,java_sdk_client=True
|
|
Exception: duplicated data in destination bucket which is not expected. Current : 1 Expected : 0 dcp_backlog : 0 TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.122.117': {}}
|
Seeing Timer callback failed on 172.23.122.117
{
|
"dcp_feed_boundary": "everything",
|
"event_processing_stats": {
|
"adhoc_timer_response_received": 12,
|
"agg_messages_sent_to_worker": 109370,
|
"agg_queue_memory": 0,
|
"agg_queue_memory_cap": 53477376,
|
"agg_queue_size": 0,
|
"agg_queue_size_cap": 100000,
|
"agg_timer_feedback_queue_cap": 500,
|
"dcp_deletion": 97933,
|
"dcp_deletion_sent_to_worker": 41338,
|
"dcp_mutation": 221975,
|
"dcp_mutation_sent_to_worker": 58596,
|
"dcp_snapshot": 183630,
|
"dcp_stream_req_counter": 8318,
|
"dcp_stream_req_err_counter": 5304,
|
"dcp_streamend": 1460,
|
"dcp_streamreq": 3014,
|
"dcp_system_event": 10240,
|
"execution_stats": 1096,
|
"failure_stats": 1096,
|
"latency_stats": 1096,
|
"lcb_exception_stats": 1096,
|
"log_level": 1,
|
"num_processed_events": 99934,
|
"processed_events_size": 63702196,
|
"thr_count": 1,
|
"thr_map": 1,
|
"timer_events": 80640,
|
"timer_responses_received": 80640,
|
"v8_init": 1,
|
"v8_load": 1
|
},
|
"events_remaining": {
|
"dcp_backlog": 0
|
},
|
"execution_stats": {
|
"agg_queue_memory": 0,
|
"agg_queue_size": 0,
|
"curl": {
|
"delete": 0,
|
"get": 0,
|
"head": 0,
|
"post": 0,
|
"put": 0
|
},
|
"curl_success_count": 0,
|
"dcp_delete_msg_counter": 41338,
|
"dcp_delete_parse_failure": 0,
|
"dcp_mutation_msg_counter": 58596,
|
"dcp_mutation_parse_failure": 0,
|
"enqueued_dcp_delete_msg_counter": 41338,
|
"enqueued_dcp_mutation_msg_counter": 58596,
|
"enqueued_timer_msg_counter": 0,
|
"feedback_queue_size": 0,
|
"filtered_dcp_delete_counter": 0,
|
"filtered_dcp_mutation_counter": 19294,
|
"lcb_retry_failure": 0,
|
"messages_parsed": 109361,
|
"num_processed_events": 99934,
|
"on_delete_failure": 0,
|
"on_delete_success": 40320,
|
"on_update_failure": 0,
|
"on_update_success": 40320,
|
"processed_events_size": 63702196,
|
"timer_callback_failure": 1,
|
"timer_callback_success": 80639,
|
"timer_cancel_counter": 0,
|
"timer_create_counter": 80640,
|
"timer_create_failure": 0,
|
"timer_msg_counter": 80640,
|
"timer_responses_sent": 0,
|
"timestamp": {
|
"84204": "2021-02-13T06:57:27Z"
|
},
|
"uv_msg_parse_failure": 0,
|
"uv_try_write_failure_counter": 0
|
},
|
"failure_stats": {
|
"app_worker_setting_events_lost": 0,
|
"bkt_ops_cas_mismatch_count": 0,
|
"bucket_op_exception_count": 1,
|
"checkpoint_failure_count": 0,
|
"curl_failure_count": 0,
|
"curl_max_resp_size_exceeded": 0,
|
"curl_non_200_response": 0,
|
"curl_timeout_count": 0,
|
"dcp_events_lost": 0,
|
"debugger_events_lost": 0,
|
"delete_events_lost": 0,
|
"mutation_events_lost": 0,
|
"n1ql_op_exception_count": 0,
|
"timeout_count": 0,
|
"timer_callback_missing_counter": 0,
|
"timer_context_size_exceeded_counter": 0,
|
"timer_events_lost": 0,
|
"timestamp": {
|
"84204": "2021-02-13T06:57:27Z"
|
},
|
"v8worker_events_lost": 0
|
},
|
"function_id": 1452344633,
|
"function_name": "timers",
|
"gocb_creds_request_counter": 133,
|
"internal_vb_distribution_stats": {
|
"worker_timers_0": "[0-1023]"
|
},
|
"latency_percentile_stats": {
|
"100": 511900,
|
"50": 1300,
|
"80": 1800,
|
"90": 2400,
|
"95": 3500,
|
"99": 9800
|
},
|
"lcb_creds_request_counter": 80787,
|
"lcb_exception_stats": {
|
"210": 1
|
},
|
"metastore_stats": {
|
"metastore_delete_err": 0,
|
"metastore_deletes": 0,
|
"metastore_not_found": 0,
|
"metastore_scan": 0,
|
"metastore_scan_due": 0,
|
"metastore_scan_err": 0,
|
"metastore_set": 0,
|
"metastore_set_err": 0
|
}
|
Attachments
- test_logs.txt
- 818 kB
- test.log
- 269 kB
- test-1.log
- 358 kB
Issue Links
Activity
Ankit Prabhu seeing the same issue again on the latest run when we do a swap rebalance of kv nodes
2021-02-14 21:30:58 | INFO | MainProcess | Cluster_Thread | [rest_client.rebalance] rebalance params : {'knownNodes': 'ns_1@172.23.104.237,ns_1@172.23.104.243,ns_1@172.23.104.246,ns_1@172.23.104.247,ns_1@172.23.104.248,ns_1@172.23.104.249,ns_1@172.23.106.6', 'ejectedNodes': 'ns_1@172.23.104.248,ns_1@172.23.106.6', 'user': 'Administrator', 'password': 'password'}
|
The time when we got stats and all buckets are there till now
2021-02-14 21:43:51 | INFO | MainProcess | test_thread | [eventing_base.verify_eventing_results] Stats for Node 172.23.104.237 is
|
[
|
{
|
"dcp_feed_boundary": "everything",
|
"event_processing_stats": {
|
"adhoc_timer_response_received": 11,
|
"agg_messages_sent_to_worker": 50502,
|
"agg_queue_memory": 0,
|
"agg_queue_memory_cap": 106954752,
|
"agg_queue_size": 0,
|
"agg_queue_size_cap": 300000,
|
"agg_timer_feedback_queue_cap": 1500,
|
"dcp_deletion": 10080,
|
"dcp_deletion_sent_to_worker": 10080,
|
"dcp_mutation": 20160,
|
"dcp_mutation_sent_to_worker": 20160,
|
"dcp_snapshot": 30234,
|
"dcp_stream_req_counter": 56170,
|
"dcp_stream_req_err_counter": 52266,
|
"dcp_streamend": 2122,
|
"dcp_streamreq": 3904,
|
"execution_stats": 2994,
|
"failure_stats": 2994,
|
"latency_stats": 2994,
|
"lcb_exception_stats": 2994,
|
"log_level": 3,
|
"num_processed_events": 30240,
|
"processed_events_size": 17822664,
|
"thr_count": 3,
|
"thr_map": 3,
|
"timer_events": 30240,
|
"timer_responses_received": 30240,
|
"v8_init": 3,
|
"v8_load": 3
|
},
|
"events_remaining": {
|
"dcp_backlog": 0
|
},
|
"execution_stats": {
|
"agg_queue_memory": 0,
|
"agg_queue_size": 0,
|
"curl": {
|
"delete": 0,
|
"get": 0,
|
"head": 0,
|
"post": 0,
|
"put": 0
|
},
|
"curl_success_count": 0,
|
"dcp_delete_msg_counter": 10080,
|
"dcp_delete_parse_failure": 0,
|
"dcp_mutation_msg_counter": 20160,
|
"dcp_mutation_parse_failure": 0,
|
"enqueued_dcp_delete_msg_counter": 10080,
|
"enqueued_dcp_mutation_msg_counter": 20160,
|
"enqueued_timer_msg_counter": 0,
|
"feedback_queue_size": 0,
|
"filtered_dcp_delete_counter": 0,
|
"filtered_dcp_mutation_counter": 0,
|
"lcb_retry_failure": 0,
|
"messages_parsed": 50490,
|
"num_processed_events": 30240,
|
"on_delete_failure": 0,
|
"on_delete_success": 10080,
|
"on_update_failure": 0,
|
"on_update_success": 20160,
|
"processed_events_size": 17822664,
|
"timer_callback_failure": 1,
|
"timer_callback_success": 30239,
|
"timer_cancel_counter": 0,
|
"timer_create_counter": 30240,
|
"timer_create_failure": 0,
|
"timer_msg_counter": 30240,
|
"timer_responses_sent": 0,
|
"timestamp": {
|
"92566": "2021-02-15T05:43:50Z",
|
"92575": "2021-02-15T05:43:50Z",
|
"92584": "2021-02-15T05:43:50Z"
|
},
|
"uv_msg_parse_failure": 0,
|
"uv_try_write_failure_counter": 0
|
},
|
"failure_stats": {
|
"app_worker_setting_events_lost": 0,
|
"bkt_ops_cas_mismatch_count": 0,
|
"bucket_op_exception_count": 1,
|
"checkpoint_failure_count": 0,
|
"curl_failure_count": 0,
|
"curl_max_resp_size_exceeded": 0,
|
"curl_non_200_response": 0,
|
"curl_timeout_count": 0,
|
"dcp_events_lost": 0,
|
"debugger_events_lost": 0,
|
"delete_events_lost": 0,
|
"mutation_events_lost": 0,
|
"n1ql_op_exception_count": 0,
|
"timeout_count": 0,
|
"timer_callback_missing_counter": 0,
|
"timer_context_size_exceeded_counter": 0,
|
"timer_events_lost": 0,
|
"timestamp": {
|
"92566": "2021-02-15T05:43:50Z",
|
"92575": "2021-02-15T05:43:50Z",
|
"92584": "2021-02-15T05:43:50Z"
|
},
|
"v8worker_events_lost": 0
|
},
|
"function_id": 315539844,
|
"function_name": "Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes",
|
"gocb_creds_request_counter": 77,
|
"internal_vb_distribution_stats": {
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_0": "[0-341]",
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_1": "[342-682]",
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_2": "[683-1023]"
|
},
|
"latency_percentile_stats": {
|
"100": 23100,
|
"50": 1400,
|
"80": 1900,
|
"90": 2600,
|
"95": 4200,
|
"99": 9500
|
},
|
"lcb_creds_request_counter": 102,
|
"lcb_exception_stats": {
|
"210": 1
|
},
|
"metastore_stats": {
|
"metastore_delete_err": 0,
|
"metastore_deletes": 0,
|
"metastore_not_found": 0,
|
"metastore_scan": 0,
|
"metastore_scan_due": 0,
|
"metastore_scan_err": 0,
|
"metastore_set": 0,
|
"metastore_set_err": 0
|
},
|
"planner_stats": [
|
{
|
"host_name": "172.23.104.237:8096",
|
"start_vb": 0,
|
"vb_count": 1024
|
}
|
],
|
"vb_distribution_stats_from_metadata": {
|
"172.23.104.237:8096": {
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_0": "[0-341]",
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_1": "[342-682]",
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_2": "[683-1023]"
|
}
|
},
|
"worker_pids": {
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_0": 92566,
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_1": 92575,
|
"worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_2": 92584
|
}
|
}
|
]
|
Bucket deletion after the test failure
2021-02-14 21:44:28 | INFO | MainProcess | test_thread | [basetestcase.tearDown] ============== basetestcase cleanup was started for test #1 test_eventing_rebalance_with_multiple_kv_nodes ==============
|
2021-02-14 21:44:28 | INFO | MainProcess | test_thread | [bucket_helper.delete_all_buckets_or_assert] deleting existing buckets ['dst_bucket', 'metadata', 'src_bucket'] on 172.23.104.246
|
2021-02-14 21:44:29 | INFO | MainProcess | test_thread | [bucket_helper.wait_for_bucket_deletion] waiting for bucket deletion to complete....
|
Test Job : http://qa.sc.couchbase.com/job/test_suite_executor/314953/consoleText
Ankit Prabhu Could this also be due to the lcb_destroy issue? I can see we may get TRANSIENT_ERROR during kv rebalance. We can ask the libcouchbase team once we rule out all possibilities on our end.
There is one bucket op failure due to LCB_ERR_BUCKET_NOT_FOUND.
During this time, nodes 172.23.106.6 and 172.23.104.248 are failed over and rebalanced out.
2021-02-14T21:30:58.961-08:00, ns_orchestrator:0:info:message(ns_1@172.23.104.246) - Starting rebalance, KeepNodes = ['ns_1@172.23.104.237','ns_1@172.23.104.243',
|
'ns_1@172.23.104.246','ns_1@172.23.104.247',
|
'ns_1@172.23.104.249'], EjectNodes = ['ns_1@172.23.104.248',
|
'ns_1@172.23.106.6'], Failed over and being ejected nodes = []; no delta recovery nodes; Operation Id = 89ee9c1468b06511817f4480dd885bec
|
libcouchbase received Socket shutdown message on these 2 nodes.
2021-02-14T21:31:33.537-08:00 [Info] eventing-consumer [worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_315539844.sock:92566] [lcb,server L:1108 I:1092370849] <172.23.106.6:11210> (CTX=0x7fb958041670,memcached,SRV=0x7fb96411d930,IX=2) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
2021-02-14T21:31:33.537-08:00 [Info] eventing-consumer [worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_315539844.sock:92566] [lcb,connection L:485 I:1092370849] <172.23.106.6:11210> (SOCK=d1078c8ef844969b) Starting. Timeout=2500000us
|
2021-02-14T21:31:33.537-08:00 [Info] eventing-consumer [worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_315539844.sock:92566] [lcb,server L:1108 I:1092370849] <172.23.104.248:11210> (CTX=0x7fb9580a92f0,memcached,SRV=0x7fb96411d680,IX=1) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
Eventing will retry this operation since LCB_ERR_SOCKET_SHUTDOWN is a temporary error. But looking at the SRV value (0x7fb96411d930) it looks like it's trying again on node 172.23.106.6 even after it received LCB_ERR_SOCKET_SHUTDOWN.
2021-02-14T21:31:33.550-08:00 [Info] eventing-consumer [worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_315539844.sock:92566] [lcb,server L:961 I:1092370849] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7fb96411d930,IX=2) Connection attempt failed. Received LCB_ERR_BUCKET_NOT_FOUND (210) from libcouchbase, received 0 from operating system
|
2021-02-14T21:31:33.550-08:00 [Info] eventing-consumer [worker_Function_551935272_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_315539844.sock:92566] [lcb,server L:818 I:1092370849] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7fb96411d930,IX=2) Failing command (pkt=0x7fb958019f60, opaque=6839, opcode=0x1) with error LCB_ERR_BUCKET_NOT_FOUND (210)
|
Sergey Avseyev, Could you please take a look?
Vikas Chaudhary Could you please rerun this test with Build couchbase-server-7.0.0-4475 or later?
Seeing this with the following test on 7.0.0-4502
./testrunner -i /tmp/testexec.13228.ini -p get-cbcollect-info=True,GROUP=timer_op_curl,get-cbcollect-info=False -t eventing.eventing_rebalance.EventingRebalance.test_rebalance_out_with_different_topologies,doc-per-day=20,dataset=default,nodes_init=5,services_init=kv-kv:eventing-kv:eventing-kv:eventing-index:n1ql,groups=simple,server_out=3,reset_services=True,GROUP=timer_op_curl,handler_code=timer_op_curl_jenkins,curl=True,host=http://qa.sc.couchbase.com/,get-cbcollect-info=False
|
|
missing data in destination bucket. Current : 40318 Expected : 40320 dcp_backlog : 0 TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.122.178': {'210': 1}, '172.23.122.183': {}}
|
stats
{
|
"dcp_feed_boundary": "everything",
|
"event_processing_stats": {
|
"adhoc_timer_response_received": 9,
|
"agg_messages_sent_to_worker": 26044,
|
"agg_queue_memory": 0,
|
"agg_queue_memory_cap": 106954752,
|
"agg_queue_size": 0,
|
"agg_queue_size_cap": 300000,
|
"agg_timer_feedback_queue_cap": 1500,
|
"dcp_mutation": 13366,
|
"dcp_mutation_sent_to_worker": 13366,
|
"dcp_snapshot": 13350,
|
"dcp_stream_req_counter": 853,
|
"dcp_streamend": 341,
|
"dcp_streamreq": 853,
|
"execution_stats": 2292,
|
"failure_stats": 2292,
|
"latency_stats": 2292,
|
"lcb_exception_stats": 2292,
|
"log_level": 3,
|
"num_processed_events": 13366,
|
"processed_events_size": 9649096,
|
"thr_count": 3,
|
"thr_map": 3,
|
"timer_events": 19915,
|
"timer_responses_received": 13366,
|
"v8_init": 3,
|
"v8_load": 3
|
},
|
"events_remaining": {
|
"dcp_backlog": 0
|
},
|
"execution_stats": {
|
"agg_queue_memory": 0,
|
"agg_queue_size": 0,
|
"curl": {
|
"delete": 0,
|
"get": 19915,
|
"head": 0,
|
"post": 0,
|
"put": 0
|
},
|
"curl_success_count": 19915,
|
"dcp_delete_msg_counter": 0,
|
"dcp_delete_parse_failure": 0,
|
"dcp_mutation_msg_counter": 13366,
|
"dcp_mutation_parse_failure": 0,
|
"enqueued_dcp_delete_msg_counter": 0,
|
"enqueued_dcp_mutation_msg_counter": 13366,
|
"enqueued_timer_msg_counter": 0,
|
"feedback_queue_size": 0,
|
"filtered_dcp_delete_counter": 0,
|
"filtered_dcp_mutation_counter": 0,
|
"lcb_retry_failure": 0,
|
"messages_parsed": 26032,
|
"num_processed_events": 13366,
|
"on_delete_failure": 0,
|
"on_delete_success": 0,
|
"on_update_failure": 0,
|
"on_update_success": 13366,
|
"processed_events_size": 9649096,
|
"timer_callback_failure": 0,
|
"timer_callback_success": 19915,
|
"timer_cancel_counter": 0,
|
"timer_create_counter": 13366,
|
"timer_create_failure": 0,
|
"timer_msg_counter": 19915,
|
"timer_responses_sent": 0,
|
"timestamp": {
|
"35960": "2021-02-21T20:38:03Z",
|
"35969": "2021-02-21T20:38:03Z",
|
"35979": "2021-02-21T20:38:03Z"
|
},
|
"uv_msg_parse_failure": 0,
|
"uv_try_write_failure_counter": 0
|
},
|
"failure_stats": {
|
"app_worker_setting_events_lost": 0,
|
"bkt_ops_cas_mismatch_count": 0,
|
"bucket_op_exception_count": 1,
|
"checkpoint_failure_count": 0,
|
"curl_failure_count": 0,
|
"curl_max_resp_size_exceeded": 0,
|
"curl_non_200_response": 0,
|
"curl_timeout_count": 0,
|
"dcp_events_lost": 0,
|
"debugger_events_lost": 0,
|
"delete_events_lost": 0,
|
"mutation_events_lost": 0,
|
"n1ql_op_exception_count": 0,
|
"timeout_count": 0,
|
"timer_callback_missing_counter": 0,
|
"timer_context_size_exceeded_counter": 0,
|
"timer_events_lost": 0,
|
"timestamp": {
|
"35960": "2021-02-21T20:38:03Z",
|
"35969": "2021-02-21T20:38:03Z",
|
"35979": "2021-02-21T20:38:03Z"
|
},
|
"v8worker_events_lost": 0
|
},
|
"function_id": 2520185706,
|
"function_name": "Function_760427923_test_rebalance_out_with_different_topologies",
|
"gocb_creds_request_counter": 46,
|
"internal_vb_distribution_stats": {
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_0": "[0-170]",
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_1": "[171-341]",
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_2": "[342-511]"
|
},
|
"latency_percentile_stats": {
|
"100": 208800,
|
"50": 1200,
|
"80": 1500,
|
"90": 2000,
|
"95": 2900,
|
"99": 7600
|
},
|
"lcb_creds_request_counter": 60,
|
"lcb_exception_stats": {
|
"210": 1
|
},
|
"metastore_stats": {
|
"metastore_delete_err": 0,
|
"metastore_deletes": 0,
|
"metastore_not_found": 0,
|
"metastore_scan": 0,
|
"metastore_scan_due": 0,
|
"metastore_scan_err": 0,
|
"metastore_set": 0,
|
"metastore_set_err": 0
|
},
|
"planner_stats": [
|
{
|
"host_name": "172.23.122.178:8096",
|
"start_vb": 0,
|
"vb_count": 512
|
},
|
{
|
"host_name": "172.23.122.183:8096",
|
"start_vb": 512,
|
"vb_count": 512
|
}
|
],
|
"vb_distribution_stats_from_metadata": {
|
"172.23.122.178:8096": {
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_0": "[0-170]",
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_1": "[171-341]",
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_2": "[342-511]"
|
},
|
"172.23.122.183:8096": {
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_0": "[512-682]",
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_1": "[683-853]",
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_2": "[854-1023]"
|
}
|
},
|
"worker_pids": {
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_0": 35969,
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_1": 35960,
|
"worker_Function_760427923_test_rebalance_out_with_different_topologies_2": 35979
|
}
|
}
|
I think to look into this, we'll need your logs from the test run Vikas Chaudhary. Can you please provide a pointer and reassign to Sergey Avseyev?
Also, did you determine from the log that this same failure is related to the LCB_ERR_BUCKET_NOT_FOUND when you expect it to be there? As Ankit Prabhu points out above, just this being in the log is not necessarily an indication of the cause of the mismatched items, as eventing should retry.
Although looking at the messages above more carefully, is it the case Ankit Prabhu that you're wondering why the retry attempt goes to the connection which was shut down by the cluster side? Maybe that's something Sergey Avseyev can have a look into, but we should be doing that with the latest logs. Thanks!
The latest reproduction on 7.0.0-4514, seen during swap rebalance of KV nodes
./testrunner -i /tmp/testexec.30080.ini -p get-cbcollect-info=True,GROUP=bucket_op_with_timers,skip_log_scan=False,bucket_storage=couchstore -t eventing.eventing_rebalance.EventingRebalance.test_eventing_rebalance_with_multiple_kv_nodes,doc-per-day=5,dataset=default,nodes_init=5,services_init=kv-kv-kv-eventing-index:n1ql,groups=simple,reset_services=True,,handler_code=bucket_op_with_timers,GROUP=bucket_op_with_timers
|
|
Exception: missing data in destination bucket. Current : 10079 Expected : 10080 dcp_backlog : 0 TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.122.133': {'210': 1}}
|
logs attached
Deleting buckets at the end of the test 2021-02-22 17:47:45
[2021-02-22 17:47:45,316] - [basetestcase:572] INFO - ============== basetestcase cleanup was started for test #8 test_eventing_rebalance_with_multiple_kv_nodes ==============
|
[2021-02-22 17:47:45,347] - [bucket_helper:167] INFO - Could not find any buckets for node 172.23.123.42, nothing to delete
|
[2021-02-22 17:47:45,354] - [bucket_helper:167] INFO - Could not find any buckets for node 172.23.123.44, nothing to delete
|
[2021-02-22 17:47:45,627] - [bucket_helper:142] INFO - deleting existing buckets ['dst_bucket', 'metadata', 'src_bucket'] on 172.23.122.132
|
[2021-02-22 17:47:46,673] - [bucket_helper:233] INFO - waiting for bucket deletion to complete....
|
[2021-02-22 17:47:46,703] - [rest_client:137] INFO - node 172.23.122.132 existing buckets : ['metadata', 'src_bucket']
|
[2021-02-22 17:47:46,705] - [bucket_helper:165] INFO - deleted bucket : dst_bucket from 172.23.122.132
|
[2021-02-22 17:47:48,033] - [bucket_helper:233] INFO - waiting for bucket deletion to complete....
|
[2021-02-22 17:47:48,052] - [rest_client:137] INFO - node 172.23.122.132 existing buckets : ['src_bucket']
|
[2021-02-22 17:47:48,054] - [bucket_helper:165] INFO - deleted bucket : metadata from 172.23.122.132
|
[2021-02-22 17:47:49,129] - [bucket_helper:233] INFO - waiting for bucket deletion to complete....
|
[2021-02-22 17:47:49,136] - [rest_client:137] INFO - node 172.23.122.132 existing buckets : []
|
[2021-02-22 17:47:49,136] - [bucket_helper:165] INFO - deleted bucket : src_bucket from 172.23.122.132
|
[2021-02-22 17:47:49,147] - [bucket_helper:167] INFO - Could not find any buckets for node 172.23.122.133, nothing to delete
|
[2021-02-22 17:47:49,170] - [bucket_helper:167] INFO - Could not find any buckets for node 172.23.122.134, nothing to delete
|
[2021-02-22 17:47:49,180] - [bucket_helper:167] INFO - Could not find any buckets for node 172.23.122.131, nothing to delete
|
[2021-02-22 17:47:49,189] - [bucket_helper:167] INFO - Could not find any buckets for node 172.23.122.180, nothing to delete
|
[2021-02-22 17:47:49,196] - [bucket_helper:167] INFO - Could not find any buckets for node 172.23.122.130, nothing to delete
|
logs collected with LCB error, 2021-02-22 17:46:23
[2021-02-22 17:46:23,748] - [eventing_base:649] INFO - Stats for Node 172.23.122.133 is
|
[
|
{
|
"dcp_feed_boundary": "everything",
|
"event_processing_stats": {
|
"adhoc_timer_response_received": 24,
|
"agg_messages_sent_to_worker": 49960,
|
"agg_queue_memory": 0,
|
"agg_queue_memory_cap": 106954752,
|
"agg_queue_size": 0,
|
"agg_queue_size_cap": 300000,
|
"agg_timer_feedback_queue_cap": 1500,
|
"dcp_deletion": 10080,
|
"dcp_deletion_sent_to_worker": 10080,
|
"dcp_mutation": 20160,
|
"dcp_mutation_sent_to_worker": 20160,
|
"dcp_snapshot": 30233,
|
"dcp_stream_req_counter": 66340,
|
"dcp_stream_req_err_counter": 62940,
|
"dcp_streamend": 1794,
|
"dcp_streamreq": 3400,
|
"execution_stats": 3033,
|
"failure_stats": 3033,
|
"latency_stats": 3033,
|
"lcb_exception_stats": 3033,
|
"log_level": 3,
|
"num_processed_events": 30240,
|
"processed_events_size": 17822664,
|
"thr_count": 3,
|
"thr_map": 3,
|
"timer_events": 30240,
|
"timer_responses_received": 30240,
|
"v8_init": 3,
|
"v8_load": 3
|
},
|
"events_remaining": {
|
"dcp_backlog": 0
|
},
|
"execution_stats": {
|
"agg_queue_memory": 0,
|
"agg_queue_size": 0,
|
"curl": {
|
"delete": 0,
|
"get": 0,
|
"head": 0,
|
"post": 0,
|
"put": 0
|
},
|
"curl_success_count": 0,
|
"dcp_delete_msg_counter": 10080,
|
"dcp_delete_parse_failure": 0,
|
"dcp_mutation_msg_counter": 20160,
|
"dcp_mutation_parse_failure": 0,
|
"enqueued_dcp_delete_msg_counter": 10080,
|
"enqueued_dcp_mutation_msg_counter": 20160,
|
"enqueued_timer_msg_counter": 0,
|
"feedback_queue_size": 0,
|
"filtered_dcp_delete_counter": 0,
|
"filtered_dcp_mutation_counter": 0,
|
"lcb_retry_failure": 0,
|
"messages_parsed": 49948,
|
"no_op_counter": 0,
|
"num_processed_events": 30240,
|
"on_delete_failure": 0,
|
"on_delete_success": 10080,
|
"on_update_failure": 0,
|
"on_update_success": 20160,
|
"processed_events_size": 17822664,
|
"timer_callback_failure": 1,
|
"timer_callback_success": 30239,
|
"timer_cancel_counter": 0,
|
"timer_create_counter": 30240,
|
"timer_create_failure": 0,
|
"timer_msg_counter": 30240,
|
"timer_responses_sent": 0,
|
"timestamp": {
|
"43365": "2021-02-23T01:46:23Z",
|
"43373": "2021-02-23T01:46:23Z",
|
"43382": "2021-02-23T01:46:23Z"
|
},
|
"uv_msg_parse_failure": 0,
|
"uv_try_write_failure_counter": 0
|
},
|
"failure_stats": {
|
"app_worker_setting_events_lost": 0,
|
"bkt_ops_cas_mismatch_count": 0,
|
"bucket_op_exception_count": 1,
|
"checkpoint_failure_count": 0,
|
"curl_failure_count": 0,
|
"curl_max_resp_size_exceeded": 0,
|
"curl_non_200_response": 0,
|
"curl_timeout_count": 0,
|
"dcp_events_lost": 0,
|
"debugger_events_lost": 0,
|
"delete_events_lost": 0,
|
"mutation_events_lost": 0,
|
"n1ql_op_exception_count": 0,
|
"timeout_count": 0,
|
"timer_callback_missing_counter": 0,
|
"timer_context_size_exceeded_counter": 0,
|
"timer_events_lost": 0,
|
"timestamp": {
|
"43365": "2021-02-23T01:46:23Z",
|
"43373": "2021-02-23T01:46:23Z",
|
"43382": "2021-02-23T01:46:23Z"
|
},
|
"v8worker_events_lost": 0
|
},
|
"function_id": 2575840121,
|
"function_name": "Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes",
|
"gocb_creds_request_counter": 82,
|
"internal_vb_distribution_stats": {
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0": "[0-341]",
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1": "[342-682]",
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_2": "[683-1023]"
|
},
|
"latency_percentile_stats": {
|
"100": 19900,
|
"50": 1100,
|
"80": 1400,
|
"90": 1900,
|
"95": 2900,
|
"99": 6800
|
},
|
"lcb_creds_request_counter": 106,
|
"lcb_exception_stats": {
|
"210": 1
|
},
|
"metastore_stats": {
|
"metastore_delete_err": 0,
|
"metastore_deletes": 0,
|
"metastore_not_found": 0,
|
"metastore_scan": 0,
|
"metastore_scan_due": 0,
|
"metastore_scan_err": 0,
|
"metastore_set": 0,
|
"metastore_set_err": 0
|
},
|
"planner_stats": [
|
{
|
"host_name": "172.23.122.133:8096",
|
"start_vb": 0,
|
"vb_count": 1024
|
}
|
],
|
"vb_distribution_stats_from_metadata": {
|
"172.23.122.133:8096": {
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0": "[0-341]",
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1": "[342-682]",
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_2": "[683-1023]"
|
}
|
},
|
"worker_pids": {
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0": 43365,
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1": 43373,
|
"worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_2": 43382
|
}
|
}
|
]
|
Lynn Straus- This issue is not a must pass for beta refresh, removing the label.
Vikas Chaudhary where can I find the full libcouchbase logs for the latest swap rebalance run? The uploaded log only seems to be from the runner and not lcb itself.
Ah, I found them after talking to Matt Ingenthron - will take a look.
Ok, so I've analyzed the eventing log on node 122.133 which experienced the LCB errors.
So from the rough event timeline, we can see:
- During rebalance, lcb reports lots of NOT_MY_VBUCKET. this is expected.
- When the NMVBs end, we start to see LCB_ERR_SOCKET_SHUTDOWN
Interestingly, they occur in blocks a couple of minutes apart, with the first block starting after the NMVBs:
203:2021-02-22T17:31:47.406-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_2575840121.sock:43365] [lcb,server L:1108 I:3398046468] <172.23.122.131:11210> (CTX=0x7ff0f00c3dc0,memcached,SRV=0x7ff0f0037290,IX=0) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
205:2021-02-22T17:31:47.452-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1:/tmp/127.0.0.1:8091_1_2575840121.sock:43373] [lcb,server L:1108 I:980799882] <172.23.122.131:11210> (CTX=0x7fa1c8052510,memcached,SRV=0x7fa1c8012070,IX=0) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
206:2021-02-22T17:31:47.500-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_2:/tmp/127.0.0.1:8091_2_2575840121.sock:43382] [lcb,server L:1108 I:1935748459] <172.23.122.131:11210> (CTX=0x7fae380031c0,memcached,SRV=0x7fae38002e70,IX=0) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
|
217:2021-02-22T17:34:21.412-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_2575840121.sock:43365] [lcb,server L:1108 I:3398046468] <172.23.123.44:11210> (CTX=0x7ff0f001acc0,memcached,SRV=0x7ff0fc1196a0,IX=2) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
218:2021-02-22T17:34:21.450-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1:/tmp/127.0.0.1:8091_1_2575840121.sock:43373] [lcb,server L:1108 I:980799882] <172.23.123.44:11210> (CTX=0x7fa1c804c4c0,memcached,SRV=0x7fa1dc11ed30,IX=2) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
219:2021-02-22T17:34:21.450-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1:/tmp/127.0.0.1:8091_1_2575840121.sock:43373] [lcb,server L:1108 I:980799882] <172.23.123.42:11210> (CTX=0x7fa1c80468c0,memcached,SRV=0x7fa1dc082150,IX=1) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
222:2021-02-22T17:34:21.501-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_2:/tmp/127.0.0.1:8091_2_2575840121.sock:43382] [lcb,server L:1108 I:1935748459] <172.23.123.42:11210> (CTX=0x7fae380101b0,memcached,SRV=0x7fae40081bc0,IX=1) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
|
223:2021-02-22T17:47:53.425-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_2575840121.sock:43365] [lcb,server L:1108 I:1817904706] <172.23.122.180:11210> (CTX=0x7ff0f00da510,memcached,SRV=0x7ff0f00ca2b0,IX=2) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
224:2021-02-22T17:47:53.425-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_2575840121.sock:43365] [lcb,server L:1108 I:1817904706] <172.23.122.132:11210> (CTX=0x7ff0f000ac90,memcached,SRV=0x7ff0fc073320,IX=1) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
230:2021-02-22T17:47:53.437-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_2575840121.sock:43365] [lcb,server L:1108 I:1817904706] <172.23.122.131:11210> (CTX=0x7ff0f0032150,memcached,SRV=0x7ff0f006a820,IX=0) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
240:2021-02-22T17:47:53.467-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1:/tmp/127.0.0.1:8091_1_2575840121.sock:43373] [lcb,server L:1108 I:3078861705] <172.23.122.180:11210> (CTX=0x7fa1c80a3fb0,memcached,SRV=0x7fa1c8052570,IX=2) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
241:2021-02-22T17:47:53.467-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1:/tmp/127.0.0.1:8091_1_2575840121.sock:43373] [lcb,server L:1108 I:3078861705] <172.23.122.131:11210> (CTX=0x7fa1c801f150,memcached,SRV=0x7fa1c801ca60,IX=0) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
242:2021-02-22T17:47:53.480-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_1:/tmp/127.0.0.1:8091_1_2575840121.sock:43373] [lcb,server L:1108 I:3078861705] <172.23.122.132:11210> (CTX=0x7fa1dc0e0c20,memcached,SRV=0x7fa1dc0835e0,IX=1) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
267:2021-02-22T17:47:53.511-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_2:/tmp/127.0.0.1:8091_2_2575840121.sock:43382] [lcb,server L:1108 I:1172510543] <172.23.122.132:11210> (CTX=0x7fae400e2b00,memcached,SRV=0x7fae400835f0,IX=1) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
268:2021-02-22T17:47:53.511-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_2:/tmp/127.0.0.1:8091_2_2575840121.sock:43382] [lcb,server L:1108 I:1172510543] <172.23.122.180:11210> (CTX=0x7fae38043630,memcached,SRV=0x7fae3809ac70,IX=2) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
279:2021-02-22T17:47:53.523-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_2:/tmp/127.0.0.1:8091_2_2575840121.sock:43382] [lcb,server L:1108 I:1172510543] <172.23.122.131:11210> (CTX=0x7fae38042d10,memcached,SRV=0x7fae3800f730,IX=0) Got socket error LCB_ERR_SOCKET_SHUTDOWN (1025)
|
From the first socket close the bucket not found errors start (at 2021-02-22T17:31:47.414-08:00) like:
204:2021-02-22T17:31:47.414-08:00 [Info] eventing-consumer [worker_Function_431426416_test_eventing_rebalance_with_multiple_kv_nodes_0:/tmp/127.0.0.1:8091_0_2575840121.sock:43365] [lcb,server L:961 I:3398046468] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7ff0f0037290,IX=0) Connection attempt failed. Received LCB_ERR_BUCKET_NOT_FOUND (210) from libcouchbase, received 0 from operating system
|
The LCB_ERR_BUCKET_NOT_FOUND represents a failure during the "select bucket" kv operation. So the servers in question were still accepting incoming connections and allowing LCB to authenticate (since select bucket is performed after the auth), but the bucket was not present on that node anymore.
The problem is that it is very hard to tell how LCB got into that state, because only a minimal subset of LCB information is actually provided in the log. So to further triage the "how", my ask is:
- Can the swap rebalance scenario from above be rerun with debug lcb logging enabled and visible in the eventing log? This would give us more insight into the state transitions and how it ended up there. Looking at https://github.com/couchbase/eventing/commit/0f44e77f0c52fc6ee9401989f293e37fbfa6c4d2 since build 3115 the logging callback should be hooked up and enable that.
Assigning back to you for a re-run with debug enabled on libcouchbase please!
Rerun with eventing log level as DEBUG on 7.0.0-4617
./testrunner.py -i /tmp/win10-bucket-ops.ini -p get-cbcollect-info=True,GROUP=timer_op_curl -t eventing.eventing_rebalance.EventingRebalance.test_rebalance_out_with_different_topologies,doc-per-day=20,dataset=default,nodes_init=5,services_init=kv-kv:eventing-kv:eventing-kv:eventing-index:n1ql,groups=simple,server_out=3,reset_services=True,GROUP=timer_op_curl,handler_code=timer_op_curl_jenkins,curl=True,host=http://qa.sc.couchbase.com/,eventing_log_level=DEBUG
|
|
Exception: missing data in destination bucket. Current : 40316 Expected : 40320 dcp_backlog : 0 TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.106.67': {'210': 1}, '172.23.106.73': {'210': 1}}
|
Logs attached, eventing node 172.23.106.67,172.23.106.73
I've been looking through the debug logs, but nothing jumped out immediately for me. Asking Sergey Avseyev to take a look at them next since he knows it in greater detail than I do.
Vikas Chaudhary, could you point out which log file contains the message about removing the node?
Sergey Avseyev Orchestrator node is 172.23.106.64-20210311-1050-diag.zip . Check all the logs uploaded on 11th Mar
Vikas Chaudhary, ns_server.eventing.log does not have any message about removing the node in the archive you are pointing. Is there some standalone or smaller example/test that reproduces the issue? Could you help to reconstruct the sequence of the events from the failing eventing node point of view? Is it only reproducing with timers?
Also I've noticed that list of the services in the last run is different compared to the original test invocation. Is this the minimal configuration that reproduces the bug?
Sergey Avseyev the following test reproduces it with every run
./testrunner.py -i /tmp/win10-bucket-ops.ini -p get-cbcollect-info=True,GROUP=timer_op_curl -t eventing.eventing_rebalance.EventingRebalance.test_rebalance_out_with_different_topologies,doc-per-day=20,dataset=default,nodes_init=5,services_init=kv-kv:eventing-kv:eventing-kv:eventing-index:n1ql,groups=simple,server_out=3,reset_services=True,GROUP=timer_op_curl,handler_code=timer_op_curl_jenkins,curl=True,host=http://qa.sc.couchbase.com/,eventing_log_level=DEBUG
|
|
"lcb_exception_stats": {
|
"210": 2
|
},
|
Here are the test steps (4GB , 4core machines)
- Create 5 node cluster (kv, kv-eventing, kv-eventing, kv-eventing , index-n1ql)
- deploy eventing handler with timer + curl
- Load data and remove kv-eventing(both co-exist) node from the cluster and wait for all the mutations to process
logs:
[s3://cb-customers-secure/mb-44324/2021-04-04/collectinfo-2021-04-04t082151-ns_1@172.23.106.64.zip]
[s3://cb-customers-secure/mb-44324/2021-04-04/collectinfo-2021-04-04t082151-ns_1@172.23.106.67.zip]
[s3://cb-customers-secure/mb-44324/2021-04-04/collectinfo-2021-04-04t082151-ns_1@172.23.106.73.zip]
[s3://cb-customers-secure/mb-44324/2021-04-04/collectinfo-2021-04-04t082151-ns_1@172.23.106.75.zip]
172.23.106.64-20210405-1727-diag.zip 172.23.106.67-20210405-1727-diag.zip
172.23.106.73-20210405-1727-diag.zip
172.23.106.74-20210405-1727-diag.zip
172.23.106.75-20210405-1727-diag.zip
172.23.106.76-20210405-1727-diag.zip
172.23.106.77-20210405-1727-diag.zip
172.23.106.78-20210405-1727-diag.zip
172.23.106.79-20210405-1727-diag.zip
172.23.106.80-20210405-1727-diag.zip
172.23.106.81-20210405-1727-diag.zip
Logs from my repro of the same test case
From .106.67 eventing debug log:
2021-04-05T17:23:05.176-07:00 [Debug] Consumer::routeResponse [worker_Function_153343085_test_rebalance_out_with_different_topologies_1:/tmp/127.0.0.1:8091_1_1306880244.sock:44734] Received insight: {
|
"script": "'use strict';\u000afunction OnUpdate(doc, meta) {\u000a var expiry = new Date();\u000a expiry.setSeconds(expiry.getSeconds() + 30);\u000a\u000a var context = {docID : meta.id, random_text : \u0022e6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh0R7Aumoe6cZZGHuh07Aumoe6cZZGHuh07Aumoe6cZZGHuh07Aumoe6\u0022};\u000a createTimer(timerCallback, expiry, meta.id, context);\u000a}\u000a\u000a\u000a\u000afunction OnDelete(meta) {\u000a var expiry = new Date();\u000a expiry.setSeconds(expiry.getSeconds() + 30);\u000a\u000a var context = {docID : meta.id};\u000a createTimer(NDtimerCallback, expiry, meta.id, context);\u000a}\u000a\u000afunction NDtimerCallback(context) {\u000a var request = {\u000a\u0009path : 'job/test_suite_executor/api/json?tree=jobs[component]'\u000a };\u000a try {\u000a \u0009var response = curl(\u0022GET\u0022, server, request);\u000a \u0009log('response body received from server:', response.body);\u000a \u0009log('response headers received from server:', response.headers);\u000a \u0009log('response status received from server:', response.status);\u000a \u0009var res= new Uint8Array(response.body);\u000a \u0009delete dst_bucket[context.docID];\u000a }\u000a catch (e) {\u000a \u0009log('error:', e);\u000a }\u000a}\u000afunction timerCallback(context) {\u000a var request = {\u000a\u0009path : 'job/test_suite_executor/api/json?tree=jobs[component]'\u000a };\u000a try {\u000a \u0009var response = curl(\u0022GET\u0022, server, request);\u000a \u0009log('response body received from server:', response.body);\u000a \u0009log('response headers received from server:', response.headers);\u000a \u0009log('response status received from server:', response.status);\u000a \u0009var res= new Uint8Array(response.body);\u000a \u0009if(response.status == 200){\u000a \u0009 
dst_bucket[context.docID]=response.body;\u000a \u0009}\u000a \u0009else{\u000a \u0009 dst_bucket[meta.id] = response.status;\u000a \u0009}\u000a }\u000a catch (e) {\u000a \u0009log('error:', e);\u000a }\u000a}\u000a\u000a",
|
"lines": {
|
"41": {
|
"call_count": 0,
|
"call_time": 0,
|
"error_count": 0,
|
"error_msg": "",
|
"last_log": "\u0022response body received from server:\u0022 {\u0022_class\u0022:\u0022hudson.model.FreeStyleProject\u0022} "
|
},
|
"42": {
|
"call_count": 0,
|
"call_time": 0,
|
"error_count": 0,
|
"error_msg": "",
|
"last_log": "\u0022response headers received from server:\u0022 {\u0022Content-Length\u0022:\u0022 62\u005cr\u005cn\u0022,\u0022X-Content-Type-Options\u0022:\u0022 nosniff\u005cr\u005cn\u0022,\u0022Date\u0022:\u0022 Tue, 06 Apr 2021 00:12:33 GMT\u005cr\u005cn\u0022,\u0022Content-Encoding\u0022:\u0022 gzip\u005cr\u005cn\u0022,\u0022X-Jenkins\u0022:\u0022 2.108\u005cr\u005cn\u0022,\u0022X-Jenkins-Session\u0022:\u0022 71262ec0\u005cr\u005cn\u0022,\u0022Server\u0022:\u0022 Jetty(9.4.z-SNAPSHOT)\u005cr\u005cn\u0022,\u0022Content-Type\u0022:\u0022 application/json;charset=utf-8\u005cr\u005cn\u0022} "
|
},
|
"43": {
|
"call_count": 0,
|
"call_time": 0,
|
"error_count": 0,
|
"error_msg": "",
|
"last_log": "\u0022response status received from server:\u0022 200 "
|
},
|
"53": {
|
"call_count": 0,
|
"call_time": 0,
|
"error_count": 0,
|
"error_msg": "",
|
"last_log": "\u0022error:\u0022 {\u0022message\u0022:{\u0022code\u0022:0,\u0022desc\u0022:\u0022LCB_ERR_BUCKET_NOT_FOUND (210): A request is made but the current bucket is not found\u0022,\u0022name\u0022:\u0022LCB_ERR_BUCKET_NOT_FOUND\u0022},\u0022stack\u0022:\u0022Error\u005cn at Function.timerCallback (Function_153343085_test_rebalance_out_with_different_topologies.js:47:35)\u0022} "
|
}
|
}
|
|
}
|
2021-04-05T17:23:05.176-07:00 [Info] eventing-consumer [worker_Function_153343085_test_rebalance_out_with_different_topologies_2:/tmp/127.0.0.1:8091_2_1306880244.sock:44750] Responding with insight {
|
Server build 7.0.0-4857
Sorry for the delay, situational tests have found an issue in the release candidate for libcouchbase 3.1.1. I've been working on the fix, now switching back to eventing tickets and I will update this one shortly.
I've managed to build standalone reproduction of the issue. I'm working on the fix now. ETA is tomorrow (April 16, 2021).
Vikas Chaudhary, could you check out the patch from (CCBC-1394), and retry the tests?
http://review.couchbase.org/c/libcouchbase/+/151396
Hi Chanabasappa Ghali since Vikas is out, can you pls take a look at this ?
tests are running @
http://qa.sc.couchbase.com/job/test_suite_executor/337296/
http://qa.sc.couchbase.com/job/test_suite_executor/337297/
http://qa.sc.couchbase.com/job/test_suite_executor/337298/
Sergey Avseyev looks like all the tests have passed with the toy build that has the fix for CCBC-1394
Pablo Silberkasten Sergey Avseyev Arunkumar Senthilnathan We would need a released version of libcouchbase with this fix on top of 3.1.1 release so that we can incorporate into CB Server 7.0. Thank you!
The change has been merged to master, release 3.1.2 is planned for Friday. Meanwhile could you advance SHA1 in manifest to c84648dbc3bde08372de934053197d579a0fe5ca (current master branch) and run the rest of the eventing test suite?
http://latestbuilds.service.couchbase.com/builds/latestbuilds/couchbase-server/toybuilds/12746/
Contains latest libcouchbase changes.
http://server.jenkins.couchbase.com/view/Toys/job/toy-unix-simple/2746/
Arunkumar Senthilnathan, Chanabasappa Ghali I am curious, which Toy did you use with jobs at https://issues.couchbase.com/browse/MB-44324?focusedCommentId=492818&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-492818?
Ankit just triggered a Toy with SHA1 in manifest set to c84648dbc3bde08372de934053197d579a0fe5ca and details are provided at https://issues.couchbase.com/browse/MB-44324?focusedCommentId=493233&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-493233
Jeelan Poola we used below toy build
http://latestbuilds.service.couchbase.com/builds/latestbuilds/couchbase-server/toybuilds/12727/
Jeelan Poola old toy had libcouchbase 3.1.1 + the changes that Sergey Avseyev provided for this issue.
New toy contains all the libcouchbase changes till c84648dbc3bde08372de934053197d579a0fe5ca
Chanabasappa Ghali Hope regression tests are running on Toy http://latestbuilds.service.couchbase.com/builds/latestbuilds/couchbase-server/toybuilds/12746/. Would be great to know the current status.
Any updates Chanabasappa Ghali? Would be great to know that it's verified fixed.
Matt Ingenthron while running with toy build 12746, the following exceptions were found and the same was communicated to the dev team with logs.
ERROR: test_rebalance_in_with_different_topologies (eventing.eventing_rebalance_collection.EventingRebalanceCollection)
----------------------------------------------------------------------
Traceback (most recent call last):
File "pytests/eventing/eventing_rebalance_collection.py", line 403, in test_rebalance_in_with_different_topologies
self.verify_doc_count_collections("src_bucket.scope_1.coll_1", self.docs_per_day * self.num_docs * 2)
File "./pytests/eventing/eventing_base.py", line 1047, in verify_doc_count_collections
actual_count, expected_count, total_dcp_backlog, timers_in_past, lcb))
Exception: missing data in destination bucket. Current : 63710 Expected : 80640 dcp_backlog : 0 TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.108.238': {}, '172.23.108.239': {}, '172.23.108.46': {}}----------------------------------------------------------------------
Ran 1 test in 1833.389s
======================================================================
ERROR: test_killing_eventing_processes_during_eventing_rebalance (eventing.eventing_rebalance.EventingRebalance)
----------------------------------------------------------------------
Traceback (most recent call last):
File "pytests/eventing/eventing_rebalance.py", line 1391, in test_killing_eventing_processes_during_eventing_rebalance
self.verify_eventing_results(self.function_name, 0, skip_stats_validation=True)
File "./pytests/eventing/eventing_base.py", line 377, in verify_eventing_results
timers_in_past,lcb))
Exception: duplicated data in destination bucket which is not expected. Current : 22452 Expected : 0 dcp_backlog : 0 TIMERS_IN_PAST : 0 lcb_exceptions : {'172.23.97.220':
, '172.23.97.221': {'12009': 667824, '405': 667824}, '172.23.97.222': {}}
----------------------------------------------------------------------
Ran 1 test in 1580.318s
Chanabasappa Ghali, looks like it is a query engine issue. Error 405 is LCB_ERR_DML_FAILURE and we don't retry it automatically. This error was introduced because of MB-44328.
Sergey Avseyev Not seeing the issue on toy http://latestbuilds.service.couchbase.com/builds/latestbuilds/couchbase-server/toybuilds/12746/ . But do we have an ns-server issue with the toys?
cc: Ankit Prabhu Vinayaka Kamath can you confirm?
Vikas Chaudhary manifest changes are merged. Next build should have changes with libcouchbase 3.1.2
Verified with build 7.0.0-5043, issue not observed. Hence closing the issue
Vikas Chaudhary, looks like the bucket actually got deleted at 22:59:05.
2564:2021-02-12T22:59:05.034-08:00, ns_memcached:0:info:message(ns_1@172.23.123.61) - Shutting down bucket "src_bucket" on 'ns_1@172.23.123.61' for deletion
2565:2021-02-12T22:59:05.041-08:00, ns_memcached:0:info:message(ns_1@172.23.122.90) - Shutting down bucket "src_bucket" on 'ns_1@172.23.122.90' for deletion
2566:2021-02-12T22:59:05.041-08:00, ns_memcached:0:info:message(ns_1@172.23.123.62) - Shutting down bucket "src_bucket" on 'ns_1@172.23.123.62' for deletion
All the "LCB_ERR_BUCKET_NOT_FOUND" came after bucket got deleted.
ns_server.eventing.log:1159877:2021-02-12T22:59:05.197-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:961 I:2979364568] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f6374168ca0,IX=1) Connection attempt failed. Received LCB_ERR_BUCKET_NOT_FOUND (210) from libcouchbase, received 0 from operating system
ns_server.eventing.log:1159878:2021-02-12T22:59:05.197-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:961 I:2979364568] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f63741f83f0,IX=0) Connection attempt failed. Received LCB_ERR_BUCKET_NOT_FOUND (210) from libcouchbase, received 0 from operating system
ns_server.eventing.log:1159879:2021-02-12T22:59:05.197-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:818 I:2979364568] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f63741f83f0,IX=0) Failing command (pkt=0x7f636c0475f0, opaque=324398, opcode=0x0) with error LCB_ERR_BUCKET_NOT_FOUND (210)
ns_server.eventing.log:1159880:2021-02-12T22:59:05.197-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:818 I:2979364568] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f63741f83f0,IX=0) Failing command (pkt=0x7f636c144030, opaque=324399, opcode=0xb5) with error LCB_ERR_BUCKET_NOT_FOUND (210)
ns_server.eventing.log:1159881:2021-02-12T22:59:05.197-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,cccp L:187 I:2979364568] <NOHOST:NOPORT> (CTX=(nil),) Could not get configuration: LCB_ERR_BUCKET_NOT_FOUND (210)
ns_server.eventing.log:1159885:2021-02-12T22:59:05.199-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:961 I:1960138947] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f6374085290,IX=2) Connection attempt failed. Received LCB_ERR_BUCKET_NOT_FOUND (210) from libcouchbase, received 0 from operating system
ns_server.eventing.log:1159886:2021-02-12T22:59:05.199-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:818 I:1960138947] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f6374085290,IX=2) Failing command (pkt=0x7f6368032f50, opaque=325626, opcode=0x0) with error LCB_ERR_BUCKET_NOT_FOUND (210)
ns_server.eventing.log:1159887:2021-02-12T22:59:05.199-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:961 I:1960138947] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f6374084330,IX=0) Connection attempt failed. Received LCB_ERR_BUCKET_NOT_FOUND (210) from libcouchbase, received 0 from operating system
ns_server.eventing.log:1159888:2021-02-12T22:59:05.199-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:818 I:1960138947] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f6374084330,IX=0) Failing command (pkt=0x7f636802cc10, opaque=325627, opcode=0xb5) with error LCB_ERR_BUCKET_NOT_FOUND (210)
ns_server.eventing.log:1159889:2021-02-12T22:59:05.199-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,cccp L:187 I:1960138947] <NOHOST:NOPORT> (CTX=(nil),) Could not get configuration: LCB_ERR_BUCKET_NOT_FOUND (210)
ns_server.eventing.log:1159893:2021-02-12T22:59:05.204-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:961 I:2979364568] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f6374169170,IX=2) Connection attempt failed. Received LCB_ERR_BUCKET_NOT_FOUND (210) from libcouchbase, received 0 from operating system
ns_server.eventing.log:1159894:2021-02-12T22:59:05.204-08:00 [Info] eventing-consumer [worker_timers_0:/tmp/127.0.0.1:8091_0_1452344633.sock:84204] [lcb,server L:818 I:2979364568] <NOHOST:NOPORT> (CTX=(nil),,SRV=0x7f6374169170,IX=2) Failing command (pkt=0x7f6374006820, opaque=324401, opcode=0x0) with error LCB_ERR_BUCKET_NOT_FOUND (210)
From test.log:
[2021-02-12 22:59:03,344] - [bucket_helper] [140097149204224] - INFO - deleting existing buckets ['dst_bucket', 'metadata', 'src_bucket'] on 172.23.123.61
[2021-02-12 22:59:04,539] - [bucket_helper] [140097149204224] - INFO - waiting for bucket deletion to complete....
[2021-02-12 22:59:04,561] - [rest_client] [140097149204224] - INFO - node 172.23.123.61 existing buckets : ['metadata', 'src_bucket']
[2021-02-12 22:59:04,562] - [bucket_helper] [140097149204224] - INFO - deleted bucket : dst_bucket from 172.23.123.61
[2021-02-12 22:59:04,993] - [bucket_helper] [140097149204224] - INFO - waiting for bucket deletion to complete....
[2021-02-12 22:59:05,005] - [rest_client] [140097149204224] - INFO - node 172.23.123.61 existing buckets : ['src_bucket']
[2021-02-12 22:59:05,005] - [bucket_helper] [140097149204224] - INFO - deleted bucket : metadata from 172.23.123.61
[2021-02-12 22:59:06,185] - [bucket_helper] [140097149204224] - INFO - waiting for bucket deletion to complete....
[2021-02-12 22:59:06,191] - [rest_client] [140097149204224] - INFO - node 172.23.123.61 existing buckets : []