Details
- Type: Bug
- Status: Closed
- Priority: Major
- Resolution: Fixed
- 6.0.4
- Untriaged
- 1
- Unknown
Description
As seen in a recent support case, if KillAndRespawn restarts a consumer when there is no checkpoint, the consumer starts from sequence number 0. If the user had requested a "From Now" deployment, this is incorrect behavior: the consumer must start from the current sequence number, not 0. In other words, a missing checkpoint must be interpreted as 0 only when the feed boundary is "everything"; when the boundary is "from now", a missing checkpoint must be interpreted as the current sequence number.
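A minimal sketch of the intended interpretation (illustrative JavaScript only; the actual change lives in the Go eventing service, and the function and parameter names here are hypothetical):

// Hypothetical illustration of how a restarted consumer should pick its start seqno.
// checkpointSeqno is null/undefined when no checkpoint document exists.
function resolveStartSeqno(feedBoundary, checkpointSeqno, currentHighSeqno) {
    if (checkpointSeqno !== undefined && checkpointSeqno !== null) {
        return checkpointSeqno;        // resume from the recorded checkpoint
    }
    if (feedBoundary === "from_now") {
        return currentHighSeqno;       // missing checkpoint + from_now => current seq no
    }
    return 0;                          // missing checkpoint + everything => start from 0
}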
Attachments
Issue Links
- is a backport of MB-39994: After KillAndRespawn, from-now directive ignored (Closed)
Activity
Build couchbase-server-6.6.0-7828 contains eventing commit 5794f10 with commit message:
MB-40009 : record the start_seqno in the initial check point for from_now
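Illustratively, the fix amounts to seeding the initial checkpoint with the current high seqno at deployment time when the boundary is from_now, so a later KillAndRespawn always finds a usable checkpoint instead of falling back to 0. A hypothetical JavaScript sketch (the real implementation is in the Go eventing service; all names below are made up):

// Hypothetical sketch: for a from_now deployment, write an initial checkpoint carrying
// the current high seqno of each vbucket; "everything" keeps the default start of 0.
function writeInitialCheckpoints(feedBoundary, vbHighSeqnos, checkpointStore) {
    if (feedBoundary !== "from_now") {
        return;
    }
    for (const [vb, highSeqno] of Object.entries(vbHighSeqnos)) {
        checkpointStore[vb] = { start_seqno: highSeqno };   // assumed checkpoint shape
    }
}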
Vikas Chaudhary, can we please add a functional test to ensure reasonable behavior during timeouts? We can reduce the timeout and run a long N1QL query to induce such behavior. Let's discuss in a meeting, but I request adding a test before we Close/Verify this issue.
Sriram Melkote, I tried to verify this issue with the following steps:
- Load the source bucket with 1000 docs.
- Create a handler with curl (the curl response is delayed by 20 sec) and a handler timeout of 5 sec; a sketch of such a delayed endpoint appears after the log links below:
function OnUpdate(doc, meta) {
    log(meta.id);
    var request = {
        path: 'delay',
        headers: {
            "sample-header": "test"
        }
    };
    try {
        var response = curl("GET", server, request);
        log('response body received from server:', response.body);
        log('response headers received from server:', response.headers);
        log('response status received from server:', response.status);
        var res = new Uint8Array(response.body);
        if (response.status == 200) {
            dst_bucket[meta.id] = response.body;
        }
    }
    catch (e) {
        log('error:', e);
    }
}

function OnDelete(meta) {
    var request = {
        path: 'delay',
        headers: {
            "sample-header": "test"
        }
    };
    try {
        var response = curl("GET", server, request);
        log('response body received from server:', response.body);
        log('response headers received from server:', response.headers);
        log('response status received from server:', response.status);
        var res = new Uint8Array(response.body);
        if (response.status == 200) {
            delete dst_bucket[meta.id];
        }
    }
    catch (e) {
        log('error:', e);
    }
}
- Deploy the handler "from now" and update 100 docs (kill all eventing consumers).
- Observed: no failure or error, no mutations, no trace of the docs.
Handler stats at the time:
[
{
"dcp_feed_boundary": "from_now",
"event_processing_stats": {
"agg_messages_sent_to_worker": 14133,
"agg_queue_memory": 0,
"agg_queue_memory_cap": 267386880,
"agg_queue_size": 0,
"agg_queue_size_cap": 300000,
"agg_timer_feedback_queue_cap": 1500,
"dcp_stream_req_counter": 1919,
"dcp_stream_req_err_counter": 895,
"dcp_streamreq": 1024,
"execution_stats": 2617,
"failure_stats": 2617,
"latency_stats": 2617,
"lcb_exception_stats": 2617,
"log_level": 4,
"num_processed_events": 0,
"processed_events_size": 0,
"thr_count": 4,
"thr_map": 4,
"v8_init": 4,
"v8_load": 4,
"worker_spawn_counter": 5
},
"events_remaining": {
"dcp_backlog": 0
},
"execution_stats": {
"agg_queue_memory": 0,
"agg_queue_size": 0,
"curl": {
"delete": 0,
"get": 0,
"head": 0,
"post": 0,
"put": 0
},
"dcp_delete_msg_counter": 0,
"dcp_delete_parse_failure": 0,
"dcp_mutation_msg_counter": 0,
"dcp_mutation_parse_failure": 0,
"enqueued_dcp_delete_msg_counter": 0,
"enqueued_dcp_mutation_msg_counter": 0,
"enqueued_timer_msg_counter": 0,
"feedback_queue_size": 0,
"filtered_dcp_delete_counter": 0,
"filtered_dcp_mutation_counter": 0,
"lcb_retry_failure": 0,
"messages_parsed": 14115,
"num_processed_events": 0,
"on_delete_failure": 0,
"on_delete_success": 0,
"on_update_failure": 0,
"on_update_success": 0,
"processed_events_size": 0,
"timer_cancel_counter": 0,
"timer_create_counter": 0,
"timer_create_failure": 0,
"timer_msg_counter": 0,
"timer_responses_sent": 0,
"timestamp": {
"3615": "2020-06-29T07:57:09Z",
"3616": "2020-06-29T07:57:09Z",
"3655": "2020-06-29T07:57:09Z"
},
"uv_msg_parse_failure": 0,
"uv_try_write_failure_counter": 0
},
"failure_stats": {
"app_worker_setting_events_lost": 0,
"bucket_op_exception_count": 0,
"checkpoint_failure_count": 0,
"curl_non_200_response": 0,
"dcp_events_lost": 0,
"debugger_events_lost": 0,
"delete_events_lost": 0,
"mutation_events_lost": 0,
"n1ql_op_exception_count": 0,
"timeout_count": 0,
"timer_callback_missing_counter": 0,
"timer_context_size_exceeded_counter": 0,
"timer_events_lost": 0,
"timestamp": {
"3615": "2020-06-29T07:57:09Z",
"3616": "2020-06-29T07:57:09Z",
"3655": "2020-06-29T07:57:09Z"
},
"v8worker_events_lost": 0
},
"function_name": "delay1",
"gocb_creds_request_counter": 106,
"function_id": 2155015003,
"internal_vb_distribution_stats": {
"worker_delay1_0": "[0-341]",
"worker_delay1_2": "[683-1023]"
},
"latency_percentile_stats": {
"100": 0,
"50": 0,
"80": 0,
"90": 0,
"95": 0,
"99": 0
},
"lcb_creds_request_counter": 224,
"lcb_exception_stats": {},
"planner_stats": [
{
"host_name": "10.143.192.102:8096",
"start_vb": 0,
"vb_count": 1024
}
],
"metastore_stats": {},
"vb_distribution_stats_from_metadata": {
"10.143.192.102:8096": {
"worker_delay1_0": "[0-341]",
"worker_delay1_1": "[342-682]",
"worker_delay1_2": "[683-1023]"
}
},
"worker_pids": {
"worker_delay1_0": 3615,
"worker_delay1_2": 3655
}
}
]
Once it is deployed, if I load more data then I can see timeouts.
Logs (KV: .101, Eventing: .102):
https://cb-jira.s3.us-east-2.amazonaws.com/logs/MB-40009/collectinfo-2020-06-29T075803-ns_1%4010.143.192.101.zip
https://cb-jira.s3.us-east-2.amazonaws.com/logs/MB-40009/collectinfo-2020-06-29T075803-ns_1%4010.143.192.102.zip
https://cb-jira.s3.us-east-2.amazonaws.com/logs/MB-40009/collectinfo-2020-06-29T075803-ns_1%4010.143.192.103.zip
https://cb-jira.s3.us-east-2.amazonaws.com/logs/MB-40009/collectinfo-2020-06-29T075803-ns_1%4010.143.192.104.zip
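For reference, the 'delay' path used by the handler's curl call can be stood up with a tiny HTTP server that responds slower than the 5-second execution timeout. A hypothetical Node.js stand-in (not part of the original setup; the port and response body are assumptions):

// Hypothetical stand-in for the delayed endpoint: replies after 20 s, which is longer
// than the handler's 5 s execution timeout, so every curl() call in the handler times out.
const http = require('http');
http.createServer((req, res) => {
    setTimeout(() => {
        res.writeHead(200, { 'Content-Type': 'text/plain' });
        res.end('delayed response');
    }, 20000);                          // 20 second delay
}).listen(8080);                        // assumed port; point the curl binding at this address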
Kill and respawn happened on 6.0.4 but was not observed on 6.6.0 build 7827.
I reduced worker_response_timeout and am not seeing the kill happening:
root@node2-cb602-ubuntu16:/opt/couchbase/var/lib/couchbase/logs# tailf eventing.log | grep Producer::KillAndRespawnEventingConsumer
2020-06-30T01:16:01.543-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:2] IndexToPurge: 2 ConsumerIndex: 2 Shutting down Eventing.Consumer instance: consumer => app: n1ql name: worker_n1ql_2 tcpPort: /tmp/127.0.0.1:8091_worker_n1ql_2.sock ospid: 16320 dcpEventProcessed: DCP_STREAMREQ:339 v8EventProcessed: failure_stats:1 log_level:1 thr_count:1 v8_load:1 latency_stats:1 lcb_exception_stats:1 thr_map:1 v8_init:1 execution_stats:1
2020-06-30T01:16:01.544-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:2] IndexToPurge: 2 ConsumerIndex: 2 Closing down listener handles
2020-06-30T01:16:01.549-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:2] ConsumerIndex: 2 respawning the Eventing.Consumer instance
2020-06-30T01:16:01.630-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:2] IndexToPurge: 0 ConsumerIndex: 0 Shutting down Eventing.Consumer instance: consumer => app: n1ql name: worker_n1ql_0 tcpPort: /tmp/127.0.0.1:8091_worker_n1ql_0.sock ospid: 16318 dcpEventProcessed: DCP_STREAMREQ:337 v8EventProcessed: latency_stats:1 thr_map:1 thr_count:1 v8_init:1 v8_load:1 failure_stats:1 log_level:1 execution_stats:1 lcb_exception_stats:1
2020-06-30T01:16:01.636-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:2] IndexToPurge: 0 ConsumerIndex: 0 Closing down listener handles
2020-06-30T01:16:01.639-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:2] ConsumerIndex: 0 respawning the Eventing.Consumer instance
2020-06-30T01:16:01.692-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:1] IndexToPurge: 0 ConsumerIndex: 1 Shutting down Eventing.Consumer instance: consumer => app: n1ql name: worker_n1ql_1 tcpPort: /tmp/127.0.0.1:8091_worker_n1ql_1.sock ospid: 16319 dcpEventProcessed: DCP_STREAMREQ:341 v8EventProcessed: v8_init:1 v8_load:1 execution_stats:1 lcb_exception_stats:1 log_level:1 thr_map:1 thr_count:1 failure_stats:1 latency_stats:1
2020-06-30T01:16:01.692-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:1] IndexToPurge: 0 ConsumerIndex: 1 Closing down listener handles
2020-06-30T01:16:01.692-07:00 [Info] Producer::KillAndRespawnEventingConsumer [n1ql:1] ConsumerIndex: 1 respawning the Eventing.Consumer instance
Verified on 6.6.0 build 7850, where duplicate mutations were observed.
- Load 10k docs into the source bucket.
- Create a handler and deploy with "from now":
function OnUpdate(doc, meta) {
    log('docId', meta.id);
    var key = meta.id + "_" + Date.now() + Math.random();
    dst_bucket[key] = meta.id;
}

function OnDelete(meta, options) {
}
- Load more documents (1k) and kill the eventing consumers (this should be done within 30 sec of the initial deployment).
- Observed: duplicate documents. The destination bucket contains 2k documents, but processing did not start from the beginning (a sketch of counting duplicates from this key format follows below).
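Because the handler writes a fresh key per mutation (meta.id plus a timestamp and a random suffix), reprocessed mutations show up as extra destination documents rather than overwrites. A hypothetical helper for counting how many times each source doc was processed, given the destination keys:

// Hypothetical check: destination keys have the form "<srcId>_<timestamp><random>", and the
// appended part contains no '_', so splitting at the last '_' recovers the source doc id.
function countPerSourceDoc(destKeys) {
    const counts = {};
    for (const key of destKeys) {
        const srcId = key.substring(0, key.lastIndexOf('_'));
        counts[srcId] = (counts[srcId] || 0) + 1;   // a count > 1 means a duplicate mutation
    }
    return counts;
}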
Jeelan Poola and Till Westmann - Thank you. Please add this defect for 6.6.