Details
-
Bug
-
Resolution: Fixed
-
Critical
-
None
-
None
-
Security Level: Public
-
None
Description
When running my wip commit that ensures things are persisted on replica and new master before initiating takeover (http://review.couchbase.org/18798) I sometimes get rebalance 'stuck'.
Seemingly because items are replicated & persisted (verified by looking at .couch files), but affected checkpoint is not marked as persisted.
In this particular case it happened to vbucket 13.
What my change is doing is I'm creating checkpoint (using create checkpoint command) after establishing replication stream. And then I'm waiting until checkpoint prior to that is persisted. But in this case stats tell me that checkpoint 0 is persisted instead of 2.
Relevant stats:
root@beta:~/src/altoros/moxi/ns_server/couch/0/default# ~/src/altoros/moxi/repo20/install/bin/cbstats lh:12000 raw 'checkpoint 13'
vb_13:checkpoint_extension: false
vb_13:eq_tapq:replication_building_13_'n_1@10.17.2.163':cursor_checkpoint_id: 3
vb_13:last_closed_checkpoint_id: 2
vb_13:num_checkpoint_items: 1
vb_13:num_checkpoints: 1
vb_13:num_items_for_persistence: 0
vb_13:num_open_checkpoint_items: 0
vb_13:num_tap_cursors: 1
vb_13:open_checkpoint_id: 3
vb_13:persisted_checkpoint_id: 2
vb_13:state: active
root@beta:~/src/altoros/moxi/ns_server/couch/0/default# ~/src/altoros/moxi/repo20/install/bin/cbstats lh:12002 raw 'checkpoint 13'
vb_13:checkpoint_extension: false
vb_13:last_closed_checkpoint_id: 2
vb_13:num_checkpoint_items: 1
vb_13:num_checkpoints: 1
vb_13:num_items_for_persistence: 0
vb_13:num_open_checkpoint_items: 0
vb_13:num_tap_cursors: 0
vb_13:open_checkpoint_id: 3
vb_13:persisted_checkpoint_id: 0
vb_13:state: replica
root@beta:~/src/altoros/moxi/ns_server/couch/0/default# ~/src/altoros/moxi/repo20/install/bin/cbstats lh:12002 raw 'tap'
ep_tap_ack_grace_period: 300
ep_tap_ack_interval: 1000
ep_tap_ack_window_size: 10
ep_tap_backoff_period: 5
ep_tap_bg_fetch_requeued: 0
ep_tap_bg_fetched: 0
ep_tap_bg_max_pending: 500
ep_tap_count: 2
ep_tap_deletes: 0
ep_tap_fg_fetched: 0
ep_tap_noop_interval: 20
ep_tap_queue_backfillremaining: 0
ep_tap_queue_backoff: 0
ep_tap_queue_drain: 0
ep_tap_queue_fill: 0
ep_tap_queue_itemondisk: 0
ep_tap_throttle_queue_cap: 1000000
ep_tap_throttle_threshold: 90
ep_tap_throttled: 0
ep_tap_total_backlog_size: 0
ep_tap_total_fetched: 0
ep_tap_total_queue: 0
eq_tapq:anon_1:connected: true
eq_tapq:anon_1:created: 14
eq_tapq:anon_1:num_checkpoint_end: 0
eq_tapq:anon_1:num_checkpoint_end_failed: 0
eq_tapq:anon_1:num_checkpoint_start: 13
eq_tapq:anon_1:num_checkpoint_start_failed: 0
eq_tapq:anon_1:num_delete: 0
eq_tapq:anon_1:num_delete_failed: 0
eq_tapq:anon_1:num_flush: 0
eq_tapq:anon_1:num_flush_failed: 0
eq_tapq:anon_1:num_mutation: 0
eq_tapq:anon_1:num_mutation_failed: 0
eq_tapq:anon_1:num_opaque: 2
eq_tapq:anon_1:num_opaque_failed: 0
eq_tapq:anon_1:num_unknown: 0
eq_tapq:anon_1:num_vbucket_set: 0
eq_tapq:anon_1:num_vbucket_set_failed: 0
eq_tapq:anon_1:pending_disconnect: false
eq_tapq:anon_1:reserved: 0
eq_tapq:anon_1:supports_ack: true
eq_tapq:anon_1:type: consumer
eq_tapq:anon_2:connected: true
eq_tapq:anon_2:created: 422
eq_tapq:anon_2:num_checkpoint_end: 0
eq_tapq:anon_2:num_checkpoint_end_failed: 0
eq_tapq:anon_2:num_checkpoint_start: 1
eq_tapq:anon_2:num_checkpoint_start_failed: 0
eq_tapq:anon_2:num_delete: 0
eq_tapq:anon_2:num_delete_failed: 0
eq_tapq:anon_2:num_flush: 0
eq_tapq:anon_2:num_flush_failed: 0
eq_tapq:anon_2:num_mutation: 119
eq_tapq:anon_2:num_mutation_failed: 0
eq_tapq:anon_2:num_opaque: 4
eq_tapq:anon_2:num_opaque_failed: 0
eq_tapq:anon_2:num_unknown: 0
eq_tapq:anon_2:num_vbucket_set: 0
eq_tapq:anon_2:num_vbucket_set_failed: 0
eq_tapq:anon_2:pending_disconnect: false
eq_tapq:anon_2:reserved: 0
eq_tapq:anon_2:supports_ack: true
eq_tapq:anon_2:type: consumer
root@beta:~/src/altoros/moxi/ns_server/couch/0/default#
root@beta:~/src/altoros/moxi/ns_server/couch/0/default# ~/src/altoros/moxi/repo20/install/bin/cbstats lh:12000 raw 'tap'
ep_tap_ack_grace_period: 300
ep_tap_ack_interval: 1000
ep_tap_ack_window_size: 10
ep_tap_backoff_period: 5
ep_tap_bg_fetch_requeued: 0
ep_tap_bg_fetched: 0
ep_tap_bg_max_pending: 500
ep_tap_count: 2
ep_tap_deletes: 0
ep_tap_fg_fetched: 119
ep_tap_noop_interval: 20
ep_tap_queue_backfillremaining: 0
ep_tap_queue_backoff: 0
ep_tap_queue_drain: 119
ep_tap_queue_fill: 0
ep_tap_queue_itemondisk: 0
ep_tap_throttle_queue_cap: 1000000
ep_tap_throttle_threshold: 90
ep_tap_throttled: 0
ep_tap_total_backlog_size: 0
ep_tap_total_fetched: 139
ep_tap_total_queue: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':ack_log_size: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':ack_seqno: 129
eq_tapq:replication_building_13_'n_1@10.17.2.163':ack_window_full: false
eq_tapq:replication_building_13_'n_1@10.17.2.163':backfill_completed: true
eq_tapq:replication_building_13_'n_1@10.17.2.163':bg_jobs_completed: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':bg_jobs_issued: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':bg_result_size: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':connected: true
eq_tapq:replication_building_13_'n_1@10.17.2.163':created: 422
eq_tapq:replication_building_13_'n_1@10.17.2.163':flags: 85 (ack,backfill,vblist,checkpoints)
eq_tapq:replication_building_13_'n_1@10.17.2.163':has_queued_item: false
eq_tapq:replication_building_13_'n_1@10.17.2.163':idle: true
eq_tapq:replication_building_13_'n_1@10.17.2.163':paused: 1
eq_tapq:replication_building_13_'n_1@10.17.2.163':pending_backfill: false
eq_tapq:replication_building_13_'n_1@10.17.2.163':pending_disconnect: false
eq_tapq:replication_building_13_'n_1@10.17.2.163':pending_disk_backfill: false
eq_tapq:replication_building_13_'n_1@10.17.2.163':qlen: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':qlen_high_pri: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':qlen_low_pri: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':queue_backfillremaining: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':queue_backoff: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':queue_drain: 119
eq_tapq:replication_building_13_'n_1@10.17.2.163':queue_fill: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':queue_itemondisk: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':queue_memory: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':rec_fetched: 124
eq_tapq:replication_building_13_'n_1@10.17.2.163':recv_ack_seqno: 128
eq_tapq:replication_building_13_'n_1@10.17.2.163':reserved: 1
eq_tapq:replication_building_13_'n_1@10.17.2.163':seqno_ack_requested: 128
eq_tapq:replication_building_13_'n_1@10.17.2.163':supports_ack: true
eq_tapq:replication_building_13_'n_1@10.17.2.163':suspended: false
eq_tapq:replication_building_13_'n_1@10.17.2.163':total_backlog_size: 0
eq_tapq:replication_building_13_'n_1@10.17.2.163':total_noops: 17
eq_tapq:replication_building_13_'n_1@10.17.2.163':type: producer
eq_tapq:replication_building_13_'n_1@10.17.2.163':vb_filter:
eq_tapq:replication_building_13_'n_1@10.17.2.163':vb_filters: 1
eq_tapq:replication_n_1@10.17.2.163:ack_log_size: 0
eq_tapq:replication_n_1@10.17.2.163:ack_seqno: 18
eq_tapq:replication_n_1@10.17.2.163:ack_window_full: false
eq_tapq:replication_n_1@10.17.2.163:backfill_completed: true
eq_tapq:replication_n_1@10.17.2.163:bg_jobs_completed: 0
eq_tapq:replication_n_1@10.17.2.163:bg_jobs_issued: 0
eq_tapq:replication_n_1@10.17.2.163:bg_result_size: 0
eq_tapq:replication_n_1@10.17.2.163:connected: true
eq_tapq:replication_n_1@10.17.2.163:created: 14
eq_tapq:replication_n_1@10.17.2.163:flags: 85 (ack,backfill,vblist,checkpoints)
eq_tapq:replication_n_1@10.17.2.163:has_queued_item: false
eq_tapq:replication_n_1@10.17.2.163:idle: true
eq_tapq:replication_n_1@10.17.2.163:paused: 1
eq_tapq:replication_n_1@10.17.2.163:pending_backfill: false
eq_tapq:replication_n_1@10.17.2.163:pending_disconnect: false
eq_tapq:replication_n_1@10.17.2.163:pending_disk_backfill: false
eq_tapq:replication_n_1@10.17.2.163:qlen: 0
eq_tapq:replication_n_1@10.17.2.163:qlen_high_pri: 0
eq_tapq:replication_n_1@10.17.2.163:qlen_low_pri: 0
eq_tapq:replication_n_1@10.17.2.163:queue_backfillremaining: 0
eq_tapq:replication_n_1@10.17.2.163:queue_backoff: 0
eq_tapq:replication_n_1@10.17.2.163:queue_drain: 0
eq_tapq:replication_n_1@10.17.2.163:queue_fill: 0
eq_tapq:replication_n_1@10.17.2.163:queue_itemondisk: 0
eq_tapq:replication_n_1@10.17.2.163:queue_memory: 0
eq_tapq:replication_n_1@10.17.2.163:rec_fetched: 15
eq_tapq:replication_n_1@10.17.2.163:recv_ack_seqno: 17
eq_tapq:replication_n_1@10.17.2.163:reserved: 1
eq_tapq:replication_n_1@10.17.2.163:seqno_ack_requested: 17
eq_tapq:replication_n_1@10.17.2.163:supports_ack: true
eq_tapq:replication_n_1@10.17.2.163:suspended: false
eq_tapq:replication_n_1@10.17.2.163:total_backlog_size: 0
eq_tapq:replication_n_1@10.17.2.163:total_noops: 36
eq_tapq:replication_n_1@10.17.2.163:type: producer
eq_tapq:replication_n_1@10.17.2.163:vb_filter:
eq_tapq:replication_n_1@10.17.2.163:vb_filters: 13
Attachments
For Gerrit Dashboard: MB-6084 | ||||||
---|---|---|---|---|---|---|
# | Subject | Branch | Project | Status | CR | V |
19355,2 | MB-6084 Reset checkpoint cursors after receiving backfill streams | master | ep-engine | Status: MERGED | +2 | +1 |