Wait for a recheck after config reload and detection of primary not being available #123

Merged (2 commits), Apr 11, 2024
pglookout/pglookout.py (44 changes: 27 additions & 17 deletions)

@@ -494,11 +494,17 @@ def consider_failover(self, own_state, master_node, standby_nodes):
             self.log.warning("Not considering failover, because it's not enabled by configuration")
         elif self.current_master:
             self.cluster_monitor_check_queue.put("Master is missing, ask for immediate state check")
+            # Refresh the standby nodes list, and check that we still don't have a master node
+            self.failover_decision_queue.get(timeout=self.missing_master_from_config_timeout)
+            cluster_state = copy.deepcopy(self.cluster_state)
+            observer_state = copy.deepcopy(self.observer_state)
+            _, master_node, standby_nodes = self.create_node_map(cluster_state, observer_state)
+            # We seem to have a master node after all
+            if master_node and master_node.get("connection"):
+                return
             master_known_to_be_gone = self.current_master in self.known_gone_nodes
             now = time.monotonic()
             config_timeout_exceeded = (now - self.cluster_nodes_change_time) >= self.missing_master_from_config_timeout
 
             if master_known_to_be_gone or config_timeout_exceeded:
                 # we've seen a master at some point in time, but now it's
                 # not reachable or removed from configuration, perform an
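To make the hunk above easier to follow: before failing over, the daemon now asks the cluster monitor for an immediate recheck, blocks on the failover decision queue until that recheck completes (bounded by missing_master_from_config_timeout), and bails out if a connectable master reappears after all. Below is a minimal, hypothetical sketch of that queue handshake; monitor_loop, poll_cluster, and consider_failover_sketch are stand-in names for illustration, not pglookout's API.

import queue
import threading

check_queue: "queue.Queue[str]" = queue.Queue()     # decider -> monitor: "please recheck now"
decision_queue: "queue.Queue[str]" = queue.Queue()  # monitor -> decider: "state was refreshed"
cluster_state = {"master": None}                    # shared state, written only by the monitor

def poll_cluster():
    # Hypothetical probe; stands in for the cluster monitor's real checks.
    return None

def monitor_loop():
    # Serve recheck requests and signal the decider once the state is fresh.
    while True:
        reason = check_queue.get()
        cluster_state["master"] = poll_cluster()
        decision_queue.put(f"rechecked: {reason}")

def consider_failover_sketch(timeout: float = 10.0):
    # Ask for an immediate recheck and wait for it before deciding anything.
    check_queue.put("Master is missing, ask for immediate state check")
    try:
        decision_queue.get(timeout=timeout)  # bounded wait, like missing_master_from_config_timeout
    except queue.Empty:
        pass  # the monitor did not answer in time; decide with what we have
    if cluster_state["master"] is not None:
        return  # a master appeared after all, no failover needed
    print("would start failover here")

threading.Thread(target=monitor_loop, daemon=True).start()
consider_failover_sketch()

The point of the handshake is that the failover decision is never made against stale state: the blocking get() forces at least one fresh observation of the cluster between "master looks gone" and "start promoting a standby".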
@@ -860,29 +866,33 @@ def _get_check_interval(self) -> float:

     def main_loop(self):
         while self.running:
+            new_config = False
             if self.config_reload_pending:
                 self.config_reload_pending = False
                 try:
                     self.load_config()
+                    new_config = True
                 except:
                     self.config_reload_pending = True
                     raise
-            try:
-                self._apply_latest_config_version()
-            except Exception as ex: # pylint: disable=broad-except
-                self.log.exception("Failed to update configuration")
-                self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state")
-            try:
-                self.check_cluster_state()
-                self._check_cluster_monitor_thread_health(now=time.monotonic())
-            except Exception as ex: # pylint: disable=broad-except
-                self.log.exception("Failed to check cluster state")
-                self.stats.unexpected_exception(ex, where="main_loop_check_cluster_state")
-            try:
-                self.write_cluster_state_to_json_file()
-            except Exception as ex: # pylint: disable=broad-except
-                self.log.exception("Failed to write cluster state")
-                self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state")
+            # If we have a new config, wait for the requested check to be completed before we try anything else.
+            if not new_config:
+                try:
+                    self._apply_latest_config_version()
+                except Exception as ex: # pylint: disable=broad-except
+                    self.log.exception("Failed to update configuration")
+                    self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state")
+                try:
+                    self.check_cluster_state()
+                    self._check_cluster_monitor_thread_health(now=time.monotonic())
+                except Exception as ex: # pylint: disable=broad-except
+                    self.log.exception("Failed to check cluster state")
+                    self.stats.unexpected_exception(ex, where="main_loop_check_cluster_state")
+                try:
+                    self.write_cluster_state_to_json_file()
+                except Exception as ex: # pylint: disable=broad-except
+                    self.log.exception("Failed to write cluster state")
+                    self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state")
             try:
-                self.failover_decision_queue.get(timeout=self._get_check_interval())
+                q = self.failover_decision_queue
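The gating introduced in this second hunk can also be read in isolation: load_config() sets a flag when a reload happened, and one round of regular work (applying the config version, checking cluster state, writing the state JSON) is skipped so the recheck requested by the reload completes before anything acts on possibly stale state. A condensed, hypothetical sketch follows; do_regular_work and the fixed 5-second interval are stand-ins for illustration, not pglookout's API.

import queue

class MainLoopSketch:
    def __init__(self):
        self.running = True
        self.config_reload_pending = False
        self.failover_decision_queue: "queue.Queue[str]" = queue.Queue()

    def load_config(self):
        # Placeholder for the real reload; in pglookout this can also
        # enqueue an immediate state check for the cluster monitor.
        pass

    def do_regular_work(self):
        # Placeholder for: apply config version, check cluster state,
        # write the cluster state JSON file.
        pass

    def main_loop(self):
        while self.running:
            new_config = False
            if self.config_reload_pending:
                self.config_reload_pending = False
                try:
                    self.load_config()
                    new_config = True
                except Exception:
                    self.config_reload_pending = True
                    raise
            # Skip one round of regular work right after a reload so the
            # recheck requested by the new config can complete first.
            if not new_config:
                self.do_regular_work()
            try:
                # Sleep for the check interval, waking early if a failover
                # decision is signalled on the queue.
                self.failover_decision_queue.get(timeout=5.0)
            except queue.Empty:
                pass

Note the design choice mirrored from the diff: a failed reload re-arms config_reload_pending before re-raising, so the reload is retried on the next iteration instead of being silently dropped.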