From ae60b3a4511b163256a7596224f7a3af2b8f9e8d Mon Sep 17 00:00:00 2001 From: Ronan Dunklau Date: Wed, 10 Apr 2024 19:39:35 +0200 Subject: [PATCH 1/2] On new config, wait until the next check is complete before taking any decision --- pglookout/pglookout.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index 4463df1..7829082 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -860,29 +860,33 @@ def _get_check_interval(self) -> float: def main_loop(self): while self.running: + new_config = False if self.config_reload_pending: self.config_reload_pending = False try: self.load_config() + new_config = True except: self.config_reload_pending = True raise - try: - self._apply_latest_config_version() - except Exception as ex: # pylint: disable=broad-except - self.log.exception("Failed to update configuration") - self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state") - try: - self.check_cluster_state() - self._check_cluster_monitor_thread_health(now=time.monotonic()) - except Exception as ex: # pylint: disable=broad-except - self.log.exception("Failed to check cluster state") - self.stats.unexpected_exception(ex, where="main_loop_check_cluster_state") - try: - self.write_cluster_state_to_json_file() - except Exception as ex: # pylint: disable=broad-except - self.log.exception("Failed to write cluster state") - self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state") + # If we have a new config, wait for the requested check to be completed before we try anything else. + if not new_config: + try: + self._apply_latest_config_version() + except Exception as ex: # pylint: disable=broad-except + self.log.exception("Failed to update configuration") + self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state") + try: + self.check_cluster_state() + self._check_cluster_monitor_thread_health(now=time.monotonic()) + except Exception as ex: # pylint: disable=broad-except + self.log.exception("Failed to check cluster state") + self.stats.unexpected_exception(ex, where="main_loop_check_cluster_state") + try: + self.write_cluster_state_to_json_file() + except Exception as ex: # pylint: disable=broad-except + self.log.exception("Failed to write cluster state") + self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state") try: self.failover_decision_queue.get(timeout=self._get_check_interval()) q = self.failover_decision_queue From 9ffc8c6fe100119264098f3f7fb2efa732fc9409 Mon Sep 17 00:00:00 2001 From: Ronan Dunklau Date: Wed, 10 Apr 2024 17:17:28 +0200 Subject: [PATCH 2/2] Actually refresh the standby_nodes from the newly acquired state --- pglookout/pglookout.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index 7829082..42fbf3a 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -494,11 +494,17 @@ def consider_failover(self, own_state, master_node, standby_nodes): self.log.warning("Not considering failover, because it's not enabled by configuration") elif self.current_master: self.cluster_monitor_check_queue.put("Master is missing, ask for immediate state check") + # Refresh the standby nodes list, and check that we still don't have a master node self.failover_decision_queue.get(timeout=self.missing_master_from_config_timeout) + cluster_state = copy.deepcopy(self.cluster_state) + observer_state = copy.deepcopy(self.observer_state) + _, master_node, standby_nodes = self.create_node_map(cluster_state, observer_state) + # We seem to have a master node after all + if master_node and master_node.get("connection"): + return master_known_to_be_gone = self.current_master in self.known_gone_nodes now = time.monotonic() config_timeout_exceeded = (now - self.cluster_nodes_change_time) >= self.missing_master_from_config_timeout - if master_known_to_be_gone or config_timeout_exceeded: # we've seen a master at some point in time, but now it's # not reachable or removed from configuration, perform an