Skip to content

Commit

Permalink
Merge pull request #109 from aiven/packi-dont-failover-by-default
Browse files Browse the repository at this point in the history
Only enable the failover fix if configured

#109
  • Loading branch information
Samuel GIFFARD authored May 22, 2023
2 parents e69fb67 + 0ac108d commit 028ba9e
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 3 deletions.
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,11 @@ If set, it will increase the statsd counter `cluster_monitor_health_timeout` if
the `cluster_monitor` thread has not successfully completed a check within the last
`cluster_monitor_health_timeout_seconds` seconds.

``failover_on_disconnect`` (default ``true``)

Determines whether a failover decision is taken when we are no longer connected to the primary.
When set to ``false``, failover is not considered in that situation.


License
=======

Expand Down
8 changes: 7 additions & 1 deletion pglookout/pglookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def __init__(self, config_path):
self._start_time = None
self._config_version = 0
self._config_version_applied = 0
self._failover_on_disconnect = True
self.load_config()

signal.signal(signal.SIGHUP, self.load_config)
Expand Down Expand Up @@ -164,6 +165,7 @@ def load_config(self, _signal=None, _frame=None):
self.replication_lag_failover_timeout = self.config.get("max_failover_replication_time_lag", 120.0)
self.replication_catchup_timeout = self.config.get("replication_catchup_timeout", 300.0)
self.missing_master_from_config_timeout = self.config.get("missing_master_from_config_timeout", 15.0)
self._failover_on_disconnect = self.config.get("failover_on_disconnect", True)

if self.replication_lag_warning_boundary >= self.replication_lag_failover_timeout:
msg = "Replication lag warning boundary (%s) is not lower than its failover timeout (%s)"
Expand Down Expand Up @@ -483,7 +485,10 @@ def consider_failover(self, own_state, master_node, standby_nodes):
self.current_master,
master_node,
)
if self.current_master:
if not self._failover_on_disconnect and master_node:
self.stats.increase("failover_decision_on_disconnect_not_taken")
self.log.warning("Not considering failover, because it's not enabled by configuration")
elif self.current_master:
self.cluster_monitor_check_queue.put("Master is missing, ask for immediate state check")
master_known_to_be_gone = self.current_master in self.known_gone_nodes
now = time.monotonic()
Expand All @@ -506,6 +511,7 @@ def consider_failover(self, own_state, master_node, standby_nodes):
self.log.warning("Performing failover decision because no master node was seen in cluster before timeout")
self.do_failover_decision(own_state, standby_nodes)
return

self.check_replication_lag(own_state, standby_nodes)

def is_replication_lag_over_warning_limit(self):
Expand Down
9 changes: 7 additions & 2 deletions test/test_lookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,9 +711,11 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl):
assert pgl.execute_external_command.call_count == 1


def test_failover_master_one_standby_no_observer_no_connections(pgl):
@pytest.mark.parametrize("failover_on_disconnect", (True, False))
def test_failover_master_one_standby_no_observer_no_connections(pgl, failover_on_disconnect):
pgl.own_db = "this_host"
pgl.current_master = "primary"
pgl._failover_on_disconnect = failover_on_disconnect # pylint: disable=protected-access

# add db state
_add_db_to_cluster_state(pgl, "primary", pg_is_in_recovery=False, connection=False)
Expand Down Expand Up @@ -751,7 +753,10 @@ def test_failover_master_one_standby_no_observer_no_connections(pgl):
db_time=datetime.datetime.utcnow() - datetime.timedelta(seconds=pgl.replication_lag_failover_timeout + 1),
)
pgl.check_cluster_state()
assert pgl.execute_external_command.call_count == 1
if failover_on_disconnect:
assert pgl.execute_external_command.call_count == 1
else:
assert pgl.execute_external_command.call_count == 0


def test_find_current_master(pgl):
Expand Down

0 comments on commit 028ba9e

Please sign in to comment.