Skip to content

Commit

Permalink
picks the random time for attempting new update (#3275)
Browse files Browse the repository at this point in the history
* random update time

* update test comment

* addressed comments

* address comments

* pylint warn

* addressed comment
  • Loading branch information
nagworld9 authored Dec 16, 2024
1 parent 50d37f8 commit 50fe8ca
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 125 deletions.
83 changes: 8 additions & 75 deletions azurelinuxagent/ga/agent_update_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+
import os

from azurelinuxagent.common import conf, logger
from azurelinuxagent.common.event import add_event, WALAEventOperation
Expand All @@ -25,6 +24,7 @@
from azurelinuxagent.common.utils import textutil
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import get_daemon_version
from azurelinuxagent.ga.guestagent import GuestAgentUpdateUtil
from azurelinuxagent.ga.rsm_version_updater import RSMVersionUpdater
from azurelinuxagent.ga.self_update_version_updater import SelfUpdateVersionUpdater

Expand All @@ -41,10 +41,6 @@ def get_agent_update_handler(protocol):
return AgentUpdateHandler(protocol)


RSM_UPDATE_STATE_FILE = "waagent_rsm_update"
INITIAL_UPDATE_STATE_FILE = "waagent_initial_update"


class AgentUpdateHandler(object):
"""
This class handles two type of agent updates. Handler initializes the updater to SelfUpdateVersionUpdater and switch to appropriate updater based on below conditions:
Expand Down Expand Up @@ -84,7 +80,7 @@ def __init__(self, protocol):
self._last_attempted_update_error_msg = ""

# Restore the state of rsm update. Default to self-update if last update is not with RSM or if agent doing initial update
if not self._get_is_last_update_with_rsm() or self._is_initial_update():
if not GuestAgentUpdateUtil.is_last_update_with_rsm() or GuestAgentUpdateUtil.is_initial_update():
self._updater = SelfUpdateVersionUpdater(self._gs_id)
else:
self._updater = RSMVersionUpdater(self._gs_id, self._daemon_version)
Expand All @@ -98,68 +94,6 @@ def _get_daemon_version_for_update():
# use the min version as 2.2.53 as we started setting the daemon version starting 2.2.53.
return FlexibleVersion("2.2.53")

@staticmethod
def _get_initial_update_state_file():
"""
This file keeps if initial update is attempted or not
"""
return os.path.join(conf.get_lib_dir(), INITIAL_UPDATE_STATE_FILE)

def _save_initial_update_state_file(self):
"""
Save the file if agent attempted initial update
"""
try:
with open(self._get_initial_update_state_file(), "w"):
pass
except Exception as e:
msg = "Error creating the initial update state file ({0}): {1}".format(self._get_initial_update_state_file(), ustr(e))
logger.warn(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)

def _is_initial_update(self):
"""
Returns True if state file doesn't exit as presence of file consider as initial update already attempted
"""
return not os.path.exists(self._get_initial_update_state_file())

@staticmethod
def _get_rsm_update_state_file():
"""
This file keeps if last attempted update is rsm or not.
"""
return os.path.join(conf.get_lib_dir(), RSM_UPDATE_STATE_FILE)

def _save_rsm_update_state_file(self):
"""
Save the rsm state empty file when we switch to RSM
"""
try:
with open(self._get_rsm_update_state_file(), "w"):
pass
except Exception as e:
msg = "Error creating the RSM state file ({0}): {1}".format(self._get_rsm_update_state_file(), ustr(e))
logger.warn(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)

def _remove_rsm_update_state_file(self):
"""
Remove the rsm state file when we switch to self-update
"""
try:
if os.path.exists(self._get_rsm_update_state_file()):
os.remove(self._get_rsm_update_state_file())
except Exception as e:
msg = "Error removing the RSM state file ({0}): {1}".format(self._get_rsm_update_state_file(), ustr(e))
logger.warn(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)

def _get_is_last_update_with_rsm(self):
"""
Returns True if state file exists as this consider as last update with RSM is true
"""
return os.path.exists(self._get_rsm_update_state_file())

def _get_agent_family_manifest(self, goal_state):
"""
Get the agent_family from last GS for the given family
Expand Down Expand Up @@ -214,8 +148,7 @@ def run(self, goal_state, ext_gs_updated):

# Always agent uses self-update for initial update regardless vm enrolled into RSM or not
# So ignoring the check for updater switch for the initial goal state/update
if not self._is_initial_update():

if not GuestAgentUpdateUtil.is_initial_update():
# Updater will return True or False if we need to switch the updater
# If self-updater receives RSM update enabled, it will switch to RSM updater
# If RSM updater receives RSM update disabled, it will switch to self-update
Expand All @@ -227,14 +160,14 @@ def run(self, goal_state, ext_gs_updated):
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)
self._updater = SelfUpdateVersionUpdater(self._gs_id)
self._remove_rsm_update_state_file()
GuestAgentUpdateUtil.remove_rsm_update_state_file()

if is_rsm_update_enabled and isinstance(self._updater, SelfUpdateVersionUpdater):
msg = "VM enabled for RSM updates, switching to RSM update mode"
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)
self._updater = RSMVersionUpdater(self._gs_id, self._daemon_version)
self._save_rsm_update_state_file()
GuestAgentUpdateUtil.save_rsm_update_state_file()

# If updater is changed in previous step, we allow update as it consider as first attempt. If not, it checks below condition
# RSM checks new goal state; self-update checks manifest download interval
Expand Down Expand Up @@ -284,8 +217,8 @@ def run(self, goal_state, ext_gs_updated):

# save initial update state when agent is doing first update
finally:
if self._is_initial_update():
self._save_initial_update_state_file()
if GuestAgentUpdateUtil.is_initial_update():
GuestAgentUpdateUtil.save_initial_update_state_file()

def get_vmagent_update_status(self):
"""
Expand All @@ -307,4 +240,4 @@ def get_vmagent_update_status(self):
msg = "Unable to report agent update status: {0}".format(textutil.format_exception(err))
logger.warn(msg)
add_event(op=WALAEventOperation.AgentUpgrade, is_success=False, message=msg, log_event=True)
return None
return None
71 changes: 71 additions & 0 deletions azurelinuxagent/ga/guestagent.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
MAX_FAILURE = 3 # Max failure allowed for agent before declare bad agent
AGENT_UPDATE_COUNT_FILE = "update_attempt.json" # File for tracking agent update attempt count

RSM_UPDATE_STATE_FILE = "waagent_rsm_update"
INITIAL_UPDATE_STATE_FILE = "waagent_initial_update"


class GuestAgent(object):
def __init__(self, path, pkg):
Expand Down Expand Up @@ -329,3 +332,71 @@ def to_json(self):
return data


class GuestAgentUpdateUtil(object):
    """
    Helpers for the empty marker files, kept under conf.get_lib_dir(), that record agent update state:
    one marks that the initial update was already attempted, the other that the last update was done with RSM.
    """

    @staticmethod
    def _report_state_file_error(template, state_file, error):
        """
        Format the failure message, log it as a warning and send it as an AgentUpgrade event
        """
        msg = template.format(state_file, ustr(error))
        logger.warn(msg)
        add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)

    @staticmethod
    def get_initial_update_state_file():
        """
        Path of the marker file that records whether the initial update has been attempted
        """
        lib_dir = conf.get_lib_dir()
        return os.path.join(lib_dir, INITIAL_UPDATE_STATE_FILE)

    @staticmethod
    def save_initial_update_state_file():
        """
        Create (or truncate) the empty marker file once the agent has attempted the initial update.
        Failures are not raised; they are logged and reported as an event.
        """
        try:
            with open(GuestAgentUpdateUtil.get_initial_update_state_file(), "w"):
                pass
        except Exception as error:
            GuestAgentUpdateUtil._report_state_file_error(
                "Error creating the initial update state file ({0}): {1}",
                GuestAgentUpdateUtil.get_initial_update_state_file(),
                error)

    @staticmethod
    def is_initial_update():
        """
        True while the marker file is absent; once the file exists the initial update counts as already attempted
        """
        state_file = GuestAgentUpdateUtil.get_initial_update_state_file()
        return not os.path.exists(state_file)

    @staticmethod
    def get_rsm_update_state_file():
        """
        Path of the marker file that records whether the last attempted update was an RSM update
        """
        lib_dir = conf.get_lib_dir()
        return os.path.join(lib_dir, RSM_UPDATE_STATE_FILE)

    @staticmethod
    def save_rsm_update_state_file():
        """
        Create (or truncate) the empty RSM marker file when the agent switches to RSM updates.
        Failures are not raised; they are logged and reported as an event.
        """
        try:
            with open(GuestAgentUpdateUtil.get_rsm_update_state_file(), "w"):
                pass
        except Exception as error:
            GuestAgentUpdateUtil._report_state_file_error(
                "Error creating the RSM state file ({0}): {1}",
                GuestAgentUpdateUtil.get_rsm_update_state_file(),
                error)

    @staticmethod
    def remove_rsm_update_state_file():
        """
        Delete the RSM marker file, if present, when the agent switches to self-update.
        Failures are not raised; they are logged and reported as an event.
        """
        try:
            state_file = GuestAgentUpdateUtil.get_rsm_update_state_file()
            if os.path.exists(state_file):
                os.remove(state_file)
        except Exception as error:
            GuestAgentUpdateUtil._report_state_file_error(
                "Error removing the RSM state file ({0}): {1}",
                GuestAgentUpdateUtil.get_rsm_update_state_file(),
                error)

    @staticmethod
    def is_last_update_with_rsm():
        """
        True when the RSM marker file exists, which means the last update was done with RSM
        """
        state_file = GuestAgentUpdateUtil.get_rsm_update_state_file()
        return os.path.exists(state_file)
63 changes: 37 additions & 26 deletions azurelinuxagent/ga/self_update_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
# Requires Python 2.6+ and Openssl 1.0+

import datetime
import random

from azurelinuxagent.common import conf, logger
from azurelinuxagent.common.event import add_event, WALAEventOperation
from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import CURRENT_VERSION
from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater
from azurelinuxagent.ga.guestagent import GuestAgentUpdateUtil


class SelfUpdateType(object):
Expand All @@ -38,7 +40,7 @@ class SelfUpdateVersionUpdater(GAVersionUpdater):
def __init__(self, gs_id):
super(SelfUpdateVersionUpdater, self).__init__(gs_id)
self._last_attempted_manifest_download_time = datetime.datetime.min
self._last_attempted_self_update_time = datetime.datetime.min
self._next_update_time = datetime.datetime.min

@staticmethod
def _get_largest_version(agent_manifest):
Expand All @@ -61,34 +63,35 @@ def _get_agent_upgrade_type(version):
return SelfUpdateType.Regular

@staticmethod
def _get_next_process_time(last_val, frequency, now):
def _get_next_process_time(upgrade_type, now):
"""
Get the next upgrade time
Returns random time in between 0 to 24hrs(regular) or 4hrs(hotfix) from now
"""
return now if last_val == datetime.datetime.min else last_val + datetime.timedelta(seconds=frequency)
if upgrade_type == SelfUpdateType.Hotfix:
frequency = conf.get_self_update_hotfix_frequency()
else:
frequency = conf.get_self_update_regular_frequency()
return now + datetime.timedelta(seconds=random.randint(0, frequency))

def _is_new_agent_allowed_update(self):
def _new_agent_allowed_now_to_update(self):
"""
This method ensure that update is allowed only once per (hotfix/Regular) upgrade frequency
This method is called when a new update is detected and computes a random time for the next update on the first call.
Since the method is called periodically until we reach the next update time, we shouldn't refresh or recompute the next update time on every call.
We use default value(datetime.datetime.min) to ensure the computation happens only once. This next_update_time will reset to default value(datetime.min) when agent allowed to update.
So that, in case the update fails due to an issue, such as a package download error, the same default value used to recompute the next update time.
"""
now = datetime.datetime.utcnow()
upgrade_type = self._get_agent_upgrade_type(self._version)
if upgrade_type == SelfUpdateType.Hotfix:
next_update_time = self._get_next_process_time(self._last_attempted_self_update_time,
conf.get_self_update_hotfix_frequency(), now)
else:
next_update_time = self._get_next_process_time(self._last_attempted_self_update_time,
conf.get_self_update_regular_frequency(), now)

if self._version > CURRENT_VERSION:
message = "Self-update discovered new {0} upgrade WALinuxAgent-{1}; Will upgrade on or after {2}".format(
upgrade_type, str(self._version), next_update_time.strftime(logger.Logger.LogTimeFormatInUTC))
logger.info(message)
add_event(op=WALAEventOperation.AgentUpgrade, message=message, log_event=False)

if next_update_time <= now:
# Update the last upgrade check time even if no new agent is available for upgrade
self._last_attempted_self_update_time = now
if self._next_update_time == datetime.datetime.min:
self._next_update_time = self._get_next_process_time(upgrade_type, now)
message = "Self-update discovered new {0} upgrade WALinuxAgent-{1}; Will upgrade on or after {2}".format(
upgrade_type, str(self._version), self._next_update_time.strftime(logger.Logger.LogTimeFormatInUTC))
logger.info(message)
add_event(op=WALAEventOperation.AgentUpgrade, message=message, log_event=False)

if self._next_update_time <= now:
self._next_update_time = datetime.datetime.min
return True
return False

Expand Down Expand Up @@ -150,14 +153,22 @@ def retrieve_agent_version(self, agent_family, goal_state):

def is_retrieved_version_allowed_to_update(self, agent_family):
"""
checks update is spread per (as specified in the conf.get_self_update_hotfix_frequency() or conf.get_self_update_regular_frequency())
or if version below than current version
return false when we don't allow updates.
we don't allow new version update, if
1) The version is not greater than current version
2) if current time is before next update time
Allow the update, if
1) Initial update
2) If current time is on or after next update time
"""
if not self._is_new_agent_allowed_update():
if self._version <= CURRENT_VERSION:
return False

if self._version <= CURRENT_VERSION:
# very first update need to proceed without any delay
if GuestAgentUpdateUtil.is_initial_update():
return True

if not self._new_agent_allowed_now_to_update():
return False

return True
Expand Down
Loading

0 comments on commit 50fe8ca

Please sign in to comment.