diff --git a/.github/workflows/ci_pr.yml b/.github/workflows/ci_pr.yml index a4565198d..e5592688c 100644 --- a/.github/workflows/ci_pr.yml +++ b/.github/workflows/ci_pr.yml @@ -10,74 +10,91 @@ on: jobs: test-legacy-python-versions: - name: "Python 2.6 Unit Tests" - runs-on: ubuntu-18.04 - strategy: fail-fast: false + matrix: + include: + - python-version: 2.6 + - python-version: 3.4 + + name: "Python ${{ matrix.python-version }} Unit Tests" + runs-on: ubuntu-20.04 + container: + image: ubuntu:16.04 + volumes: + - /home/waagent:/home/waagent + defaults: + run: + shell: bash -l {0} + env: NOSEOPTS: "--verbose" steps: + - uses: actions/checkout@v3 - - name: Install Python 2.6 + - name: Install Python ${{ matrix.python-version }} run: | - curl https://dcrdata.blob.core.windows.net/python/python-2.6.tar.bz2 -o python-2.6.tar.bz2 - sudo tar xjvf python-2.6.tar.bz2 --directory / - - - uses: actions/checkout@v2 + apt-get update + apt-get install -y curl bzip2 sudo python3 + curl https://dcrdata.blob.core.windows.net/python/python-${{ matrix.python-version }}.tar.bz2 -o python-${{ matrix.python-version }}.tar.bz2 + sudo tar xjvf python-${{ matrix.python-version }}.tar.bz2 --directory / - name: Test with nosetests run: | - source /home/waagent/virtualenv/python2.6.9/bin/activate + if [[ ${{ matrix.python-version }} == 2.6 ]]; then + source /home/waagent/virtualenv/python2.6.9/bin/activate + else + source /home/waagent/virtualenv/python3.4.8/bin/activate + fi ./ci/nosetests.sh exit $? test-current-python-versions: - + strategy: fail-fast: false matrix: include: - python-version: 2.7 - PYLINTOPTS: "--rcfile=ci/2.7.pylintrc" + PYLINTOPTS: "--rcfile=ci/2.7.pylintrc --ignore=tests_e2e,makepkg.py" - - python-version: 3.4 - PYLINTOPTS: "--rcfile=ci/2.7.pylintrc" + - python-version: 3.5 + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e,makepkg.py" - python-version: 3.6 - PYLINTOPTS: "--rcfile=ci/3.6.pylintrc" + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e" - python-version: 3.7 - PYLINTOPTS: "--rcfile=ci/3.6.pylintrc" - + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e" + - python-version: 3.8 - PYLINTOPTS: "--rcfile=ci/3.6.pylintrc" + PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e" - python-version: 3.9 PYLINTOPTS: "--rcfile=ci/3.6.pylintrc" additional-nose-opts: "--with-coverage --cover-erase --cover-inclusive --cover-branches --cover-package=azurelinuxagent" - + name: "Python ${{ matrix.python-version }} Unit Tests" - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 env: PYLINTOPTS: ${{ matrix.PYLINTOPTS }} - PYLINTFILES: "azurelinuxagent setup.py makepkg.py tests" + PYLINTFILES: "azurelinuxagent setup.py makepkg.py tests tests_e2e" NOSEOPTS: "--with-timer ${{ matrix.additional-nose-opts }}" PYTHON_VERSION: ${{ matrix.python-version }} steps: - + - name: Checkout WALinuxAgent repo - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - + - name: Install dependencies id: install-dependencies run: | @@ -106,6 +123,6 @@ jobs: - name: Upload Coverage if: matrix.python-version == 3.9 - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v2 with: file: ./coverage.xml \ No newline at end of file diff --git a/.gitignore b/.gitignore index d4c7873f2..fd64d3314 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,6 @@ develop-eggs/ dist/ downloads/ eggs/ -lib/ -lib64/ parts/ sdist/ var/ diff --git 
a/CODEOWNERS b/CODEOWNERS index 32cd27f22..8707e60a5 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -21,4 +21,4 @@ # # Linux Agent team # -* @narrieta @ZhidongPeng @nagworld9 +* @narrieta @ZhidongPeng @nagworld9 @maddieford diff --git a/README.md b/README.md index 996fafd5e..ae6a85106 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,9 @@ The agent will use an HTTP proxy if provided via the `http_proxy` (for `http` re `https_proxy` (for `https` requests) environment variables. The `HttpProxy.Host` and `HttpProxy.Port` configuration variables (see below), if used, will override the environment settings. Due to limitations of Python, the agent *does not* support HTTP proxies requiring -authentication. +authentication. Note that when the agent service is managed by systemd, environment variables +such as `http_proxy` and `https_proxy` should be defined using one of the mechanisms provided by +systemd (e.g. by using Environment or EnvironmentFile in the service file). ## Requirements diff --git a/azurelinuxagent/common/cgroup.py b/azurelinuxagent/common/cgroup.py index b22ea2994..b2bf32fbc 100644 --- a/azurelinuxagent/common/cgroup.py +++ b/azurelinuxagent/common/cgroup.py @@ -360,8 +360,7 @@ def try_swap_memory_usage(self): except CounterNotFound as e: if self._counter_not_found_error_count < 1: logger.periodic_info(logger.EVERY_HALF_HOUR, - 'Could not find swap counter from "memory.stat" file in the cgroup: {0}.' - ' Internal error: {1}'.format(self.path, ustr(e))) + '{0} from "memory.stat" file in the cgroup: {1}---[Note: This log is for informational purposes only and can be ignored]'.format(ustr(e), self.path)) self._counter_not_found_error_count += 1 return 0 diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py index 66e893ef6..ca0ef3bb5 100644 --- a/azurelinuxagent/common/cgroupapi.py +++ b/azurelinuxagent/common/cgroupapi.py @@ -253,7 +253,12 @@ def _is_systemd_failure(scope_name, stderr): return unit_not_found in stderr or scope_name not in stderr @staticmethod - def get_extension_slice_name(extension_name): + def get_extension_slice_name(extension_name, old_slice=False): + # The old slice name makes it difficult for users to override the limits, because they would need to place new drop-in files on every upgrade if the extension slice is different for each version. + # old slice name includes the version: <Publisher>.<ExtensionName>-<Version> + # new slice name omits the version: <Publisher>.<ExtensionName> + if not old_slice: + extension_name = extension_name.rsplit("-", 1)[0] # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.
return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice" diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index b22a26bcd..767786f01 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -26,7 +26,7 @@ from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup from azurelinuxagent.common.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry -from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException +from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import get_osutil, systemd from azurelinuxagent.common.version import get_distro @@ -143,6 +143,7 @@ def __init__(self): self._cgroups_api = None self._agent_cpu_cgroup_path = None self._agent_memory_cgroup_path = None + self._agent_memory_cgroup = None self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop. def initialize(self): @@ -213,7 +214,8 @@ def initialize(self): if self._agent_memory_cgroup_path is not None: _log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path) - CGroupsTelemetry.track_cgroup(MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path)) + self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path) + CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup) _log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled) @@ -366,10 +368,9 @@ def __setup_azure_slice(): if not os.path.exists(vmextensions_slice): files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS)) - if not os.path.exists(logcollector_slice): - slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA) - - files_to_create.append((logcollector_slice, slice_contents)) + # Update log collector slice contents + slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA) + files_to_create.append((logcollector_slice, slice_contents)) if fileutil.findre_in_file(agent_unit_file, r"Slice=") is not None: CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_slice) @@ -454,6 +455,11 @@ def __create_all_files(files_to_create): def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota=None): unit_file_install_path = systemd.get_unit_file_install_path() + old_extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name, old_slice=True)) + # clean up the old slice from the disk + if os.path.exists(old_extension_slice_path): + CGroupConfigurator._Impl.__cleanup_unit_file(old_extension_slice_path) + extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name)) cpu_quota = str( @@ -644,7 +650,7 @@ def _check_processes_in_agent_cgroup(self): Raises a CGroupsException if the check fails """ unexpected = [] - + agent_cgroup_proc_names = [] try: daemon = os.getppid() extension_handler = os.getpid() @@ -658,9 +664,13 @@ def _check_processes_in_agent_cgroup(self): systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) for process in 
agent_cgroup: + agent_cgroup_proc_names.append(self.__format_process(process)) # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't. if process in (daemon, extension_handler) or process in systemd_run_commands: continue + # check for a shell systemd-run process if the check above didn't catch it + if self._check_systemd_run_process(process): + continue # systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent if self._get_parent(process) in systemd_run_commands and self._get_command( process) == 'systemd-run': @@ -679,6 +689,7 @@ _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception))) if len(unexpected) > 0: + self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected) raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected)) @staticmethod @@ -741,6 +752,33 @@ def __is_zombie_process(pid): pass return False + @staticmethod + def _check_systemd_run_process(process): + """ + Returns True if the process is a shell systemd-run process started by the agent, otherwise False. + + Ex: sh,7345 -c systemd-run --unit=enable_7c5cab19-eb79-4661-95d9-9e5091bd5ae0 --scope --slice=azure-vmextensions-Microsoft.OSTCExtensions.VMAccessForLinux_1.5.11.slice /var/lib/waagent/Microsoft.OSTCExtensions.VMAccessForLinux-1.5.11/processes.sh + """ + try: + process_name = "UNKNOWN" + cmdline = '/proc/{0}/cmdline'.format(process) + if os.path.exists(cmdline): + with open(cmdline, "r") as cmdline_file: + process_name = "{0}".format(cmdline_file.read()) + match = re.search(r'systemd-run.*--unit=.*--scope.*--slice=azure-vmextensions.*', process_name) + if match is not None: + return True + except Exception: + pass + return False + + @staticmethod + def _report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected): + for proc_name in unexpected: + if 'UNKNOWN' in proc_name: + msg = "Agent cgroup includes the following processes (reported because an UNKNOWN process was found): {0}".format("\n".join([ustr(proc) for proc in agent_cgroup_proc_names])) + add_event(op=WALAEventOperation.CGroupsInfo, message=msg) + @staticmethod def _check_agent_throttled_time(cgroup_metrics): for metric in cgroup_metrics: @@ -748,6 +786,19 @@ def _check_agent_throttled_time(cgroup_metrics): if metric.value > conf.get_agent_cpu_throttled_time_threshold(): raise CGroupsException("The agent has been throttled for {0} seconds".format(metric.value)) + def check_agent_memory_usage(self): + if self.enabled() and self._agent_memory_cgroup: + metrics = self._agent_memory_cgroup.get_tracked_metrics() + current_usage = 0 + for metric in metrics: + if metric.counter == MetricsCounter.TOTAL_MEM_USAGE: + current_usage += metric.value + elif metric.counter == MetricsCounter.SWAP_MEM_USAGE: + current_usage += metric.value + + if current_usage > conf.get_agent_memory_quota(): + raise AgentMemoryExceededException("The agent memory limit of {0} bytes was exceeded.
The current reported usage is {1} bytes.".format(conf.get_agent_memory_quota(), current_usage)) + @staticmethod def _get_parent(pid): """ @@ -875,7 +926,10 @@ def setup_extension_slice(self, extension_name, cpu_quota): SystemdCgroupsApi.get_extension_slice_name(extension_name)) try: cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity) - _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota) + if cpu_quota == "": + _log_cgroup_info("CPUQuota not set for {0}", extension_name) + else: + _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota) slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name, cpu_quota=cpu_quota) CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents) diff --git a/azurelinuxagent/common/conf.py b/azurelinuxagent/common/conf.py index 3c6e960fd..46765ea98 100644 --- a/azurelinuxagent/common/conf.py +++ b/azurelinuxagent/common/conf.py @@ -136,6 +136,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): "Debug.CgroupLogMetrics": False, "Debug.CgroupDisableOnProcessCheckFailure": True, "Debug.CgroupDisableOnQuotaCheckFailure": True, + "Debug.EnableAgentMemoryUsageCheck": False, "Debug.EnableFastTrack": True, "Debug.EnableGAVersioning": False } @@ -186,6 +187,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): "Debug.CgroupCheckPeriod": 300, "Debug.AgentCpuQuota": 50, "Debug.AgentCpuThrottledTimeThreshold": 120, + "Debug.AgentMemoryQuota": 30 * 1024 ** 2, "Debug.EtpCollectionPeriod": 300, "Debug.AutoUpdateHotfixFrequency": 14400, "Debug.AutoUpdateNormalFrequency": 86400, @@ -555,6 +557,24 @@ def get_agent_cpu_throttled_time_threshold(conf=__conf__): return conf.get_int("Debug.AgentCpuThrottledTimeThreshold", 120) +def get_agent_memory_quota(conf=__conf__): + """ + Memory quota for the agent in bytes. + + NOTE: This option is experimental and may be removed in later versions of the Agent. + """ + return conf.get_int("Debug.AgentMemoryQuota", 30 * 1024 ** 2) + + +def get_enable_agent_memory_usage_check(conf=__conf__): + """ + If True, the Agent checks its memory usage. + + NOTE: This option is experimental and may be removed in later versions of the Agent. + """ + return conf.get_switch("Debug.EnableAgentMemoryUsageCheck", False) + + def get_cgroup_monitor_expiry_time(conf=__conf__): """ cgroups monitoring for pilot extensions disabled after expiry time diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index b7aba5e41..1f903a9fa 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -69,6 +69,7 @@ class WALAEventOperation: ActivateResourceDisk = "ActivateResourceDisk" AgentBlacklisted = "AgentBlacklisted" AgentEnabled = "AgentEnabled" + AgentMemory = "AgentMemory" AgentUpgrade = "AgentUpgrade" ArtifactsProfileBlob = "ArtifactsProfileBlob" CGroupsCleanUp = "CGroupsCleanUp" diff --git a/azurelinuxagent/common/exception.py b/azurelinuxagent/common/exception.py index 9b16c4267..048466232 100644 --- a/azurelinuxagent/common/exception.py +++ b/azurelinuxagent/common/exception.py @@ -58,6 +58,14 @@ def __init__(self, msg=None, inner=None): super(AgentConfigError, self).__init__(msg, inner) +class AgentMemoryExceededException(AgentError): + """ + When the Agent memory limit is reached.
+ """ + def __init__(self, msg=None, inner=None): + super(AgentMemoryExceededException, self).__init__(msg, inner) + + class AgentNetworkError(AgentError): """ When network is not available. diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index 0c0e016ee..be28ba9d8 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -109,6 +109,12 @@ def get_linux_distribution_from_distro(get_full_name): ) full_name = distro.linux_distribution()[0].strip() osinfo.append(full_name) + + # Fixing is the problem https://github.com/Azure/WALinuxAgent/issues/2715. Distro.linux_distribution method not retuning full version + # If best is true, the most precise version number out of all examined sources is returned. + if "mariner" in osinfo[0].lower(): + osinfo[1] = distro.version(best=True) + return osinfo diff --git a/azurelinuxagent/common/logcollector.py b/azurelinuxagent/common/logcollector.py index b0da848fc..fe62a7db6 100644 --- a/azurelinuxagent/common/logcollector.py +++ b/azurelinuxagent/common/logcollector.py @@ -34,6 +34,7 @@ # Please note: be careful when adding agent dependencies in this module. # This module uses its own logger and logs to its own file, not to the agent log. +from azurelinuxagent.common.protocol.goal_state import GoalStateProperties from azurelinuxagent.common.protocol.util import get_protocol_util _EXTENSION_LOG_DIR = get_ext_log_dir() @@ -117,8 +118,8 @@ def _set_resource_usage_cgroups(cpu_cgroup_path, memory_cgroup_path): @staticmethod def _initialize_telemetry(): - protocol = get_protocol_util().get_protocol() - protocol.client.update_goal_state(force_update=True) + protocol = get_protocol_util().get_protocol(init_goal_state=False) + protocol.client.reset_goal_state(goal_state_properties=GoalStateProperties.RoleConfig | GoalStateProperties.HostingEnv) # Initialize the common parameters for telemetry events initialize_event_logger_vminfo_common_parameters(protocol) diff --git a/azurelinuxagent/common/osutil/coreos.py b/azurelinuxagent/common/osutil/coreos.py index fc0a66043..373727e20 100644 --- a/azurelinuxagent/common/osutil/coreos.py +++ b/azurelinuxagent/common/osutil/coreos.py @@ -17,7 +17,7 @@ # import os -import azurelinuxagent.common.utils.shellutil as shellutil +from azurelinuxagent.common.utils import shellutil from azurelinuxagent.common.osutil.default import DefaultOSUtil @@ -78,7 +78,9 @@ def stop_agent_service(self): return shellutil.run("systemctl stop {0}".format(self.service_name), chk_err=False) def get_dhcp_pid(self): - return self._get_dhcp_pid(["systemctl", "show", "-p", "MainPID", "systemd-networkd"]) + return self._get_dhcp_pid( + ["systemctl", "show", "-p", "MainPID", "systemd-networkd"], + transform_command_output=lambda o: o.replace("MainPID=", "")) def conf_sshd(self, disable_password): # In CoreOS, /etc/sshd_config is mount readonly. Skip the setting. 
diff --git a/azurelinuxagent/common/osutil/default.py b/azurelinuxagent/common/osutil/default.py index 056c50e07..9fb97f157 100644 --- a/azurelinuxagent/common/osutil/default.py +++ b/azurelinuxagent/common/osutil/default.py @@ -36,11 +36,11 @@ import array -import azurelinuxagent.common.conf as conf -import azurelinuxagent.common.logger as logger -import azurelinuxagent.common.utils.fileutil as fileutil -import azurelinuxagent.common.utils.shellutil as shellutil -import azurelinuxagent.common.utils.textutil as textutil +from azurelinuxagent.common import conf +from azurelinuxagent.common import logger +from azurelinuxagent.common.utils import fileutil +from azurelinuxagent.common.utils import shellutil +from azurelinuxagent.common.utils import textutil from azurelinuxagent.common.exception import OSUtilError from azurelinuxagent.common.future import ustr, array_to_bytes @@ -1137,9 +1137,12 @@ def _text_to_pid_list(text): return [int(n) for n in text.split()] @staticmethod - def _get_dhcp_pid(command): + def _get_dhcp_pid(command, transform_command_output=None): try: - return DefaultOSUtil._text_to_pid_list(shellutil.run_command(command)) + output = shellutil.run_command(command) + if transform_command_output is not None: + output = transform_command_output(output) + return DefaultOSUtil._text_to_pid_list(output) except CommandError as exception: # pylint: disable=W0612 return [] diff --git a/azurelinuxagent/common/osutil/factory.py b/azurelinuxagent/common/osutil/factory.py index d48c49347..83123e3f5 100644 --- a/azurelinuxagent/common/osutil/factory.py +++ b/azurelinuxagent/common/osutil/factory.py @@ -40,6 +40,7 @@ from .photonos import PhotonOSUtil from .ubuntu import UbuntuOSUtil, Ubuntu12OSUtil, Ubuntu14OSUtil, \ UbuntuSnappyOSUtil, Ubuntu16OSUtil, Ubuntu18OSUtil +from .fedora import FedoraOSUtil def get_osutil(distro_name=DISTRO_NAME, @@ -153,5 +154,8 @@ def _get_osutil(distro_name, distro_code_name, distro_version, distro_full_name) if distro_name == "openwrt": return OpenWRTOSUtil() + if distro_name == "fedora": + return FedoraOSUtil() + logger.warn("Unable to load distro implementation for {0}. Using default distro implementation instead.", distro_name) return DefaultOSUtil() diff --git a/azurelinuxagent/common/osutil/fedora.py b/azurelinuxagent/common/osutil/fedora.py new file mode 100644 index 000000000..164b55ebf --- /dev/null +++ b/azurelinuxagent/common/osutil/fedora.py @@ -0,0 +1,77 @@ +# +# Copyright 2022 Red Hat Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Requires Python 2.6+ and Openssl 1.0+ +# + +import time +import azurelinuxagent.common.logger as logger +import azurelinuxagent.common.utils.shellutil as shellutil +from azurelinuxagent.common.osutil.default import DefaultOSUtil + + +class FedoraOSUtil(DefaultOSUtil): + + def __init__(self): + super(FedoraOSUtil, self).__init__() + self.agent_conf_file_path = '/etc/waagent.conf' + + @staticmethod + def get_systemd_unit_file_install_path(): + return '/usr/lib/systemd/system' + + @staticmethod + def get_agent_bin_path(): + return '/usr/sbin' + + def is_dhcp_enabled(self): + return True + + def start_network(self): + pass + + def restart_if(self, ifname=None, retries=None, wait=None): + retry_limit = retries+1 + for attempt in range(1, retry_limit): + return_code = shellutil.run("ip link set {0} down && ip link set {0} up".format(ifname)) + if return_code == 0: + return + logger.warn("failed to restart {0}: return code {1}".format(ifname, return_code)) + if attempt < retry_limit: + logger.info("retrying in {0} seconds".format(wait)) + time.sleep(wait) + else: + logger.warn("exceeded restart retries") + + def restart_ssh_service(self): + shellutil.run('systemctl restart sshd') + + def stop_dhcp_service(self): + pass + + def start_dhcp_service(self): + pass + + def start_agent_service(self): + return shellutil.run('systemctl start waagent', chk_err=False) + + def stop_agent_service(self): + return shellutil.run('systemctl stop waagent', chk_err=False) + + def get_dhcp_pid(self): + return self._get_dhcp_pid(["pidof", "dhclient"]) + + def conf_sshd(self, disable_password): + pass diff --git a/azurelinuxagent/common/protocol/goal_state.py b/azurelinuxagent/common/protocol/goal_state.py index ef4730503..6b2a0c2cf 100644 --- a/azurelinuxagent/common/protocol/goal_state.py +++ b/azurelinuxagent/common/protocol/goal_state.py @@ -48,6 +48,19 @@ _GET_GOAL_STATE_MAX_ATTEMPTS = 6 +class GoalStateProperties(object): + """ + Enum for defining the properties that we fetch in the goal state + """ + RoleConfig = 0x1 + HostingEnv = 0x2 + SharedConfig = 0x4 + ExtensionsGoalState = 0x8 + Certificates = 0x10 + RemoteAccessInfo = 0x20 + All = RoleConfig | HostingEnv | SharedConfig | ExtensionsGoalState | Certificates | RemoteAccessInfo + + class GoalStateInconsistentError(ProtocolError): """ Indicates an inconsistency in the goal state (e.g. missing tenant certificate) @@ -57,7 +70,7 @@ def __init__(self, msg, inner=None): class GoalState(object): - def __init__(self, wire_client, silent=False): + def __init__(self, wire_client, goal_state_properties=GoalStateProperties.All, silent=False): """ Fetches the goal state using the given wire client. 
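The `GoalStateProperties` values introduced above are plain bit flags; callers combine them with `|` and the goal state tests them with `&`. A small sketch (flag values copied from the diff; the `requested` example is hypothetical):

```python
# Sketch of the GoalStateProperties bit flags added in goal_state.py;
# values are copied from the diff, the usage below is illustrative.
class GoalStateProperties(object):
    RoleConfig = 0x1
    HostingEnv = 0x2
    SharedConfig = 0x4
    ExtensionsGoalState = 0x8
    Certificates = 0x10
    RemoteAccessInfo = 0x20
    All = RoleConfig | HostingEnv | SharedConfig | ExtensionsGoalState | Certificates | RemoteAccessInfo

# Callers request only what they need, e.g. the log collector asks for
# RoleConfig | HostingEnv and the daemon asks for SharedConfig only.
requested = GoalStateProperties.RoleConfig | GoalStateProperties.HostingEnv

# Each property getter then guards access with a bitwise AND, as in:
if not requested & GoalStateProperties.SharedConfig:
    print("SharedConfig is not in goal state properties")  # GoalState raises ProtocolError here
```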
@@ -72,6 +85,7 @@ def __init__(self, wire_client, silent=False): self._wire_client = wire_client self._history = None self._extensions_goal_state = None # populated from vmSettings or extensionsConfig + self._goal_state_properties = goal_state_properties self.logger = logger.Logger(logger.DEFAULT_LOGGER) self.logger.silent = silent @@ -83,6 +97,7 @@ def __init__(self, wire_client, silent=False): self._hosting_env = None self._shared_conf = None self._certs = EmptyCertificates() + self._certs_uri = None self._remote_access = None self.update(silent=silent) @@ -99,35 +114,59 @@ def incarnation(self): @property def container_id(self): - return self._container_id + if not self._goal_state_properties & GoalStateProperties.RoleConfig: + raise ProtocolError("ContainerId is not in goal state properties") + else: + return self._container_id @property def role_instance_id(self): - return self._role_instance_id + if not self._goal_state_properties & GoalStateProperties.RoleConfig: + raise ProtocolError("RoleInstanceId is not in goal state properties") + else: + return self._role_instance_id @property def role_config_name(self): - return self._role_config_name + if not self._goal_state_properties & GoalStateProperties.RoleConfig: + raise ProtocolError("RoleConfig is not in goal state properties") + else: + return self._role_config_name @property def extensions_goal_state(self): - return self._extensions_goal_state + if not self._goal_state_properties & GoalStateProperties.ExtensionsGoalState: + raise ProtocolError("ExtensionsGoalState is not in goal state properties") + else: + return self._extensions_goal_state @property def certs(self): - return self._certs + if not self._goal_state_properties & GoalStateProperties.Certificates: + raise ProtocolError("Certificates is not in goal state properties") + else: + return self._certs @property def hosting_env(self): - return self._hosting_env + if not self._goal_state_properties & GoalStateProperties.HostingEnv: + raise ProtocolError("HostingEnvironment is not in goal state properties") + else: + return self._hosting_env @property def shared_conf(self): - return self._shared_conf + if not self._goal_state_properties & GoalStateProperties.SharedConfig: + raise ProtocolError("SharedConfig is not in goal state properties") + else: + return self._shared_conf @property def remote_access(self): - return self._remote_access + if not self._goal_state_properties & GoalStateProperties.RemoteAccessInfo: + raise ProtocolError("RemoteAccessInfo is not in goal state properties") + else: + return self._remote_access def fetch_agent_manifest(self, family_name, uris): """ @@ -152,13 +191,6 @@ def _fetch_manifest(self, manifest_type, name, uris): except Exception as e: raise ProtocolError("Failed to retrieve {0} manifest. Error: {1}".format(manifest_type, ustr(e))) - def download_extension(self, uris, destination, on_downloaded=lambda: True): - """ - This is a convenience method that wraps WireClient.download_extension(), but adds the required 'use_verify_header' parameter. 
- """ - is_fast_track = self.extensions_goal_state.source == GoalStateSource.FastTrack - self._wire_client.download_extension(uris, destination, use_verify_header=is_fast_track, on_downloaded=on_downloaded) - @staticmethod def update_host_plugin_headers(wire_client): """ @@ -197,11 +229,12 @@ def _update(self, force_update): add_event(op=WALAEventOperation.GoalState, message=message) vm_settings, vm_settings_updated = None, False - try: - vm_settings, vm_settings_updated = GoalState._fetch_vm_settings(self._wire_client, force_update=force_update) - except VmSettingsSupportStopped as exception: # If the HGAP stopped supporting vmSettings, we need to use the goal state from the WireServer - self._restore_wire_server_goal_state(incarnation, xml_text, xml_doc, exception) - return + if self._goal_state_properties & GoalStateProperties.ExtensionsGoalState: + try: + vm_settings, vm_settings_updated = GoalState._fetch_vm_settings(self._wire_client, force_update=force_update) + except VmSettingsSupportStopped as exception: # If the HGAP stopped supporting vmSettings, we need to use the goal state from the WireServer + self._restore_wire_server_goal_state(incarnation, xml_text, xml_doc, exception) + return if vm_settings_updated: self.logger.info('') @@ -261,6 +294,10 @@ def _update(self, force_update): self._check_certificates() def _check_certificates(self): + # Re-download certificates in case they have been removed from disk since last download + if self._goal_state_properties & GoalStateProperties.Certificates and self._certs_uri is not None: + self._download_certificates(self._certs_uri) + # Check that certificates needed by extensions are in goal state certs.summary for extension in self.extensions_goal_state.extensions: for settings in extension.settings: if settings.protectedSettings is None: @@ -270,6 +307,20 @@ def _check_certificates(self): message = "Certificate {0} needed by {1} is missing from the goal state".format(settings.certificateThumbprint, extension.name) raise GoalStateInconsistentError(message) + def _download_certificates(self, certs_uri): + xml_text = self._wire_client.fetch_config(certs_uri, self._wire_client.get_header_for_cert()) + certs = Certificates(xml_text, self.logger) + # Log and save the certificates summary (i.e. the thumbprint but not the certificate itself) to the goal state history + for c in certs.summary: + message = "Downloaded certificate {0}".format(c) + self.logger.info(message) + add_event(op=WALAEventOperation.GoalState, message=message) + if len(certs.warnings) > 0: + self.logger.warn(certs.warnings) + add_event(op=WALAEventOperation.GoalState, message=certs.warnings) + self._history.save_certificates(json.dumps(certs.summary)) + return certs + def _restore_wire_server_goal_state(self, incarnation, xml_text, xml_doc, vm_settings_support_stopped_error): msg = 'The HGAP stopped supporting vmSettings; will fetched the goal state from the WireServer.' 
self.logger.info(msg) @@ -363,58 +414,57 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc): self.logger.info(message) add_event(op=WALAEventOperation.GoalState, message=message) - role_instance = find(xml_doc, "RoleInstance") - role_instance_id = findtext(role_instance, "InstanceId") - role_config = find(role_instance, "Configuration") - role_config_name = findtext(role_config, "ConfigName") - container = find(xml_doc, "Container") - container_id = findtext(container, "ContainerId") + role_instance_id = None + role_config_name = None + container_id = None + if GoalStateProperties.RoleConfig & self._goal_state_properties: + role_instance = find(xml_doc, "RoleInstance") + role_instance_id = findtext(role_instance, "InstanceId") + role_config = find(role_instance, "Configuration") + role_config_name = findtext(role_config, "ConfigName") + container = find(xml_doc, "Container") + container_id = findtext(container, "ContainerId") extensions_config_uri = findtext(xml_doc, "ExtensionsConfig") - if extensions_config_uri is None: + if not (GoalStateProperties.ExtensionsGoalState & self._goal_state_properties) or extensions_config_uri is None: extensions_config = ExtensionsGoalStateFactory.create_empty(incarnation) else: xml_text = self._wire_client.fetch_config(extensions_config_uri, self._wire_client.get_header()) extensions_config = ExtensionsGoalStateFactory.create_from_extensions_config(incarnation, xml_text, self._wire_client) self._history.save_extensions_config(extensions_config.get_redacted_text()) - hosting_env_uri = findtext(xml_doc, "HostingEnvironmentConfig") - xml_text = self._wire_client.fetch_config(hosting_env_uri, self._wire_client.get_header()) - hosting_env = HostingEnv(xml_text) - self._history.save_hosting_env(xml_text) - - shared_conf_uri = findtext(xml_doc, "SharedConfig") - xml_text = self._wire_client.fetch_config(shared_conf_uri, self._wire_client.get_header()) - shared_config = SharedConfig(xml_text) - self._history.save_shared_conf(xml_text) - # SharedConfig.xml is used by other components (Azsec and Singularity/HPC Infiniband), so save it to the agent's root directory as well - shared_config_file = os.path.join(conf.get_lib_dir(), SHARED_CONF_FILE_NAME) - try: - fileutil.write_file(shared_config_file, xml_text) - except Exception as e: - logger.warn("Failed to save {0}: {1}".format(shared_config, e)) + hosting_env = None + if GoalStateProperties.HostingEnv & self._goal_state_properties: + hosting_env_uri = findtext(xml_doc, "HostingEnvironmentConfig") + xml_text = self._wire_client.fetch_config(hosting_env_uri, self._wire_client.get_header()) + hosting_env = HostingEnv(xml_text) + self._history.save_hosting_env(xml_text) + + shared_config = None + if GoalStateProperties.SharedConfig & self._goal_state_properties: + shared_conf_uri = findtext(xml_doc, "SharedConfig") + xml_text = self._wire_client.fetch_config(shared_conf_uri, self._wire_client.get_header()) + shared_config = SharedConfig(xml_text) + self._history.save_shared_conf(xml_text) + # SharedConfig.xml is used by other components (Azsec and Singularity/HPC Infiniband), so save it to the agent's root directory as well + shared_config_file = os.path.join(conf.get_lib_dir(), SHARED_CONF_FILE_NAME) + try: + fileutil.write_file(shared_config_file, xml_text) + except Exception as e: + logger.warn("Failed to save {0}: {1}".format(shared_config, e)) certs = EmptyCertificates() certs_uri = findtext(xml_doc, "Certificates") - if certs_uri is not None: - xml_text = 
self._wire_client.fetch_config(certs_uri, self._wire_client.get_header_for_cert()) - certs = Certificates(xml_text, self.logger) - # Log and save the certificates summary (i.e. the thumbprint but not the certificate itself) to the goal state history - for c in certs.summary: - message = "Downloaded certificate {0}".format(c) - self.logger.info(message) - add_event(op=WALAEventOperation.GoalState, message=message) - if len(certs.warnings) > 0: - self.logger.warn(certs.warnings) - add_event(op=WALAEventOperation.GoalState, message=certs.warnings) - self._history.save_certificates(json.dumps(certs.summary)) + if (GoalStateProperties.Certificates & self._goal_state_properties) and certs_uri is not None: + certs = self._download_certificates(certs_uri) remote_access = None - remote_access_uri = findtext(container, "RemoteAccessInfo") - if remote_access_uri is not None: - xml_text = self._wire_client.fetch_config(remote_access_uri, self._wire_client.get_header_for_cert()) - remote_access = RemoteAccess(xml_text) - self._history.save_remote_access(xml_text) + if GoalStateProperties.RemoteAccessInfo & self._goal_state_properties: + remote_access_uri = findtext(container, "RemoteAccessInfo") + if remote_access_uri is not None: + xml_text = self._wire_client.fetch_config(remote_access_uri, self._wire_client.get_header_for_cert()) + remote_access = RemoteAccess(xml_text) + self._history.save_remote_access(xml_text) self._incarnation = incarnation self._role_instance_id = role_instance_id @@ -423,6 +473,7 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc): self._hosting_env = hosting_env self._shared_conf = shared_config self._certs = certs + self._certs_uri = certs_uri self._remote_access = remote_access return extensions_config diff --git a/azurelinuxagent/common/protocol/util.py b/azurelinuxagent/common/protocol/util.py index 92b691e92..7d7f90168 100644 --- a/azurelinuxagent/common/protocol/util.py +++ b/azurelinuxagent/common/protocol/util.py @@ -188,7 +188,7 @@ def _clear_wireserver_endpoint(self): return logger.error("Failed to clear wiresever endpoint: {0}", e) - def _detect_protocol(self): + def _detect_protocol(self, init_goal_state=True): """ Probe protocol endpoints in turn. """ @@ -217,7 +217,7 @@ def _detect_protocol(self): try: protocol = WireProtocol(endpoint) - protocol.detect() + protocol.detect(init_goal_state=init_goal_state) self._set_wireserver_endpoint(endpoint) return protocol @@ -268,7 +268,7 @@ def clear_protocol(self): finally: self._lock.release() - def get_protocol(self): + def get_protocol(self, init_goal_state=True): """ Detect protocol by endpoint. 
:returns: protocol instance @@ -296,7 +296,7 @@ def get_protocol(self): logger.info("Detect protocol endpoint") - protocol = self._detect_protocol() + protocol = self._detect_protocol(init_goal_state=init_goal_state) IOErrorCounter.set_protocol_endpoint(endpoint=protocol.get_endpoint()) self._save_protocol(WIRE_PROTOCOL_NAME) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 167d4820a..38a3e0621 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -19,7 +19,9 @@ import json import os import random +import shutil import time +import zipfile from collections import defaultdict from datetime import datetime, timedelta @@ -35,7 +37,8 @@ from azurelinuxagent.common.exception import ProtocolNotFoundError, \ ResourceGoneError, ExtensionDownloadError, InvalidContainerError, ProtocolError, HttpError, ExtensionErrorCodes from azurelinuxagent.common.future import httpclient, bytebuffer, ustr -from azurelinuxagent.common.protocol.goal_state import GoalState, TRANSPORT_CERT_FILE_NAME, TRANSPORT_PRV_FILE_NAME +from azurelinuxagent.common.protocol.goal_state import GoalState, TRANSPORT_CERT_FILE_NAME, TRANSPORT_PRV_FILE_NAME, \ + GoalStateProperties from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol from azurelinuxagent.common.protocol.restapi import DataContract, ProvisionStatus, VMInfo, VMStatus from azurelinuxagent.common.telemetryevent import GuestAgentExtensionEventsSchema @@ -70,7 +73,7 @@ def __init__(self, endpoint): raise ProtocolError("WireProtocol endpoint is None") self.client = WireClient(endpoint) - def detect(self): + def detect(self, init_goal_state=True): self.client.check_wire_protocol_version() trans_prv_file = os.path.join(conf.get_lib_dir(), @@ -81,11 +84,9 @@ def detect(self): cryptutil.gen_transport_cert(trans_prv_file, trans_cert_file) # Initialize the goal state, including all the inner properties - logger.info('Initializing goal state during protocol detection') - self.client.update_goal_state(force_update=True) - - def update_goal_state(self, silent=False): - self.client.update_goal_state(silent=silent) + if init_goal_state: + logger.info('Initializing goal state during protocol detection') + self.client.reset_goal_state() def update_host_plugin_from_goal_state(self): self.client.update_host_plugin_from_goal_state() @@ -604,25 +605,29 @@ def hgap_download(uri): return self._download_with_fallback_channel(download_type, uris, direct_download=direct_download, hgap_download=hgap_download) - def download_extension(self, uris, destination, use_verify_header, on_downloaded=lambda: True): + def download_zip_package(self, package_type, uris, target_file, target_directory, use_verify_header): """ - Walks the given list of 'uris' issuing HTTP GET requests and saves the content of the first successful request to 'destination'. + Downloads the ZIP package specified in 'uris' (which is a list of alternate locations for the ZIP), saving it to 'target_file' and then expanding + its contents to 'target_directory'. Deletes the target file after it has been expanded. - When the download is successful, this method invokes the 'on_downloaded' callback function, which can be used to process the results of the download. - on_downloaded() should return True on success and False on failure (it should not raise any exceptions); ff the return value is False, the download - is considered a failure and the next URI is tried. 
+ The 'package_type' is only used in log messages and has no other semantics. It should specify the contents of the ZIP, e.g. "extension package" + or "agent package" + + The 'use_verify_header' parameter indicates whether the verify header should be added when using the extensionArtifact API of the HostGAPlugin. """ host_ga_plugin = self.get_host_plugin() - direct_download = lambda uri: self.stream(uri, destination, headers=None, use_proxy=True) + direct_download = lambda uri: self.stream(uri, target_file, headers=None, use_proxy=True) def hgap_download(uri): request_uri, request_headers = host_ga_plugin.get_artifact_request(uri, use_verify_header=use_verify_header, artifact_manifest_url=host_ga_plugin.manifest_uri) - return self.stream(request_uri, destination, headers=request_headers, use_proxy=False) + return self.stream(request_uri, target_file, headers=request_headers, use_proxy=False) + + on_downloaded = lambda: WireClient._try_expand_zip_package(package_type, target_file, target_directory) - self._download_with_fallback_channel("extension package", uris, direct_download=direct_download, hgap_download=hgap_download, on_downloaded=on_downloaded) + self._download_with_fallback_channel(package_type, uris, direct_download=direct_download, hgap_download=hgap_download, on_downloaded=on_downloaded) - def _download_with_fallback_channel(self, download_type, uris, direct_download, hgap_download, on_downloaded=lambda: True): + def _download_with_fallback_channel(self, download_type, uris, direct_download, hgap_download, on_downloaded=None): """ Walks the given list of 'uris' issuing HTTP GET requests, attempting to download the content of each URI. The download is done using both the default and the fallback channels, until one of them succeeds. The 'direct_download' and 'hgap_download' functions define the logic to do direct calls to the URI or @@ -630,9 +635,9 @@ def _download_with_fallback_channel(self, download_type, uris, direct_download, but the default can be depending on the success/failure of each channel (see _download_using_appropriate_channel() for the logic to do this). The 'download_type' is added to any log messages produced by this method; it should describe the type of content of the given URIs - (e.g. "manifest", "extension package", etc). + (e.g. "manifest", "extension package", "agent package", etc). - When the download is successful download_extension() invokes the 'on_downloaded' function, which can be used to process the results of the download. This + When the download is successful, _download_with_fallback_channel invokes the 'on_downloaded' function, which can be used to process the results of the download. This function should return True on success, and False on failure (it should not raise any exceptions). If the return value is False, the download is considered a failure and the next URI is tried. @@ -641,7 +646,7 @@ def _download_with_fallback_channel(self, download_type, uris, direct_download, This method enforces a timeout (_DOWNLOAD_TIMEOUT) on the download and raises an exception if the limit is exceeded.
""" - logger.verbose("Downloading {0}", download_type) + logger.info("Downloading {0}", download_type) start_time = datetime.now() uris_shuffled = uris @@ -658,14 +663,34 @@ def _download_with_fallback_channel(self, download_type, uris, direct_download, # Disable W0640: OK to use uri in a lambda within the loop's body response = self._download_using_appropriate_channel(lambda: direct_download(uri), lambda: hgap_download(uri)) # pylint: disable=W0640 - if on_downloaded(): - return uri, response + if on_downloaded is not None: + on_downloaded() + return uri, response except Exception as exception: most_recent_error = exception raise ExtensionDownloadError("Failed to download {0} from all URIs. Last error: {1}".format(download_type, ustr(most_recent_error)), code=ExtensionErrorCodes.PluginManifestDownloadError) + @staticmethod + def _try_expand_zip_package(package_type, target_file, target_directory): + logger.info("Unzipping {0}: {1}", package_type, target_file) + try: + zipfile.ZipFile(target_file).extractall(target_directory) + except Exception as exception: + logger.error("Error while unzipping {0}: {1}", package_type, ustr(exception)) + if os.path.exists(target_directory): + try: + shutil.rmtree(target_directory) + except Exception as exception: + logger.warn("Cannot delete {0}: {1}", target_directory, ustr(exception)) + raise + finally: + try: + os.remove(target_file) + except Exception as exception: + logger.warn("Cannot delete {0}: {1}", target_file, ustr(exception)) + def stream(self, uri, destination, headers=None, use_proxy=None): """ Downloads the content of the given 'uri' and saves it to the 'destination' file. @@ -752,15 +777,12 @@ def update_host_plugin(self, container_id, role_config_name): self._host_plugin.update_container_id(container_id) self._host_plugin.update_role_config_name(role_config_name) - def update_goal_state(self, force_update=False, silent=False): + def update_goal_state(self, silent=False): """ - Updates the goal state if the incarnation or etag changed or if 'force_update' is True + Updates the goal state if the incarnation or etag changed """ try: - if force_update and not silent: - logger.info("Forcing an update of the goal state.") - - if self._goal_state is None or force_update: + if self._goal_state is None: self._goal_state = GoalState(self, silent=silent) else: self._goal_state.update(silent=silent) @@ -770,6 +792,21 @@ def update_goal_state(self, force_update=False, silent=False): except Exception as exception: raise ProtocolError("Error fetching goal state: {0}".format(ustr(exception))) + def reset_goal_state(self, goal_state_properties=GoalStateProperties.All, silent=False): + """ + Resets the goal state + """ + try: + if not silent: + logger.info("Forcing an update of the goal state.") + + self._goal_state = GoalState(self, goal_state_properties=goal_state_properties, silent=silent) + + except ProtocolError: + raise + except Exception as exception: + raise ProtocolError("Error fetching goal state: {0}".format(ustr(exception))) + def get_goal_state(self): if self._goal_state is None: raise ProtocolError("Trying to fetch goal state before initialization!") @@ -899,7 +936,7 @@ def upload_status_blob(self): if extensions_goal_state.status_upload_blob is None: # the status upload blob is in ExtensionsConfig so force a full goal state refresh - self.update_goal_state(force_update=True, silent=True) + self.reset_goal_state(silent=True) extensions_goal_state = self.get_goal_state().extensions_goal_state if extensions_goal_state.status_upload_blob is 
None: diff --git a/azurelinuxagent/common/version.py b/azurelinuxagent/common/version.py index 20e11cb3a..8e12eff5f 100644 --- a/azurelinuxagent/common/version.py +++ b/azurelinuxagent/common/version.py @@ -209,7 +209,7 @@ def has_logrotate(): # # When doing a release, be sure to use the actual agent version. Current agent version: 2.4.0.0 # -AGENT_VERSION = '2.9.0.4' +AGENT_VERSION = '2.9.1.1' AGENT_LONG_VERSION = "{0}-{1}".format(AGENT_NAME, AGENT_VERSION) AGENT_DESCRIPTION = """ The Azure Linux Agent supports the provisioning and running of Linux diff --git a/azurelinuxagent/daemon/main.py b/azurelinuxagent/daemon/main.py index c608768a6..1eb58ec99 100644 --- a/azurelinuxagent/daemon/main.py +++ b/azurelinuxagent/daemon/main.py @@ -28,6 +28,7 @@ from azurelinuxagent.common.event import add_event, WALAEventOperation, initialize_event_logger_vminfo_common_parameters from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import get_osutil +from azurelinuxagent.common.protocol.goal_state import GoalState, GoalStateProperties from azurelinuxagent.common.protocol.util import get_protocol_util from azurelinuxagent.common.rdma import setup_rdma_device from azurelinuxagent.common.utils import textutil @@ -160,9 +161,9 @@ def daemon(self, child_args=None): # current values. protocol = self.protocol_util.get_protocol() - protocol.client.update_goal_state(force_update=True) + goal_state = GoalState(protocol.client, goal_state_properties=GoalStateProperties.SharedConfig) - setup_rdma_device(nd_version, protocol.client.get_shared_conf()) + setup_rdma_device(nd_version, goal_state.shared_conf) except Exception as e: logger.error("Error setting up rdma device: %s" % e) else: diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index 792ae0de6..01049ee87 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -58,6 +58,8 @@ class ExtensionEventSchema(object): "EventTid":"2", "OperationId":"Guid (str)" } + + From the next version (2.10+) we accept integer values for the EventPid and EventTid fields, but we still support the string type for backward compatibility """ Version = "Version" Timestamp = "Timestamp" @@ -78,7 +80,7 @@ class _ProcessExtensionEvents(PeriodicOperation): _EXTENSION_EVENT_FILE_NAME_REGEX = re.compile(r"^(\d+)\.json$", re.IGNORECASE) # Limits - _MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD = 300 + _MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD = 360 _EXTENSION_EVENT_FILE_MAX_SIZE = 4 * 1024 * 1024 # 4 MB = 4 * 1,048,576 Bytes _EXTENSION_EVENT_MAX_SIZE = 1024 * 6 # 6Kb or 6144 characters. Limit for the whole event. Prevent oversized events. _EXTENSION_EVENT_MAX_MSG_LEN = 1024 * 3 # 3Kb or 3072 chars. @@ -323,15 +325,20 @@ def _parse_event_and_ensure_it_is_valid(self, extension_event): :param extension_event: The json event from file :return: Verified Json event that qualifies the contract. """ - - clean_string = lambda x: x.strip() if x is not None else x + def _clean_value(k, v): + if v is not None: + if isinstance(v, int): + if k.lower() in [ExtensionEventSchema.EventPid.lower(), ExtensionEventSchema.EventTid.lower()]: + return str(v) + return v.strip() + return v event_size = 0 key_err_msg = "{0}: {1} not found" # Convert the dict to all lower keys to avoid schema confusion. # Only pick the params that we care about and skip the rest.
- event = dict((k.lower(), clean_string(v)) for k, v in extension_event.items() if + event = dict((k.lower(), _clean_value(k, v)) for k, v in extension_event.items() if k.lower() in self._EXTENSION_EVENT_REQUIRED_FIELDS) # Trim message and only pick the first 3k chars diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index c01fc15bc..0aa4ed93d 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -46,6 +46,7 @@ ExtensionOperationError, ExtensionUpdateError, ProtocolError, ProtocolNotFoundError, ExtensionsGoalStateError, \ GoalStateAggregateStatusCodes, MultiConfigExtensionEnableError from azurelinuxagent.common.future import ustr, is_file_not_found_error +from azurelinuxagent.common.protocol.extensions_goal_state import GoalStateSource from azurelinuxagent.common.protocol.restapi import ExtensionStatus, ExtensionSubStatus, Extension, ExtHandlerStatus, \ VMStatus, GoalStateAggregateStatus, ExtensionState, ExtensionRequestedState, ExtensionSettings from azurelinuxagent.common.utils import textutil @@ -1000,6 +1001,8 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed): heartbeat = ext_handler_i.collect_heartbeat() if heartbeat is not None: handler_status.status = heartbeat.get('status') + if 'formattedMessage' in heartbeat: + handler_status.message = parse_formatted_message(heartbeat.get('formattedMessage')) except ExtensionError as e: ext_handler_i.set_handler_status(message=ustr(e), code=e.code) @@ -1252,21 +1255,23 @@ def download(self): if self.pkg is None or self.pkg.uris is None or len(self.pkg.uris) == 0: raise ExtensionDownloadError("No package uri found") - destination = os.path.join(conf.get_lib_dir(), self.get_extension_package_zipfile_name()) + package_file = os.path.join(conf.get_lib_dir(), self.get_extension_package_zipfile_name()) package_exists = False - if os.path.exists(destination): - self.logger.info("Using existing extension package: {0}", destination) - if self._unzip_extension_package(destination, self.get_base_dir()): + if os.path.exists(package_file): + self.logger.info("Using existing extension package: {0}", package_file) + if self._unzip_extension_package(package_file, self.get_base_dir()): package_exists = True else: self.logger.info("The existing extension package is invalid, will ignore it.") if not package_exists: - self.protocol.get_goal_state().download_extension(self.pkg.uris, destination, on_downloaded=lambda: self._unzip_extension_package(destination, self.get_base_dir())) + is_fast_track_goal_state = self.protocol.get_goal_state().extensions_goal_state.source == GoalStateSource.FastTrack + self.protocol.client.download_zip_package("extension package", self.pkg.uris, package_file, self.get_base_dir(), use_verify_header=is_fast_track_goal_state) self.report_event(message="Download succeeded", duration=elapsed_milliseconds(begin_utc)) - self.pkg_file = destination + self.pkg_file = package_file + def ensure_consistent_data_for_mc(self): # If CRP expects Handler to support MC, ensure the HandlerManifest also reflects that. 
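The `download()` rework above and the `_download()` rework in update.py below both route package downloads through the new `WireClient.download_zip_package`, which streams the ZIP to `target_file` and then expands it via `_try_expand_zip_package`; both callers derive `use_verify_header` from whether the goal state arrived through FastTrack. A condensed sketch of the expand step's cleanup contract (simplified from the wire.py hunk above; logging omitted):

```python
# Condensed sketch of the download-then-expand contract introduced by
# WireClient.download_zip_package (simplified from the diff above).
import os
import shutil
import zipfile

def try_expand_zip_package(package_type, target_file, target_directory):
    try:
        zipfile.ZipFile(target_file).extractall(target_directory)
    except Exception:
        # a partial expansion is useless, so remove it before re-raising;
        # the caller then falls back to the next URI / download channel
        if os.path.exists(target_directory):
            shutil.rmtree(target_directory, ignore_errors=True)
        raise
    finally:
        # the ZIP itself is always deleted once expansion has been attempted
        if os.path.exists(target_file):
            os.remove(target_file)
```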
diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 0fac66f25..2b0975b05 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -19,7 +19,7 @@ import glob import json import os -import random +import platform import re import shutil import signal @@ -28,18 +28,17 @@ import sys import time import uuid -import zipfile from datetime import datetime, timedelta from azurelinuxagent.common import conf from azurelinuxagent.common import logger from azurelinuxagent.common.protocol.imds import get_imds_client -from azurelinuxagent.common.utils import fileutil, restutil, textutil +from azurelinuxagent.common.utils import fileutil, textutil from azurelinuxagent.common.agent_supported_feature import get_supported_feature_by_name, SupportedFeatureNames from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.common.event import add_event, initialize_event_logger_vminfo_common_parameters, \ WALAEventOperation, EVENTS_DIRECTORY -from azurelinuxagent.common.exception import ResourceGoneError, UpdateError, ExitException, AgentUpgradeExitException +from azurelinuxagent.common.exception import UpdateError, ExitException, AgentUpgradeExitException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import get_osutil, systemd from azurelinuxagent.common.persist_firewall_rules import PersistFirewallRulesHandler @@ -137,6 +136,7 @@ def get_update_handler(): class UpdateHandler(object): TELEMETRY_HEARTBEAT_PERIOD = timedelta(minutes=30) + CHECK_MEMORY_USAGE_PERIOD = timedelta(seconds=conf.get_cgroup_check_period()) def __init__(self): self.osutil = get_osutil() @@ -162,6 +162,9 @@ def __init__(self): self._heartbeat_id = str(uuid.uuid4()).upper() self._heartbeat_counter = 0 + self._last_check_memory_usage = datetime.min + self._check_memory_usage_last_error_report = datetime.min + # VM Size is reported via the heartbeat, default it here. 
self._vm_size = None @@ -401,6 +404,7 @@ def run(self, debug=False): self._check_threads_running(all_thread_handlers) self._process_goal_state(exthandlers_handler, remote_access_handler) self._send_heartbeat_telemetry(protocol) + self._check_agent_memory_usage() time.sleep(self._goal_state_period) except AgentUpgradeExitException as exitException: @@ -459,6 +463,9 @@ def _get_vm_size(self, protocol): return self._vm_size + def _get_vm_arch(self): + return platform.machine() + def _check_daemon_running(self, debug): # Check that the parent process (the agent's daemon) is still running if not debug and self._is_orphaned: @@ -482,7 +489,7 @@ def _try_update_goal_state(self, protocol): try: max_errors_to_log = 3 - protocol.update_goal_state(silent=self._update_goal_state_error_count >= max_errors_to_log) + protocol.client.update_goal_state(silent=self._update_goal_state_error_count >= max_errors_to_log) self._goal_state = protocol.get_goal_state() @@ -944,9 +951,6 @@ def _find_agents(self): logger.warn(u"Exception occurred loading available agents: {0}", ustr(e)) return - def _get_host_plugin(self, protocol): - return protocol.client.get_host_plugin() if protocol and protocol.client else None - def _get_pid_parts(self): pid_file = conf.get_agent_pid_file_path() pid_dir = os.path.dirname(pid_file) @@ -985,7 +989,7 @@ def _is_orphaned(self): def _load_agents(self): path = os.path.join(conf.get_lib_dir(), "{0}-*".format(AGENT_NAME)) - return [GuestAgent(path=agent_dir) + return [GuestAgent.from_installed_agent(agent_dir) for agent_dir in glob.iglob(path) if os.path.isdir(agent_dir)] def _partition(self): @@ -1200,8 +1204,8 @@ def agent_upgrade_time_elapsed(now_): # Set the agents to those available for download at least as current as the existing agent # or to the requested version (if specified) - host = self._get_host_plugin(protocol=protocol) - agents_to_download = [GuestAgent(is_fast_track_goal_state=self._goal_state.extensions_goal_state.source == GoalStateSource.FastTrack, pkg=pkg, host=host) for pkg in packages_to_download] + is_fast_track_goal_state = self._goal_state.extensions_goal_state.source == GoalStateSource.FastTrack + agents_to_download = [GuestAgent.from_agent_package(pkg, protocol, is_fast_track_goal_state) for pkg in packages_to_download] # Filter out the agents that were downloaded/extracted successfully. If the agent was not installed properly, # we delete the directory and the zip package from the filesystem @@ -1265,13 +1269,13 @@ def _send_heartbeat_telemetry(self, protocol): if datetime.utcnow() >= (self._last_telemetry_heartbeat + UpdateHandler.TELEMETRY_HEARTBEAT_PERIOD): dropped_packets = self.osutil.get_firewall_dropped_packets(protocol.get_endpoint()) auto_update_enabled = 1 if conf.get_autoupdate_enabled() else 0 - # Include VMSize in the heartbeat message because the kusto table does not have - # a separate column for it (or architecture). - vmsize = self._get_vm_size(protocol) + # Include vm architecture in the heartbeat message because the kusto table does not have + # a separate column for it. 
+                vmarch = self._get_vm_arch()
                 telemetry_msg = "{0};{1};{2};{3};{4};{5}".format(self._heartbeat_counter, self._heartbeat_id, dropped_packets,
                                                                  self._heartbeat_update_goal_state_error_count,
-                                                                 auto_update_enabled, vmsize)
+                                                                 auto_update_enabled, vmarch)
                 debug_log_msg = "[DEBUG HeartbeatCounter: {0};HeartbeatId: {1};DroppedPackets: {2};" \
                                 "UpdateGSErrors: {3};AutoUpdate: {4}]".format(self._heartbeat_counter,
                                                                               self._heartbeat_id, dropped_packets,
@@ -1288,6 +1292,27 @@ def _send_heartbeat_telemetry(self, protocol):
             self._heartbeat_update_goal_state_error_count = 0
             self._last_telemetry_heartbeat = datetime.utcnow()

+    def _check_agent_memory_usage(self):
+        """
+        Checks the agent's current memory usage and safely exits the process if the agent has reached its memory limit
+        """
+        try:
+            if conf.get_enable_agent_memory_usage_check() and self._extensions_summary.converged:
+                if self._last_check_memory_usage == datetime.min or datetime.utcnow() >= (self._last_check_memory_usage + UpdateHandler.CHECK_MEMORY_USAGE_PERIOD):
+                    self._last_check_memory_usage = datetime.utcnow()
+                    CGroupConfigurator.get_instance().check_agent_memory_usage()
+        except AgentMemoryExceededException as exception:
+            msg = "Check on agent memory usage:\n{0}".format(ustr(exception))
+            logger.info(msg)
+            add_event(AGENT_NAME, op=WALAEventOperation.AgentMemory, is_success=True, message=msg)
+            raise ExitException("Agent {0} has reached its memory limit -- exiting".format(CURRENT_AGENT))
+        except Exception as exception:
+            if self._check_memory_usage_last_error_report == datetime.min or datetime.now() >= (self._check_memory_usage_last_error_report + timedelta(hours=6)):
+                self._check_memory_usage_last_error_report = datetime.now()
+                msg = "Error checking the agent's memory usage: {0} --- [NOTE: Will not log the same error for the next 6 hours]".format(ustr(exception))
+                logger.warn(msg)
+                add_event(AGENT_NAME, op=WALAEventOperation.AgentMemory, is_success=False, message=msg)
+
     @staticmethod
     def _ensure_extension_telemetry_state_configured_properly(protocol):
         etp_enabled = get_supported_feature_by_name(SupportedFeatureNames.ExtensionTelemetryPipeline).is_supported
@@ -1467,18 +1492,20 @@ def _reset_legacy_blacklisted_agents(self):

 class GuestAgent(object):
-    def __init__(self, path=None, pkg=None, is_fast_track_goal_state=False, host=None):
+    def __init__(self, path, pkg, protocol, is_fast_track_goal_state):
         """
         If 'path' is given, the object is initialized to the version installed under that path.

         If 'pkg' is given, the version specified in the package information is downloaded and the object is
         initialized to that version.

-        'is_fast_track_goal_state' and 'host' are using only when a package is downloaded.
+        'is_fast_track_goal_state' and 'protocol' are used only when a package is downloaded.
+
+        NOTE: Prefer using the from_installed_agent and from_agent_package methods instead of calling __init__ directly
         """
         self._is_fast_track_goal_state = is_fast_track_goal_state
         self.pkg = pkg
-        self.host = host
+        self._protocol = protocol
         version = None
         if path is not None:
             m = AGENT_DIR_PATTERN.match(path)
@@ -1502,26 +1529,12 @@ def __init__(self, path=None, pkg=None, is_fast_track_goal_state=False, host=Non
             self._ensure_downloaded()
             self._ensure_loaded()
         except Exception as e:
-            if isinstance(e, ResourceGoneError):
-                raise
-
-            # The agent was improperly blacklisting versions due to a timeout
-            # encountered while downloading a later version.
-            # Errors of type socket.error are IOError, so this should provide
-            # sufficient protection against a large class of I/O operation failures.
-            if isinstance(e, IOError):
-                raise
-
-            # If we're unable to download/unpack the agent, delete the Agent directory and the zip file (if exists) to
-            # ensure we try downloading again in the next round.
+            # If we're unable to download/unpack the agent, delete the Agent directory
             try:
                 if os.path.isdir(self.get_agent_dir()):
                     shutil.rmtree(self.get_agent_dir(), ignore_errors=True)
-                if os.path.isfile(self.get_agent_pkg_path()):
-                    os.remove(self.get_agent_pkg_path())
             except Exception as err:
                 logger.warn("Unable to delete Agent files: {0}".format(err))
-
             msg = u"Agent {0} install failed with exception:".format(
                 self.name)
             detailed_msg = '{0} {1}'.format(msg, textutil.format_exception(e))
@@ -1532,6 +1545,20 @@ def __init__(self, path=None, pkg=None, is_fast_track_goal_state=False, host=Non
                 is_success=False,
                 message=detailed_msg)

+    @staticmethod
+    def from_installed_agent(path):
+        """
+        Creates an instance of GuestAgent using the agent installed in the given 'path'.
+        """
+        return GuestAgent(path, None, None, False)
+
+    @staticmethod
+    def from_agent_package(package, protocol, is_fast_track_goal_state):
+        """
+        Creates an instance of GuestAgent using the information provided in the 'package'; if that version of the agent is not installed, it installs it.
+        """
+        return GuestAgent(None, package, protocol, is_fast_track_goal_state)
+
     @property
     def name(self):
         return "{0}-{1}".format(AGENT_NAME, self.version)
@@ -1594,7 +1621,6 @@ def _ensure_downloaded(self):
                 self.name))

             self._download()
-            self._unpack()

             msg = u"Agent {0} downloaded successfully".format(self.name)
             logger.verbose(msg)
@@ -1610,39 +1636,10 @@ def _ensure_loaded(self):
         self._load_error()

     def _download(self):
-        uris_shuffled = self.pkg.uris
-        random.shuffle(uris_shuffled)
-        for uri in uris_shuffled:
-            if not HostPluginProtocol.is_default_channel and self._fetch(uri):
-                break
-
-            elif self.host is not None and self.host.ensure_initialized():
-                if not HostPluginProtocol.is_default_channel:
-                    logger.warn("Download failed, switching to host plugin")
-                else:
-                    logger.verbose("Using host plugin as default channel")
-
-                uri, headers = self.host.get_artifact_request(uri, use_verify_header=self._is_fast_track_goal_state, artifact_manifest_url=self.host.manifest_uri)
-                try:
-                    if self._fetch(uri, headers=headers, use_proxy=False, retry_codes=restutil.HGAP_GET_EXTENSION_ARTIFACT_RETRY_CODES):
-                        if not HostPluginProtocol.is_default_channel:
-                            logger.verbose("Setting host plugin as default channel")
-                            HostPluginProtocol.is_default_channel = True
-                        break
-                    else:
-                        logger.warn("Host plugin download failed")
-
-                # If the HostPlugin rejects the request,
-                # let the error continue, but set to use the HostPlugin
-                except ResourceGoneError:
-                    HostPluginProtocol.is_default_channel = True
-                    raise
-
-            else:
-                logger.error("No download channels available")
-
-        if not os.path.isfile(self.get_agent_pkg_path()):
-            msg = u"Unable to download Agent {0} from any URI".format(self.name)
+        try:
+            self._protocol.client.download_zip_package("agent package", self.pkg.uris, self.get_agent_pkg_path(), self.get_agent_dir(), use_verify_header=self._is_fast_track_goal_state)
+        except Exception as exception:
+            msg = "Unable to download Agent {0}: {1}".format(self.name, ustr(exception))
             add_event(
                 AGENT_NAME,
                 op=WALAEventOperation.Download,
@@ -1651,37 +1648,6 @@ def _download(self):
                 message=msg)
             raise UpdateError(msg)

-    def
_fetch(self, uri, headers=None, use_proxy=True, retry_codes=None): - package = None - try: - is_healthy = True - error_response = '' - resp = restutil.http_get(uri, use_proxy=use_proxy, headers=headers, max_retry=3, retry_codes=retry_codes) # Use only 3 retries, since there are usually 5 or 6 URIs and we try all of them - if restutil.request_succeeded(resp): - package = resp.read() - fileutil.write_file(self.get_agent_pkg_path(), - bytearray(package), - asbin=True) - logger.verbose(u"Agent {0} downloaded from {1}", self.name, uri) - else: - error_response = restutil.read_response_error(resp) - logger.verbose("Fetch was unsuccessful [{0}]", error_response) - is_healthy = not restutil.request_failed_at_hostplugin(resp) - - if self.host is not None: - self.host.report_fetch_health(uri, is_healthy, source='GuestAgent', response=error_response) - - except restutil.HttpError as http_error: - if isinstance(http_error, ResourceGoneError): - raise - - logger.verbose(u"Agent {0} download from {1} failed [{2}]", - self.name, - uri, - http_error) - - return package is not None - def _load_error(self): try: self.error = GuestAgentError(self.get_agent_error_file()) @@ -1731,35 +1697,6 @@ def _load_manifest(self): ustr(self.manifest.data)) return - def _unpack(self): - try: - if os.path.isdir(self.get_agent_dir()): - shutil.rmtree(self.get_agent_dir()) - - zipfile.ZipFile(self.get_agent_pkg_path()).extractall(self.get_agent_dir()) - - except Exception as e: - fileutil.clean_ioerror(e, - paths=[self.get_agent_dir(), self.get_agent_pkg_path()]) - - msg = u"Exception unpacking Agent {0} from {1}: {2}".format( - self.name, - self.get_agent_pkg_path(), - ustr(e)) - raise UpdateError(msg) - - if not os.path.isdir(self.get_agent_dir()): - msg = u"Unpacking Agent {0} failed to create directory {1}".format( - self.name, - self.get_agent_dir()) - raise UpdateError(msg) - - logger.verbose( - u"Agent {0} unpacked successfully to {1}", - self.name, - self.get_agent_dir()) - return - class GuestAgentError(object): def __init__(self, path): diff --git a/azurelinuxagent/pa/deprovision/default.py b/azurelinuxagent/pa/deprovision/default.py index 105b61825..89492b75e 100644 --- a/azurelinuxagent/pa/deprovision/default.py +++ b/azurelinuxagent/pa/deprovision/default.py @@ -26,8 +26,10 @@ import azurelinuxagent.common.conf as conf import azurelinuxagent.common.utils.fileutil as fileutil from azurelinuxagent.common import version +from azurelinuxagent.common.cgroupconfigurator import _AGENT_DROP_IN_FILE_SLICE, _DROP_IN_FILE_CPU_ACCOUNTING, \ + _DROP_IN_FILE_CPU_QUOTA, _DROP_IN_FILE_MEMORY_ACCOUNTING, LOGCOLLECTOR_SLICE from azurelinuxagent.common.exception import ProtocolError -from azurelinuxagent.common.osutil import get_osutil +from azurelinuxagent.common.osutil import get_osutil, systemd from azurelinuxagent.common.persist_firewall_rules import PersistFirewallRulesHandler from azurelinuxagent.common.protocol.util import get_protocol_util from azurelinuxagent.ga.exthandlers import HANDLER_COMPLETE_NAME_PATTERN @@ -199,6 +201,7 @@ def setup(self, deluser): self.del_user(warnings, actions) self.del_persist_firewall_rules(actions) + self.remove_agent_cgroup_config(actions) return warnings, actions @@ -210,6 +213,7 @@ def setup_changed_unique_id(self): self.del_lib_dir_files(warnings, actions) self.del_ext_handler_files(warnings, actions) self.del_persist_firewall_rules(actions) + self.remove_agent_cgroup_config(actions) return warnings, actions @@ -266,3 +270,20 @@ def del_persist_firewall_rules(actions): 
actions.append(DeprovisionAction(fileutil.rm_files, [agent_network_service_path, os.path.join(conf.get_lib_dir(), PersistFirewallRulesHandler.BINARY_FILE_NAME)])) + + @staticmethod + def remove_agent_cgroup_config(actions): + # Get all service drop in file paths + agent_drop_in_path = systemd.get_agent_drop_in_path() + slice_path = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE) + cpu_accounting_path = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING) + cpu_quota_path = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_QUOTA) + mem_accounting_path = os.path.join(agent_drop_in_path, _DROP_IN_FILE_MEMORY_ACCOUNTING) + + # Get log collector slice + unit_file_install_path = systemd.get_unit_file_install_path() + log_collector_slice_path = os.path.join(unit_file_install_path, LOGCOLLECTOR_SLICE) + + actions.append(DeprovisionAction(fileutil.rm_files, + [slice_path, cpu_accounting_path, cpu_quota_path, mem_accounting_path, + log_collector_slice_path])) diff --git a/ci/nosetests_only.sh b/ci/nosetests_only.sh deleted file mode 100755 index 8f87ea248..000000000 --- a/ci/nosetests_only.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash - -set -u - -EXIT_CODE=0 - -echo -echo "=========================================" -echo "nosetests -a '!requires_sudo' output" -echo "=========================================" -nosetests -a '!requires_sudo' tests || EXIT_CODE=$(($EXIT_CODE || $?)) - -echo "=========================================" -echo "nosetests -a 'requires_sudo' output" -echo "=========================================" -sudo env "PATH=$PATH" nosetests -a 'requires_sudo' tests || EXIT_CODE=$(($EXIT_CODE || $?)) - -exit "$EXIT_CODE" diff --git a/ci/pylint_and_nosetests.sh b/ci/pylint_and_nosetests.sh deleted file mode 100755 index e3e6b9355..000000000 --- a/ci/pylint_and_nosetests.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -set -u - -pylint $PYLINTOPTS --jobs=0 $PYLINTFILES &> pylint.output & PYLINT_PID=$! -nosetests -a '!requires_sudo' tests &> nosetests_no_sudo.output & NOSETESTS_PID=$! -sudo env "PATH=$PATH" nosetests -a 'requires_sudo' tests &> nosetests_sudo.output & NOSETESTS_SUDO_PID=$! - -EXIT_CODE=0 -wait $PYLINT_PID || EXIT_CODE=$(($EXIT_CODE || $?)) -wait $NOSETESTS_PID || EXIT_CODE=$(($EXIT_CODE || $?)) -wait $NOSETESTS_SUDO_PID || EXIT_CODE=$(($EXIT_CODE || $?)) - -echo "=========================================" -echo "pylint output:" -echo "=========================================" - -cat pylint.output - -echo -echo "=========================================" -echo "nosetests -a '!requires_sudo' output:" -echo "=========================================" -cat nosetests_no_sudo.output - -echo -echo "=========================================" -echo "nosetests -a 'requires_sudo' output:" -echo "=========================================" -cat nosetests_sudo.output - -exit "$EXIT_CODE" \ No newline at end of file diff --git a/dcr/README.md b/dcr/README.md deleted file mode 100644 index 7f8b4da7e..000000000 --- a/dcr/README.md +++ /dev/null @@ -1,129 +0,0 @@ -# DCR v2 - Azure Pipelines - -## Introduction - -This is the testing pipeline for the Linux Guest Agent. It uses [Azure Pipelines](https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started/what-is-azure-pipelines?view=azure-devops) for its orchestration. Here's the link -of the pipeline - https://dev.azure.com/cplatruntime/WALinuxAgent/_build?definitionId=1 -
// To-Do: Update link with the final pipeline
-
-## Architecture
-
-A rough outline of the workflow is defined below.
-- The entry point into the pipeline is `dcr/azure-pipelines.yml`
-- When a run is initiated, a DevOps agent on a fresh VM is assigned to the run from the Azure Pipelines hosted pool (image: `ubuntu-latest`). This is the orchestrator VM.
-
-### Orchestrator Setup
-- We set up the orchestrator VM for the test runs
-  - Set up SSH keys on the orchestrator. We use the same keys to deploy the test VMs later so that the orchestrator always has access to the test VMs.
-  - Pin to Python 3.7 and install pip dependencies
-  - Download pypy3.7 for the test VM
-  - Overwrite the default settings of the run if a `config.json` file exists in the executing scenario.
-  - Download all secrets from Key-Vault that are needed to deploy the ARM template and use `az-cli` to initiate extension/VM related APIs (these secrets are scoped to the orchestrator only and are not passed to the test VM, to avoid any credential leaks)
-
-### Test-VM/VMSS Setup
-- After setup, we deploy the Test-VM/VMSS as per the requested distro.
-- Once the VM is deployed, we set up the test VM and prepare it for the tests
-
-  - Copy over all files (agent + DCR related) to the test VM.
-    - The agent that is copied over is placed in the `/var/lib/waagent` directory and the agent is restarted to force it to pick up the testing agent.
-    > We don't install the agent from source.
-    - (We copy the whole `dcr` directory over to the test VM. To make the abstractions easier, a new directory `dcr/scenario/` is created on the test VM and the contents of the executing scenario are copied over there. This negates the need to maintain a name-specific path in the tests.)
-  - In the case of a VMSS, we set up each instance separately.
-    - Using `asyncio`, we execute the setup commands simultaneously on each instance of the VMSS using SSH and wait for them to complete (see the sketch after this section).
-    - If even a single instance fails to set up properly, we fail the whole step.
-    - The recommended way to set up a Scale Set is to use either a custom image or the CSE extension. Since we didn't want to rely on either of those methods, we chose this approach of setting up the instances separately.
-  - Install test dependencies on the Test VM (pip requirements, etc.)
-  - Run the scenario-specific `setup.sh` (if applicable)
-  - Run distro-specific setup scripts (if applicable)
-
-### Execute tests
-
-- Finally, after the setup is complete, we execute the tests (`run.host.py` or `run.py`)
-  - If both files are present in the scenario directory, we execute `run.host.py` first and then `run.py`, in that order.
-  - If none of these files are present, then **no scripts would be executed for that scenario**.
-  > Note: run.py is executed on the VM using pypy3.7 and not the system python interpreter.
-
-### Fetch and Publish test results and artifacts
-
-- Once the tests are executed, we capture the `harvest` logs from the test VMs
-- We collect the results from either the `os.environ['BUILD_ARTIFACTSTAGINGDIRECTORY']` directory in the case of the orchestrator VM, or the `/home/{admin_username}` (or `~`) directory in the case of the test VM.
-- After collecting both sets of data, we publish them separately per run so they are visible in the UI.
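The VMSS fan-out described above follows the standard `asyncio` subprocess pattern; the full implementation lives in `dcr/scenario_utils/common_utils.py` (removed later in this diff). A condensed, illustrative sketch (the usernames, IPs, and helper names below are placeholders, not values taken from the pipeline):

```python
import asyncio

async def run_over_ssh(username: str, ip: str, command: str) -> int:
    # Run one setup command on a single test VM / VMSS instance over SSH.
    cmd = f"ssh -o StrictHostKeyChecking=no {username}@{ip} {command}"
    proc = await asyncio.create_subprocess_shell(
        cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
    await proc.communicate()
    return proc.returncode

async def set_up_all_instances(username: str, ips, command: str):
    # Set up every instance concurrently; fail the whole step if any one fails,
    # mirroring the behavior described above.
    return_codes = await asyncio.gather(*(run_over_ssh(username, ip, command) for ip in ips))
    if any(rc != 0 for rc in return_codes):
        raise Exception("At least one VMSS instance failed to set up")

# Placeholder values, for illustration only:
# asyncio.run(set_up_all_instances("azureuser", ["10.0.0.4", "10.0.0.5"], "bash setup.sh"))
```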
-
-![Basic Workflow](docs/DCR-workflow.jpg)
-
-![Orchestrator-TestVM Flow](docs/orchestrator-vm-flow.jpg)
-
-## Key-points of DCR v2
-
-- Uses PyPy3.7 for executing python scripts on the Test VMs, alleviating the need to write cross-version compatible code
-- For ease of authoring scripts, the orchestrator python version is pinned to py3.7 for parity
-- Supports Mooncake and Fairfax
-- Sets up the test VM by setting up the new agent as auto-update rather than installing from source (as that's distro dependent), making this less susceptible to setup failures
-- Parameterized inputs make it easier to test specific scenarios and distros if needed, and easier to onboard new distros too
-- Easy to author, as the tests are simple python scripts
-- (M * N) test VMs are created per run, where M is the number of scenarios and N is the number of distros
-- There's a 1:1 relation between the Azure Pipeline agent VM and the test VM. This is to reduce the setup scripts we need to maintain on our end and to utilize Azure Pipelines to the fullest.
-- This framework supports both VM and VMSS deployments and can handle concurrently executing scripts for their setups.
-
-## Author a Test
-
-### Add a test
-- To add a new scenario to the daily runs, simply add a new directory to the `dcr/scenarios` directory and add the scenario name to the `scenarios` parameter in the `dcr/azure-pipelines.yml` file
-- Each file inside the scenario directory is confined to that scenario. To share code between scenarios, add the code in the `dcr/scenario_utils` directory.
-- Can specify `setup.sh` to run any scenario-specific setup on the test VM – e.g., set up a cron job for firewall testing
-- Can specify a `config.json` file to override default parameters of the test – e.g., set VM location to a specific region
-
-### Executing the test scripts
-There are 2 entry points into the test pipeline – **_run.host.py_** and _**run.py**_
-
-#### run.host.py
-- Run the script on the Orchestrator VM
-- Useful for running scenarios that require controlling the test VM externally
-  - Eg: Using Az-cli for adding extensions, restarting VMs, etc.
-- Drop off the result Junit XML file to the `os.environ['BUILD_ARTIFACTSTAGINGDIRECTORY']` directory
-> This is the only script that has access to the KeyVault secrets. The other method does not.
-
-#### run.py
-- Executed via SSH on the test VM
-- Can run anything; the only requirement is to drop off the test result Junit XML file to the home directory – `~/test-result-pf-run.xml` (a sketch follows)
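For instance, a minimal `run.py` might use `junitparser` (listed in `dcr/requirements.txt`, removed later in this diff) to record its checks and drop the result file where the framework expects it. This is only an illustrative sketch; the suite and test case names are made up:

```python
import os
from junitparser import JUnitXml, TestCase, TestSuite

# Record one passing check; a real scenario would add a TestCase per verification.
suite = TestSuite("agent-bvt")
suite.add_testcase(TestCase("check_agent_version"))

xml = JUnitXml()
xml.add_testsuite(suite)

# Drop the result off in the home directory, following the test-result*.xml convention.
xml.write(os.path.expanduser("~/test-result-pf-run.xml"))
```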
-
-### Publishing test results
-- The test framework expects the test results to be in a JUnit XML file. If no file is produced by the test script, then no test results would be published in the pipeline.
-- Depending on the type of test script (orchestrator VM vs the test VM), the JUnit file needs to be dropped off in a specific location.
-- In addition to the directory location, the result file must conform to this naming convention - `test-result*.xml` – e.g., `test-results-bvt-host.xml` or `test-results-ext-seq-run1.xml`
-- The framework will automatically fetch all the test files from both locations and aggregate them into one single file for better readability in the UI (a sketch of this merge step follows).
-- The test results are then visible from the `Tests` tab under the summary UI.
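The aggregation step amounts to merging every file that matches the naming convention. A sketch of that behavior using `junitparser`; the output filename here is illustrative:

```python
import glob
from junitparser import JUnitXml

# Merge every result file that follows the test-result*.xml naming convention.
merged = JUnitXml()
for path in glob.glob("test-result*.xml"):
    merged += JUnitXml.fromfile(path)

merged.write("aggregated-test-results.xml")
```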
-
-## Troubleshooting failures
-- The current implementation provides multiple helper utility tools to ease the process of authoring tests with enough retries and logging.
-- In case of a test failure, the best place to start is the `Test UI` page, as that gives you the exact failures with a stack trace.
-- If that's not enough, you can go into the `summary` page of the run and check the console output at the task level.
-> Tip: The logger implemented in the code also logs to the console, in the format the pipeline expects, making failures readable from the console output itself.
-- Additionally, the `harvest logs` capture all relevant data from the test VM before deleting it. These can be consulted if the failure comes from within the test VM itself.
-- Currently, the logs are not written to a file, but that can be added later if needed.
-
-## Nomenclature
-
-Here's a list of certain terms used widely in the repo -
-
-| Name | Meaning |
-| ------------- |:-------------:|
-| Scenarios | The test directories used to add new test cases to the pipeline. Each directory under dcr/scenarios represents a test scenario that would be tested on the test VM |
-| Test Orchestrator | The VM created by Azure Pipelines that hosts the tests for a specific scenario and a distro |
-| Test VM | The VM created by the pipeline to run the tests. This is where we actually test out the scripts. |
-| Scenario Utils | Directory where all common code is placed |
-| Harvest logs | All the logs from the test VM. Useful for debugging VM related issues |
-| [YML/YAML](https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started/yaml-pipeline-editor?view=azure-devops) | The file format in which the azure pipeline is defined |
-| [Azure Pipelines](https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started/what-is-azure-pipelines?view=azure-devops) | CI/CD tool that we utilize for our DCR testing |
-| JUnit XML | Standard for the test results file that we use to publish our test results |
-| [Jobs](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml) | A job is a series of steps that run sequentially as a unit. In other words, a job is the smallest unit of work that can be scheduled to run. |
-| [Tasks](https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started/key-pipelines-concepts?view=azure-devops#task) | A task is the building block for defining automation in a pipeline. A task is a packaged script or procedure that has been abstracted with a set of inputs. |
-| [Steps](https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started/key-pipelines-concepts?view=azure-devops#step) | A step is the smallest building block of a pipeline. |
-
diff --git a/dcr/azure-cleanup-pipeline.yml b/dcr/azure-cleanup-pipeline.yml
deleted file mode 100644
index 12f899337..000000000
--- a/dcr/azure-cleanup-pipeline.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-# Pipeline for cleaning up any remaining Resource Groups generated by the Azure.WALinuxAgent pipeline
-# Runs every 3 hours and deletes any resource groups that are more than a day old and contain the string dcr-v2-test
-
-schedules:
-  - cron: "0 */3 * * *" # Run every 3 hours
-    displayName: cleanup build
-    branches:
-      include:
-        - develop
-    always: true
-
-# no PR triggers
-pr: none
-
-pool:
-  vmImage: ubuntu-latest
-
-variables:
-  - template: templates/vars.yml
-
-steps:
-
-  - task: AzureCLI@2
-    inputs:
-      azureSubscription: '$(azureConnection)'
-      scriptType: 'bash'
-      scriptLocation: 'inlineScript'
-      inlineScript: |
-        set -euxo pipefail
-        date=`date --utc +%Y-%m-%d'T'%H:%M:%S.%N'Z' -d "1 day ago"`
-
-        # Using the Azure REST GET resourceGroups API call as we can add the createdTime to the results.
- # This feature is not available via the az-cli commands directly so we have to use the Azure REST APIs - - az rest --method GET --url "https://management.azure.com/subscriptions/$(subId)/resourcegroups" --url-parameters api-version=2021-04-01 \$expand=createdTime --output json --query value | jq --arg date "$date" '.[] | select (.createdTime < $date).name' | grep "$(rgPrefix)" | xargs -l -t -r az group delete --no-wait -y -n || echo "No resource groups found to delete" diff --git a/dcr/azure-pipelines.yml b/dcr/azure-pipelines.yml deleted file mode 100644 index 3fcffc6ae..000000000 --- a/dcr/azure-pipelines.yml +++ /dev/null @@ -1,113 +0,0 @@ -parameters: - - name: scenarios - type: object - default: - - agent-bvt - - extension-telemetry-pipeline - - - name: distros - type: object - default: - - publisher: "Canonical" - offer: "UbuntuServer" - version: "latest" - sku: "18.04-LTS" - name: "ubuntu18" - # ToDo: Figure out a better way to incorporate distro setup scripts -# setupPath: "dcr/distros/install_pip_packages.sh" - - - publisher: "Debian" - offer: "debian-10" - sku: "10" - version: "latest" - name: "deb10" -## setupPath: "dcr/distros/install_pip_packages.sh" -# - - publisher: "OpenLogic" - offer: "CentOS" - sku: "7_9" - version: "latest" - name: "cent79" -## - - publisher: "SUSE" - offer: "sles-15-sp2-basic" - sku: "gen1" - version: "latest" - name: "suse15" -## - - publisher: "RedHat" - offer: "RHEL" - sku: "7-RAW" - version: "latest" - name: "rhel7Raw" -## - - publisher: "microsoftcblmariner" - offer: "cbl-mariner" - sku: "cbl-mariner-1" - version: "latest" - name: "mariner1" -## - - publisher: "microsoftcblmariner" - offer: "cbl-mariner" - sku: "cbl-mariner-2" - version: "latest" - name: "mariner2" - -trigger: - - develop - -# no PR triggers -pr: none - -schedules: - - cron: "0 */8 * * *" # Run every 8 hours - displayName: Daily validation builds - branches: - include: - - develop - always: true - -variables: - - template: templates/vars.yml - - - name: SSH_PUBLIC - value: "$(sshPublicKey)" # set in GUI variables - - name: rgNamePrefix - value: "$(rgPrefix)$(Build.BuildId)" - - -pool: #larohra-dcrvmsspool - vmImage: ubuntu-latest - -stages: - - stage: "Execute" - jobs: - - template: 'templates/setup-vm-and-execute-tests.yml' - parameters: - scenarios: - - ${{ parameters.scenarios }} - distros: - - ${{ parameters.distros }} - rgPrefix: $(rgNamePrefix) - - - stage: "Cleanup" - condition: succeededOrFailed() - jobs: - - job: "Wait" - pool: server - # ToDo: Add a parameter to force wait before deleting the Test VMs - condition: in(stageDependencies.Execute.CreateVM.result, 'Failed', 'SucceededWithIssues') - steps: - - task: ManualValidation@0 - timeoutInMinutes: 50 - inputs: - notifyUsers: 'larohra' - onTimeout: 'resume' - - - template: templates/arm-delete.yml - parameters: - scenarios: - - ${{ parameters.scenarios }} - distros: - - ${{ parameters.distros }} - rgPrefix: $(rgNamePrefix) \ No newline at end of file diff --git a/dcr/docs/DCR-workflow.jpg b/dcr/docs/DCR-workflow.jpg deleted file mode 100644 index 0e517f1fa..000000000 Binary files a/dcr/docs/DCR-workflow.jpg and /dev/null differ diff --git a/dcr/docs/orchestrator-vm-flow.jpg b/dcr/docs/orchestrator-vm-flow.jpg deleted file mode 100644 index 76c892636..000000000 Binary files a/dcr/docs/orchestrator-vm-flow.jpg and /dev/null differ diff --git a/dcr/requirements.txt b/dcr/requirements.txt deleted file mode 100644 index 91f91625b..000000000 --- a/dcr/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -# This is a list of pip 
packages that will be installed on both the orchestrator and the test VM
-# Only add the common packages here; for more specific modules, add them to the scenario itself
-azure-identity
-azure-keyvault-keys
-azure-mgmt-compute>=22.1.0
-azure-mgmt-keyvault>=7.0.0
-azure-mgmt-network>=16.0.0
-azure-mgmt-resource>=15.0.0
-cryptography
-distro
-junitparser
-msrestazure
-pudb
-python-dotenv
\ No newline at end of file
diff --git a/dcr/scenario_utils/agent_log_parser.py b/dcr/scenario_utils/agent_log_parser.py
deleted file mode 100644
index 5c67c3a80..000000000
--- a/dcr/scenario_utils/agent_log_parser.py
+++ /dev/null
@@ -1,99 +0,0 @@
-from __future__ import print_function
-
-import os
-import re
-from datetime import datetime
-
-AGENT_LOG_FILE = '/var/log/waagent.log'
-
-# Example:
-#     ProcessExtensionsGoalState completed [etag_2824367392948713696 4073 ms]
-GOAL_STATE_COMPLETED = r"ProcessExtensionsGoalState completed\s\[(?P<etag>[a-z_\d]+)\s(?P<duration>\d+)\sms\]"
-
-# The format of the log has changed over time and the current log may include records from different sources. Most records are single-line, but some of them
-# can span across multiple lines. We will assume records always start with a line similar to the examples below; any other lines will be assumed to be part
-# of the record that is being currently parsed.
-#
-#     Newer Agent: 2019-11-27T22:22:48.123985Z VERBOSE ExtHandler ExtHandler Report vm agent status
-#                  2021-03-30T19:45:33.793213Z INFO ExtHandler [Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent-2.14.64] Target handler state: enabled [incarnation 3]
-#
-#     Older Agent: 2021/03/30 19:35:35.971742 INFO Daemon Azure Linux Agent Version:2.2.45
-#
-#     Extension: 2021/03/30 19:45:31 Azure Monitoring Agent for Linux started to handle.
-#                2021/03/30 19:45:31 [Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.7.0] cwd is /var/lib/waagent/Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.7.0
-#
-_NEW_AGENT_RECORD = re.compile(r'(?P<when>[0-9-]+T[0-9:.]+Z)\s(?P<level>VERBOSE|INFO|WARNING|ERROR)\s(?P<thread>\S+)\s(?P<who>(Daemon)|(ExtHandler)|(\[\S+\]))\s(?P<message>.*)')
-_OLD_AGENT_RECORD = re.compile(r'(?P<when>[0-9/]+\s[0-9:.]+)\s(?P<level>VERBOSE|INFO|WARNING|ERROR)\s(?P<thread>)(?P<who>\S*)\s(?P<message>.*)')
-_EXTENSION_RECORD = re.compile(r'(?P<when>[0-9/]+\s[0-9:.]+)\s(?P<level>)(?P<thread>)((?P<who>\[[^\]]+\])\s)?(?P<message>.*)')
-
-# In 2.2.46, the date time was changed to ISO-8601 format but the thread name was not added. This regex takes care of that.
-# Sample: 2021-05-28T01:17:40.683072Z INFO ExtHandler Wire server endpoint:168.63.129.16
-#         2021-05-28T01:17:40.683823Z WARNING ExtHandler Move rules file 70-persistent-net.rules to /var/lib/waagent/70-persistent-net.rules
-#         2021-05-28T01:17:40.767600Z INFO ExtHandler Successfully added Azure fabric firewall rules
-_46_AGENT_RECORD = re.compile(r'(?P<when>[0-9-]+T[0-9:.]+Z)\s(?P<level>VERBOSE|INFO|WARNING|ERROR)\s(?P<thread>)(?P<who>Daemon|ExtHandler|\[\S+\])\s(?P<message>.*)')
-
-
-class AgentLogRecord:
-    __ERROR_TAGS = ['Exception', 'Traceback', '[CGW]']
-
-    def __init__(self, match):
-        self.text = match.string
-        self.when = match.group("when")
-        self.level = match.group("level")
-        self.thread = match.group("thread")
-        self.who = match.group("who")
-        self.message = match.group("message")
-
-    def get_timestamp(self):
-        return datetime.strptime(self.when, u'%Y-%m-%dT%H:%M:%S.%fZ')
-
-    @property
-    def is_error(self):
-        is_error = self.level in ('ERROR', 'WARNING') or any(err in self.text for err in self.__ERROR_TAGS)
-        #
-        # Don't report errors in the telemetry data.
Sample log line: - # - # 2022-03-27T06:40:46.011455Z VERBOSE SendTelemetryHandler ExtHandler HTTP connection [POST] [/machine?comp=telemetrydata] [ VMMetaData: - return self.__vm_data - - @property - def compute_client(self) -> ComputeManagementClient: - if self.__compute_client is None: - self.__compute_client = ComputeManagementClient( - credential=DefaultAzureCredential(), - subscription_id=self.vm_data.sub_id - ) - return self.__compute_client - - @property - def resource_client(self) -> ResourceManagementClient: - if self.__resource_client is None: - self.__resource_client = ResourceManagementClient( - credential=DefaultAzureCredential(), - subscription_id=self.vm_data.sub_id - ) - return self.__resource_client - - @property - @abstractmethod - def vm_func(self): - pass - - @property - @abstractmethod - def extension_func(self): - pass - - @abstractmethod - def get_vm_instance_view(self): - pass - - @abstractmethod - def get_extensions(self): - pass - - @abstractmethod - def get_extension_instance_view(self, extension_name): - pass - - @abstractmethod - def get_ext_props(self, extension_data, settings=None, protected_settings=None, auto_upgrade_minor_version=True, - force_update_tag=None): - pass - - @abstractmethod - def restart(self, timeout=5): - pass - - def _run_azure_op_with_retry(self, get_func): - max_retries = 3 - retries = max_retries - while retries > 0: - try: - ext = get_func() - return ext - except (CloudError, HttpResponseError) as ce: - if retries > 0: - self.log.exception(f"Got Azure error: {ce}") - self.log.warning("...retrying [{0} attempts remaining]".format(retries)) - retries -= 1 - time.sleep(30 * (max_retries - retries)) - else: - raise - - -class VirtualMachineHelper(AzureComputeBaseClass): - - def __init__(self): - super().__init__() - - @property - def vm_func(self): - return self.compute_client.virtual_machines - - @property - def extension_func(self): - return self.compute_client.virtual_machine_extensions - - def get_vm_instance_view(self) -> VirtualMachineInstanceView: - return self._run_azure_op_with_retry(lambda: self.vm_func.get( - resource_group_name=self.vm_data.rg_name, - vm_name=self.vm_data.name, - expand="instanceView" - )) - - def get_extensions(self) -> List[VirtualMachineExtension]: - return self._run_azure_op_with_retry(lambda: self.extension_func.list( - resource_group_name=self.vm_data.rg_name, - vm_name=self.vm_data.name - )) - - def get_extension_instance_view(self, extension_name) -> VirtualMachineExtensionInstanceView: - return self._run_azure_op_with_retry(lambda: self.extension_func.get( - resource_group_name=self.vm_data.rg_name, - vm_name=self.vm_data.name, - vm_extension_name=extension_name, - expand="instanceView" - )) - - def get_ext_props(self, extension_data, settings=None, protected_settings=None, auto_upgrade_minor_version=True, - force_update_tag=None) -> VirtualMachineExtension: - return VirtualMachineExtension( - location=self.vm_data.location, - publisher=extension_data.publisher, - type_properties_type=extension_data.ext_type, - type_handler_version=extension_data.version, - auto_upgrade_minor_version=auto_upgrade_minor_version, - settings=settings, - protected_settings=protected_settings, - force_update_tag=force_update_tag - ) - - def restart(self, timeout=5): - self.log.info(f"Initiating restart of machine: {self.vm_data.name}") - poller : LROPoller = self._run_azure_op_with_retry(lambda: self.vm_func.begin_restart( - resource_group_name=self.vm_data.rg_name, - vm_name=self.vm_data.name - )) - 
poller.wait(timeout=timeout * 60) - if not poller.done(): - raise TimeoutError(f"Machine {self.vm_data.name} failed to restart after {timeout} mins") - self.log.info(f"Restarted machine: {self.vm_data.name}") - - -class VirtualMachineScaleSetHelper(AzureComputeBaseClass): - - def restart(self, timeout=5): - poller: LROPoller = self._run_azure_op_with_retry(lambda: self.vm_func.begin_restart( - resource_group_name=self.vm_data.rg_name, - vm_scale_set_name=self.vm_data.name - )) - poller.wait(timeout=timeout * 60) - if not poller.done(): - raise TimeoutError(f"ScaleSet {self.vm_data.name} failed to restart after {timeout} mins") - - def __init__(self): - super().__init__() - - @property - def vm_func(self): - return self.compute_client.virtual_machine_scale_set_vms - - @property - def extension_func(self): - return self.compute_client.virtual_machine_scale_set_extensions - - def get_vm_instance_view(self) -> VirtualMachineScaleSetInstanceView: - # Since this is a VMSS, return the instance view of the first VMSS VM. For the instance view of the complete VMSS, - # use the compute_client.virtual_machine_scale_sets function - - # https://docs.microsoft.com/en-us/python/api/azure-mgmt-compute/azure.mgmt.compute.v2019_12_01.operations.virtualmachinescalesetsoperations?view=azure-python - - for vm in self._run_azure_op_with_retry(lambda: self.vm_func.list(self.vm_data.rg_name, self.vm_data.name)): - try: - return self._run_azure_op_with_retry(lambda: self.vm_func.get_instance_view( - resource_group_name=self.vm_data.rg_name, - vm_scale_set_name=self.vm_data.name, - instance_id=vm.instance_id - )) - except Exception as err: - self.log.warning( - f"Unable to fetch instance view of VMSS VM: {vm}. Trying out other instances.\nError: {err}") - continue - - raise Exception(f"Unable to fetch instance view of any VMSS instances for {self.vm_data.name}") - - def get_extensions(self) -> List[VirtualMachineScaleSetExtension]: - return self._run_azure_op_with_retry(lambda: self.extension_func.list( - resource_group_name=self.vm_data.rg_name, - vm_scale_set_name=self.vm_data.name - )) - - def get_extension_instance_view(self, extension_name) -> VirtualMachineExtensionInstanceView: - return self._run_azure_op_with_retry(lambda: self.extension_func.get( - resource_group_name=self.vm_data.rg_name, - vm_scale_set_name=self.vm_data.name, - vmss_extension_name=extension_name, - expand="instanceView" - )) - - def get_ext_props(self, extension_data, settings=None, protected_settings=None, auto_upgrade_minor_version=True, - force_update_tag=None) -> VirtualMachineScaleSetExtension: - return VirtualMachineScaleSetExtension( - publisher=extension_data.publisher, - type_properties_type=extension_data.ext_type, - type_handler_version=extension_data.version, - auto_upgrade_minor_version=auto_upgrade_minor_version, - settings=settings, - protected_settings=protected_settings - ) - - -class ComputeManager: - """ - The factory class for setting the Helper class based on the setting. 
- """ - def __init__(self): - self.__vm_data = get_vm_data_from_env() - self.__compute_manager = None - - @property - def is_vm(self) -> bool: - return self.__vm_data.model_type == VMModelType.VM - - @property - def compute_manager(self): - if self.__compute_manager is None: - self.__compute_manager = VirtualMachineHelper() if self.is_vm else VirtualMachineScaleSetHelper() - return self.__compute_manager diff --git a/dcr/scenario_utils/cgroups_helpers.py b/dcr/scenario_utils/cgroups_helpers.py deleted file mode 100644 index f38827c11..000000000 --- a/dcr/scenario_utils/cgroups_helpers.py +++ /dev/null @@ -1,301 +0,0 @@ -import os -import re -import subprocess -import sys - -from dcr.scenario_utils.distro import get_distro - -BASE_CGROUP = '/sys/fs/cgroup' -AGENT_CGROUP_NAME = 'WALinuxAgent' -AGENT_SERVICE_NAME = "walinuxagent.service" -CONTROLLERS = ['cpu'] # Only verify the CPU controller since memory accounting is not enabled yet. - -DAEMON_CMDLINE_PATTERN = re.compile(r".*python.*waagent.*-daemon") -AGENT_CMDLINE_PATTERN = re.compile(r".*python.*-run-exthandlers") - -CREATED_CGROUP_PATTERN = r"..*Created cgroup (/sys/fs/cgroup/.+)" -EXTENSION_PID_ADDED_PATTERN = re.compile(r".*Added PID (\d+) to cgroup[s]* (/sys/fs/cgroup/.+)") -CGROUP_TRACKED_PATTERN = re.compile(r'Started tracking cgroup ([^\s]+)\s+\[(?P[^\s]+)\]') - -# -# It is OK for these processes to show up in the Agent's cgroup -# -WHITELISTED_AGENT_REGEXES = [ - # - # The monitor thread uses these periodically: - # - re.compile(r"/sbin/dhclient\s.+/run/dhclient.*/var/lib/dhcp/dhclient.*/var/lib/dhcp/dhclient.*"), - re.compile(r".*iptables --version.*"), - re.compile(r".*iptables (-w)? -t security.*"), - # - # The agent starts extensions using systemd-run; the actual extension command will be on a different process. - # - re.compile(r".*systemd-run --unit=Microsoft.Azure.Diagnostics.LinuxDiagnostic_3.* " - r"--scope --slice=azure-vmextensions.slice /var/lib/waagent/Microsoft.Azure.Diagnostics.LinuxDiagnostic-3.*/diagnostic.py " - r"-enable.*"), - # - # The agent can start a new shell process. - # - re.compile(r"^\[sh\]$") -] - - -def exit_if_cgroups_not_supported(): - print("===== Checking if distro supports cgroups =====") - - __distro__ = get_distro() - base_fs_exists = os.path.exists(BASE_CGROUP) - - if not base_fs_exists: - print("\tDistro {0} does not support cgroups -- exiting".format(__distro__)) - sys.exit(1) - else: - print('\tDistro {0} supports cgroups\n'.format(__distro__)) - - -def run_get_output(cmd, print_std_out=False): - # Returns a list of stdout lines without \n at the end of the line. 
- output = subprocess.check_output(cmd, - stderr=subprocess.STDOUT, - shell=True) - output = str(output, - encoding='utf-8', - errors="backslashreplace") - - if print_std_out: - print(output) - - return output.split("\n") - - -def is_systemd_distro(): - try: - return run_get_output('cat /proc/1/comm')[0].strip() == 'systemd' - except Exception: - return False - - -def print_cgroups(): - print("====== Currently mounted cgroups ======") - for m in run_get_output('mount'): - if 'type cgroup' in m: - print('\t{0}'.format(m)) - print("") - - -def print_processes(): - print("====== Currently running processes ======") - processes = run_get_output("ps aux --forest") - for process in processes: - print("\t{0}".format(process)) - print("") - - -def print_service_status(service_status): - # Make sure to replace non-ascii characters since DCR logs anything that goes to stdout and will fail if - # there are non-ascii characters such as the ones showing up in `systemctl status {service_name}`. - for line in service_status: - print("\t" + line.encode("ascii", "replace").decode().replace("\n", "")) - print("") - - -def get_parent_pid(pid): - try: - with open("/proc/{0}/stat".format(pid), "r") as fh: - raw = fh.readline() - ppid = raw.split(" ")[3] - return ppid - except Exception: - return None - - -def get_pid_by_cmdline(pattern): - agent_pid = -1 - - for dirname in os.listdir('/proc'): - if dirname == 'curproc': - continue - - try: - with open('/proc/{0}/cmdline'.format(dirname), mode='r') as fd: - ps_cmd = fd.read() - if re.match(pattern, ps_cmd): - agent_pid = dirname - break - except Exception: - pass - - return agent_pid - - -def get_cmdline_by_pid(pid): - try: - with open('/proc/{0}/cmdline'.format(pid), mode='r') as process_fd: - return process_fd.read() - except Exception: - return None - - -def get_process_cgroups(pid): - with open('/proc/{0}/cgroup'.format(pid), mode='r') as fd: - return fd.read().split('\n')[:-1] - - -def get_agent_cgroup_mount_path(): - # TODO: change the service name based on distro (SUSE is waagent, for example) - if is_systemd_distro(): - return os.path.join('/', 'azure.slice', AGENT_SERVICE_NAME) - else: - return os.path.join('/', AGENT_SERVICE_NAME) - - -def check_cgroup_for_agent_process(name, pid): - process_cgroups = get_process_cgroups(pid) - expected_cgroup_path = get_agent_cgroup_mount_path() - - print('\tretrieved cgroups for {0}:'.format(name)) - for cgroup in process_cgroups: - print("\t\t{0}".format(cgroup)) - print("") - - for controller in CONTROLLERS: - for cgroup in process_cgroups: - # This is what the lines in /proc/PID/cgroup look like: - # 4:memory:/system.slice/walinuxagent.service - # 7:memory:/WALinuxAgent/Microsoft.EnterpriseCloud.Monitoring.OmsAgentForLinux - # We are interested in extracting the controller and mount path - mounted_controller = cgroup.split(':')[1].split(',') - mounted_path = cgroup.split(':')[2] - if controller in mounted_controller: - if mounted_path != expected_cgroup_path: - raise Exception("Expected {0} cgroup to be mounted under {1}, " - "but it's mounted under {2}".format(name, expected_cgroup_path, mounted_path)) - - print("\t{0}'s PID is {1}, cgroup mount path is {2}".format(name, pid, expected_cgroup_path)) - print("\tverified {0}'s /proc/cgroup is expected!\n".format(name)) - - -def check_pids_in_agent_cgroup(agent_cgroup_procs, daemon_pid, agent_pid): - with open(agent_cgroup_procs, "r") as agent_fd: - content = agent_fd.read() - print("\tcontent of {0}:\n{1}".format(agent_cgroup_procs, content)) - - pids = 
content.split('\n')[:-1] - - if daemon_pid not in pids: - raise Exception("Daemon PID {0} not found in expected cgroup {1}!".format(daemon_pid, agent_cgroup_procs)) - - if agent_pid not in pids: - raise Exception("Agent PID {0} not found in expected cgroup {1}!".format(agent_pid, agent_cgroup_procs)) - - for pid in pids: - if pid == daemon_pid or pid == agent_pid: - continue - else: - # There is an unexpected PID in the cgroup, check what process it is - cmd = get_cmdline_by_pid(pid) - ppid = get_parent_pid(pid) - whitelisted = is_whitelisted(cmd) - - # If the process is whitelisted and a child of the agent, allow it. The process could have terminated - # in the meantime, but we allow it if it's whitelisted. - if whitelisted and (ppid is None or ppid == agent_pid or ppid == daemon_pid): - print("\tFound whitelisted process in agent cgroup:\n\t{0} {1}\n" - "\tparent process {2}".format(pid, cmd, ppid)) - continue - - raise Exception("Found unexpected process in the agent cgroup:\n\t{0} {1}\n" - "\tparent process {2}".format(pid, cmd, ppid)) - - return True - - -def is_whitelisted(cmd): - matches = [re.match(r, cmd) is not None for r in WHITELISTED_AGENT_REGEXES] - return any(matches) - - -def parse_processes_from_systemctl_status(service_status): - processes_start_pattern = re.compile(r".*CGroup:\s+.*") - processes_end_pattern = re.compile(r"^$") - - processes_start_index = -1 - processes_end_index = -1 - - for line in service_status: - if re.match(processes_start_pattern, line): - processes_start_index = service_status.index(line) - if re.match(processes_end_pattern, line): - processes_end_index = service_status.index(line) - break - - processes_raw = service_status[processes_start_index+1:processes_end_index] - - # Remove non-ascii characters and extra whitespace - cleaned = list(map(lambda x: ''.join([i if ord(i) < 128 else '' for i in x]).strip(), processes_raw)) - - # Return a list of tuples [(PID1, cmdline1), (PID2, cmdline2)] - processes = list(map(lambda x: (x.split(" ")[0], ' '.join(x.split(" ")[1:])), cleaned)) - - return processes - - -def verify_agent_cgroup_assigned_correctly_systemd(service_status): - print_service_status(service_status) - - is_active = False - is_active_pattern = re.compile(r".*Active:\s+active.*") - - for line in service_status: - if re.match(is_active_pattern, line): - is_active = True - - if not is_active: - raise Exception('walinuxagent service was not active') - - print("\tVerified the agent service status is correct!\n") - - -def verify_agent_cgroup_assigned_correctly_filesystem(): - print("===== Verifying the daemon and the agent are assigned to the same correct cgroup using filesystem =====") - - # Find out daemon and agent PIDs by looking at currently running processes - daemon_pid = get_pid_by_cmdline(DAEMON_CMDLINE_PATTERN) - agent_pid = get_pid_by_cmdline(AGENT_CMDLINE_PATTERN) - - if daemon_pid == -1: - raise Exception('daemon PID not found!') - - if agent_pid == -1: - raise Exception('agent PID not found!') - - # Ensure both the daemon and the agent are assigned to the (same) expected cgroup - check_cgroup_for_agent_process("daemon", daemon_pid) - check_cgroup_for_agent_process("agent", agent_pid) - - # Ensure the daemon/agent cgroup doesn't have any other processes there - for controller in CONTROLLERS: - # Mount path is /system.slice/walinuxagent.service or - # /WALinuxAgent/WALinuxAgent, so remove the first "/" to correctly build path - agent_cgroup_mount_path = get_agent_cgroup_mount_path()[1:] - agent_cgroup_path = os.path.join(BASE_CGROUP, 
controller, agent_cgroup_mount_path) - agent_cgroup_procs = os.path.join(agent_cgroup_path, 'cgroup.procs') - - # Check if the processes in the agent cgroup are expected. We expect to see the daemon and extension handler - # processes. Sometimes, we might observe more than one extension handler process. This is short-lived and - # happens because, in Linux, the process doubles before forking. Therefore, check twice with a bit of delay - # in between to see if it goes away. Still raise an exception if this happens so we can keep track of it. - check_pids_in_agent_cgroup(agent_cgroup_procs, daemon_pid, agent_pid) - - print('\tVerified the daemon and agent are assigned to the same correct cgroup {0}'.format(agent_cgroup_path)) - print("") - - -def verify_agent_cgroup_assigned_correctly(): - if is_systemd_distro(): - print("===== Verifying the daemon and the agent are assigned to the same correct cgroup using systemd =====") - output = run_get_output("systemctl status walinuxagent") - verify_agent_cgroup_assigned_correctly_systemd(output) - else: - verify_agent_cgroup_assigned_correctly_filesystem() - diff --git a/dcr/scenario_utils/check_waagent_log.py b/dcr/scenario_utils/check_waagent_log.py deleted file mode 100644 index cfc6668bc..000000000 --- a/dcr/scenario_utils/check_waagent_log.py +++ /dev/null @@ -1,207 +0,0 @@ -import re - -from dcr.scenario_utils.agent_log_parser import AGENT_LOG_FILE, parse_agent_log_file -from dcr.scenario_utils.cgroups_helpers import is_systemd_distro -from dcr.scenario_utils.distro import get_distro - - -def check_waagent_log_for_errors(waagent_log=AGENT_LOG_FILE, ignore=None): - # Returns any ERROR messages from the log except transient ones. - # Currently, the only transient one is /proc/net/route not being set up if it's being reported before - # provisioning was completed. In that case, we ignore that error message. - - no_routes_error = None - provisioning_complete = False - - distro = "".join(get_distro()) - systemd_enabled = is_systemd_distro() - - # - # NOTES: - # * 'message' is matched using re.search; be sure to escape any regex metacharacters - # * 'if' receives as parameter an AgentLogRecord - # - ignore_list = [ - # This warning is expected on CentOS/RedHat 7.8 and Redhat 7.6 - { - 'message': r"Move rules file 70-persistent-net.rules to /var/lib/waagent/70-persistent-net.rules", - 'if': lambda log_line: re.match(r"((centos7\.8)|(redhat7\.8)|(redhat7\.6)|(redhat8\.2))\D*", distro, - flags=re.IGNORECASE) is not None and log_line.level == "WARNING" and - log_line.who == "ExtHandler" and log_line.thread in ("", "EnvHandler") - }, - # This warning is expected on SUSE 12 - { - 'message': r"WARNING EnvHandler ExtHandler Move rules file 75-persistent-net-generator.rules to /var/lib/waagent/75-persistent-net-generator.rules", - 'if': lambda _: re.match(r"((sles15\.2)|suse12)\D*", distro, flags=re.IGNORECASE) is not None - }, - # This warning is expected on when WireServer gives us the incomplete goalstate without roleinstance data - { - 'message': r"\[ProtocolError\] Fetched goal state without a RoleInstance", - }, - # The following message is expected to log an error if systemd is not enabled on it - { - 'message': r"Did not detect Systemd, unable to set wa(|linux)agent-network-setup.service", - 'if': lambda _: not systemd_enabled - }, - # Download warnings (manifest and zips). 
-        #
-        # Examples:
-        #     2021-03-31T03:48:35.216494Z WARNING ExtHandler ExtHandler Fetch failed: [HttpError] [HTTP Failed] GET https://zrdfepirv2cbn04prdstr01a.blob.core.windows.net/f72653efd9e349ed9842c8b99e4c1712/Microsoft.CPlat.Core_NullSeqA_useast2euap_manifest.xml -- IOError ('The read operation timed out',) -- 1 attempts made
-        #     2021-03-31T06:54:29.655861Z WARNING ExtHandler ExtHandler Fetch failed: [HttpError] [HTTP Retry] GET http://168.63.129.16:32526/extensionArtifact -- Status Code 502 -- 1 attempts made
-        #     2021-03-31T06:43:17.806663Z WARNING ExtHandler ExtHandler Download failed, switching to host plugin
-        {
-            'message': r"(Fetch failed: \[HttpError\] .+ GET .+ -- [0-9]+ attempts made)|(Download failed, switching to host plugin)",
-            'if': lambda log_line: log_line.level == "WARNING" and log_line.who == "ExtHandler" and log_line.thread == "ExtHandler"
-        },
-        # Sometimes it takes the Daemon some time to identify the primary interface and the route to Wireserver,
-        # so we ignore those errors if they come from the Daemon.
-        {
-            'message': r"(No route exists to \d+\.\d+\.\d+\.\d+|"
-                       r"Could not determine primary interface, please ensure \/proc\/net\/route is correct|"
-                       r"Contents of \/proc\/net\/route:|Primary interface examination will retry silently|"
-                       r"\/proc\/net\/route contains no routes)",
-            'if': lambda log_line: log_line.who == "Daemon"
-        },
-        # Journalctl in Debian 8.11 does not have the --utc option by default.
-        # We ignore this error for Debian 8 since it's not a blocker, and Debian 8 is old and not widely used.
-        {
-            'message': r"journalctl: unrecognized option '--utc'",
-            'if': lambda log_line: re.match(r"(debian8\.11)\D*", distro,
-                                            flags=re.IGNORECASE) is not None and log_line.level == "WARNING"
-        },
-        # 2021-07-09T01:46:53.307959Z INFO MonitorHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Check on cgroups failed:
-        # [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 2367] UNKNOWN']
-        {
-            'message': r"The agent's cgroup includes unexpected processes: \[('\[PID:\s?\d+\]\s*UNKNOWN'(,\s*)?)+\]"
-        },
-        # Probably the agent should log this as INFO, but for now it is a warning
-        # e.g.
-        #     2021-07-29T04:40:17.190879Z WARNING EnvHandler ExtHandler Dhcp client is not running.
-        {
-            'message': r"WARNING EnvHandler ExtHandler Dhcp client is not running."
-        },
-        # 2021-12-20T07:46:23.020197Z INFO ExtHandler ExtHandler [CGW] The agent's process is not within a memory cgroup
-        {
-            'message': r"The agent's process is not within a memory cgroup",
-            'if': lambda log_line: re.match(r"((centos7\.8)|(centos7\.9)|(redhat7\.8)|(redhat8\.2))\D*", distro,
-                                            flags=re.IGNORECASE)
-        },
-        #
-        # 2022-01-20T06:52:21.515447Z WARNING Daemon Daemon Fetch failed: [HttpError] [HTTP Failed] GET https://dcrgajhx62.blob.core.windows.net/$system/edprpwqbj6.5c2ddb5b-d6c3-4d73-9468-54419ca87a97.vmSettings -- IOError timed out -- 6 attempts made
-        #
-        # The daemon does not need the artifacts profile blob, but the request is done as part of protocol initialization. This timeout can be ignored; if the issue persists, the log will include additional instances.
-        #
-        {
-            'message': r"\[HTTP Failed\] GET https://.*\.vmSettings -- IOError timed out",
-            'if': lambda log_line: log_line.level == "WARNING" and log_line.who == "Daemon"
-        },
-        #
-        # 2022-02-09T04:50:37.384810Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: 2bed9b62-188e-4668-b1a8-87c35cfa4927 eTag: 7031887032544600793]: [Internal error in HostGAPlugin] [HTTP Failed] [502: Bad Gateway] b'{ "errorCode": "VMArtifactsProfileBlobContentNotFound", "message": "VM artifacts profile blob has no content in it.", "details": ""}'
-        #
-        # Fetching the goal state may catch the HostGAPlugin in the process of computing the vmSettings. This can be ignored; if the issue persists, the log will include additional instances.
-        #
-        {
-            'message': r"\[ProtocolError\] GET vmSettings.*VMArtifactsProfileBlobContentNotFound",
-            'if': lambda log_line: log_line.level == "ERROR"
-        },
-        #
-        # 2021-12-29T06:50:49.904601Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] [HTTP Failed] [410: Gone] The page you requested was removed.
-        # 2022-03-21T02:44:03.770017Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] Resource is gone
-        # 2022-02-16T04:46:50.477315Z WARNING Daemon Daemon Fetching the goal state failed: [ResourceGoneError] [HTTP Failed] [410: Gone] b'<?xml version="1.0" encoding="utf-8"?>\n<Error>\n    <Code>ResourceNotAvailable</Code>\n    <Message>The resource requested is no longer available. Please refresh your cache.</Message>\n</Error>'
-        #
-        # ResourceGone can happen if we are fetching one of the URIs in the goal state and a new goal state arrives
-        {
-            'message': r"(?s)(Fetching the goal state failed|Error fetching goal state|Error fetching the goal state).*(\[ResourceGoneError\]|\[410: Gone\]|Resource is gone)",
-            'if': lambda log_line: log_line.level in ("WARNING", "ERROR")
-        },
-        #
-        # 2022-03-08T03:03:23.036161Z WARNING ExtHandler ExtHandler Fetch failed from [http://168.63.129.16:32526/extensionArtifact]: [HTTP Failed] [400: Bad Request] b''
-        # 2022-03-08T03:03:23.042008Z WARNING ExtHandler ExtHandler Fetch failed: [ProtocolError] Fetch failed from [http://168.63.129.16:32526/extensionArtifact]: [HTTP Failed] [400: Bad Request] b''
-        #
-        # Warning downloading the extension manifest. If the issue persists, this would cause errors elsewhere, so it is safe to ignore
-        {
-            'message': r"\[http://168.63.129.16:32526/extensionArtifact\]: \[HTTP Failed\] \[400: Bad Request\]",
-            'if': lambda log_line: log_line.level == "WARNING"
-        },
-        #
-        # 2022-03-29T05:52:10.089958Z WARNING ExtHandler ExtHandler An error occurred while retrieving the goal state: [ProtocolError] GET vmSettings [correlation ID: da106cf5-83a0-44ec-9484-d0e9223847ab eTag: 9856274988128027586]: Timeout
-        #
-        # Ignore warnings about timeouts in vmSettings; if the condition persists, an error will occur elsewhere.
-        #
-        {
-            'message': r"GET vmSettings \[[^]]+\]: Timeout",
-            'if': lambda log_line: log_line.level == "WARNING"
-        },
-        # 2022-03-09T20:04:33.745721Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=Install, message=[ExtensionOperationError] \
-        #     Non-zero exit code: 51, /var/lib/waagent/Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.15.3/./shim.sh -install
-        #
-        # This is a known issue where AMA does not support Mariner 2.0. Please remove when support is
-        # added in the next AMA release (1.16.x).
-        {
-            'message': r"Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=Install, message=\[ExtensionOperationError\] Non-zero exit code: 51",
-            'if': lambda log_line: "Mariner2.0" in distro and log_line.level == "ERROR" and log_line.who == "ExtHandler"
        },
-        # 2022-03-18T00:13:37.063540Z INFO ExtHandler ExtHandler [CGW] The daemon's PID was added to a legacy cgroup; will not monitor resource usage.
-        #
-        # The agent disables cgroups in older versions of the daemon (2.2.31-2.2.40). This is a known issue and can be ignored.
- { - 'message': r"The daemon's PID was added to a legacy cgroup; will not monitor resource usage" - } - ] - - if ignore is not None: - ignore_list.extend(ignore) - - def can_be_ignored(log_line): - return any(re.search(msg['message'], log_line.text) is not None and ('if' not in msg or msg['if'](log_line)) for msg in ignore_list) - - errors = [] - - for agent_log_line in parse_agent_log_file(waagent_log): - if agent_log_line.is_error and not can_be_ignored(agent_log_line): - # Handle "/proc/net/route contains no routes" as a special case since it can take time for the - # primary interface to come up and we don't want to report transient errors as actual errors - if "/proc/net/route contains no routes" in agent_log_line.text: - no_routes_error = agent_log_line.text - provisioning_complete = False - else: - errors.append(agent_log_line.text) - - if "Provisioning complete" in agent_log_line.text: - provisioning_complete = True - - # Keep the "no routes found" as a genuine error message if it was never corrected - if no_routes_error is not None and not provisioning_complete: - errors.append(no_routes_error) - - if len(errors) > 0: - # print('waagent.log contains the following ERROR(s):') - # for item in errors: - # print(item.rstrip()) - raise Exception("waagent.log contains the following ERROR(s): {0}".format('\n '.join(errors))) - - print(f"No errors/warnings found in {waagent_log}") - - -def is_data_in_waagent_log(data): - """ - This function looks for the specified test data string in the WALinuxAgent logs and returns if found or not. - :param data: The string to look for in the agent logs - :raises: Exception if data string not found - """ - for agent_log_line in parse_agent_log_file(): - if data in agent_log_line.text: - print("Found data: {0} in line: {1}".format(data, agent_log_line.text)) - return - - raise AssertionError("waagent.log file did not have the data string: {0}".format(data)) - diff --git a/dcr/scenario_utils/common_utils.py b/dcr/scenario_utils/common_utils.py deleted file mode 100644 index b1d58ab73..000000000 --- a/dcr/scenario_utils/common_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -import asyncio -import math -import os -import secrets -import subprocess -import time -from datetime import datetime -from typing import List - -from dcr.scenario_utils.distro import get_distro -from dcr.scenario_utils.logging_utils import get_logger -from dcr.scenario_utils.models import get_vm_data_from_env - -logger = get_logger("dcr.scenario_utils.common_utils") - - -def get_current_agent_name(distro_name=None): - """ - Only Ubuntu and Debian used walinuxagent, everyone else uses waagent. 
- Note: If distro_name is not specified, we will search the distro in the VM itself - :return: walinuxagent or waagent - """ - - if distro_name is None: - distro_name = get_distro()[0] - - walinuxagent_distros = ["ubuntu", "debian"] - if any(dist.lower() in distro_name.lower() for dist in walinuxagent_distros): - return "walinuxagent" - - return "waagent" - - -def execute_command_and_raise_on_error(command, shell=False, timeout=None, stdout=subprocess.PIPE, - stderr=subprocess.PIPE): - pipe = subprocess.Popen(command, shell=shell, stdout=stdout, stderr=stderr) - stdout, stderr = pipe.communicate(timeout=timeout) - - logger.info("STDOUT:\n{0}".format(stdout.decode())) - logger.info("STDERR:\n{0}".format(stderr.decode())) - if pipe.returncode != 0: - raise Exception("non-0 exit code: {0} for command: {1}".format(pipe.returncode, command)) - - return stdout.decode().strip(), stderr.decode().strip() - - -def execute_py_script_over_ssh_on_test_vms(command: str): - """ - Execute a python script over SSH on test VMs. If there are multiple VMs, this will execute the script on all VMs concurrently. - The script should be relative to the dcr/ directory. It uses the PyPy interpreter to execute the script and - logs the stdout/stderr of the script - raises: Exception if any script exits with non-0 exit code. - """ - ssh_cmd = f"ssh -o StrictHostKeyChecking=no {{username}}@{{ip}} sudo PYTHONPATH=. {os.environ['PYPYPATH']} /home/{{username}}/{command}" - asyncio.run(execute_commands_concurrently_on_test_vms([ssh_cmd])) - logger.info(f"Finished executing SSH command: {ssh_cmd}") - - -def random_alphanum(length: int) -> str: - if length == 0: - return '' - elif length < 0: - raise ValueError('negative argument not allowed') - else: - text = secrets.token_hex(nbytes=math.ceil(length / 2)) - is_length_even = length % 2 == 0 - return text if is_length_even else text[1:] - - -async def execute_commands_concurrently_on_test_vms(commands: List[str], timeout: int = 5): - vm_data = get_vm_data_from_env() - tasks = [ - asyncio.create_task(_execute_commands_on_vm_async(commands=commands, username=vm_data.admin_username, ip=ip_)) - for ip_ in vm_data.ips] - return await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=False), timeout=timeout * 60) - - -async def _execute_commands_on_vm_async(commands: List[str], username: str, ip: str, max_retry: int = 5): - """ - Execute the list of commands synchronously on the VM. This runs as an async operation. - The code also replaces the {username} and {ip} in the command string with their actual values before executing the command. - """ - attempt = 0 - - for command in commands: - cmd = command.format(ip=ip, username=username) - stdout, stderr = "", "" - # ToDo: Separate out retries due to network error vs retries due to test failures. - # The latter should be only once (or as specified by the test author). - # https://msazure.visualstudio.com/One/_workitems/edit/12377120 - while attempt < max_retry: - try: - proc = await asyncio.create_subprocess_shell(cmd, stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE) - stdout, stderr = await proc.communicate() - stdout = stdout.decode('utf-8') - stderr = stderr.decode('utf-8') - if proc.returncode != 0: - raise Exception(f"Failed command: {cmd}. 
Exit Code: {proc.returncode}") - break - - except asyncio.CancelledError as err: - logger.warning(f"Task was cancelled: {cmd}; {err}") - try: - proc.terminate() - except: - # Eat all exceptions when trying to terminate a process that has been Cancelled - pass - finally: - return - - except Exception as err: - attempt += 1 - if attempt < max_retry: - logger.warning(f"[{username}/{ip}] ({attempt}/{max_retry}) Failed to execute command {cmd}: {err}. Retrying in 3 secs", - exc_info=True) - await asyncio.sleep(3) - else: - raise - - finally: - print(f"##[group][{username}/{ip}] - Attempts ({attempt}/{max_retry})") - print(f"##[command]{cmd}") - if stdout: - logger.info(f"Stdout: {stdout}") - if stderr: - logger.warning(f"Stderr: {stderr}") - print("##[endgroup]") - - -def execute_with_retry(func, max_retry=3, sleep=5): - retry = 0 - while retry < max_retry: - try: - func() - return - except Exception as error: - print("{0} Op failed with error: {1}. Retry: {2}, total attempts: {3}".format(datetime.utcnow().isoformat(), - error, retry + 1, max_retry)) - retry += 1 - if retry < max_retry: - time.sleep(sleep) - continue - raise - - -def read_file(log_file): - if not os.path.exists(log_file): - raise Exception("{0} file not found!".format(log_file)) - - with open(log_file) as f: - lines = list(map(lambda _: _.strip(), f.readlines())) - - return lines diff --git a/dcr/scenario_utils/crypto.py b/dcr/scenario_utils/crypto.py deleted file mode 100644 index f7555be0c..000000000 --- a/dcr/scenario_utils/crypto.py +++ /dev/null @@ -1,60 +0,0 @@ -import os - -from cryptography.hazmat.primitives import serialization -from cryptography.hazmat.primitives.asymmetric import rsa -from cryptography.hazmat.backends import default_backend - -from dcr.scenario_utils.common_utils import random_alphanum - - -class OpenSshKey(object): - """ - Represents an OpenSSH key pair. - """ - - def __init__(self, public_key: bytes, private_key: bytes): - self._private_key = private_key - self._public_key = public_key - - @property - def private_key(self) -> bytes: - return self._private_key - - @property - def public_key(self) -> bytes: - return self._public_key - - -class OpenSshKeyFactory(object): - @staticmethod - def create() -> OpenSshKey: - key = rsa.generate_private_key( - public_exponent=65537, - key_size=2048, - backend=default_backend() - ) - - public_key = key.public_key().public_bytes( - serialization.Encoding.OpenSSH, - serialization.PublicFormat.OpenSSH - ) - - private_key = key.private_bytes( - serialization.Encoding.PEM, - serialization.PrivateFormat.PKCS8, - serialization.NoEncryption() - ) - - return OpenSshKey(public_key, private_key) - - -def generate_ssh_key_pair(key_prefix='dcr_id_rsa'): - # New SSH public/private keys - ssh_keys = OpenSshKeyFactory().create() - - private_key_file_name = '{0}_{1}'.format(key_prefix, random_alphanum(10)) - with open(private_key_file_name, 'wb') as fh: - fh.write(ssh_keys.private_key) - private_key_file = os.path.abspath(private_key_file_name) - - return ssh_keys.public_key.decode('utf-8'), private_key_file \ No newline at end of file diff --git a/dcr/scenario_utils/distro.py b/dcr/scenario_utils/distro.py deleted file mode 100644 index eaa687b58..000000000 --- a/dcr/scenario_utils/distro.py +++ /dev/null @@ -1,40 +0,0 @@ -import platform -import sys - -import distro - - -def get_distro(): - """ - In some distros, e.g. 
SUSE 15, platform.linux_distribution is present, - but returns an empty value - so we also try distro.linux_distribution in those cases - """ - osinfo = [] - if hasattr(platform, 'linux_distribution'): - osinfo = list(platform.linux_distribution( - full_distribution_name=0, - supported_dists=platform._supported_dists + ('alpine',))) - - # Remove trailing whitespace and quote in distro name - - osinfo[0] = osinfo[0].strip('"').strip(' ').lower() - if not osinfo or not len(osinfo[0]): - # platform.linux_distribution() is deprecated, the suggested option is to use distro module - osinfo = distro.linux_distribution() - - return osinfo - - -def print_distro_info(): - print('\n--== distro ==--') - distro_name = get_distro() - - print('DISTRO_NAME = {0}'.format(distro_name[0])) - print('DISTRO_VERSION = {0}'.format(distro_name[1])) - print('DISTRO_CODE_NAME = {0}'.format(distro_name[2])) - - print('PY_VERSION = {0}'.format(sys.version_info)) - print('PY_VERSION_MAJOR = {0}'.format(sys.version_info[0])) - print('PY_VERSION_MINOR = {0}'.format(sys.version_info[1])) - print('PY_VERSION_MICRO = {0}'.format(sys.version_info[2])) diff --git a/dcr/scenario_utils/extensions/BaseExtensionTestClass.py b/dcr/scenario_utils/extensions/BaseExtensionTestClass.py deleted file mode 100644 index 8c23e1e71..000000000 --- a/dcr/scenario_utils/extensions/BaseExtensionTestClass.py +++ /dev/null @@ -1,113 +0,0 @@ -import time -from typing import List - -from azure.core.polling import LROPoller - -from dcr.scenario_utils.azure_models import ComputeManager -from dcr.scenario_utils.logging_utils import LoggingHandler -from dcr.scenario_utils.models import ExtensionMetaData, get_vm_data_from_env - - -class BaseExtensionTestClass(LoggingHandler): - - def __init__(self, extension_data: ExtensionMetaData): - super().__init__() - self.__extension_data = extension_data - self.__vm_data = get_vm_data_from_env() - self.__compute_manager = ComputeManager().compute_manager - - def get_ext_props(self, settings=None, protected_settings=None, auto_upgrade_minor_version=True, - force_update_tag=None): - - return self.__compute_manager.get_ext_props( - extension_data=self.__extension_data, - settings=settings, - protected_settings=protected_settings, - auto_upgrade_minor_version=auto_upgrade_minor_version, - force_update_tag=force_update_tag - ) - - def run(self, ext_props: List, remove: bool = True, continue_on_error: bool = False): - - def __add_extension(): - extension: LROPoller = self.__compute_manager.extension_func.begin_create_or_update( - self.__vm_data.rg_name, - self.__vm_data.name, - self.__extension_data.name, - ext_prop - ) - self.log.info("Add extension: {0}".format(extension.result(timeout=5 * 60))) - - def __remove_extension(): - self.__compute_manager.extension_func.begin_delete( - self.__vm_data.rg_name, - self.__vm_data.name, - self.__extension_data.name - ).result() - self.log.info(f"Delete vm extension {self.__extension_data.name} successful") - - def _retry_on_retryable_error(func): - retry = 1 - while retry < 5: - try: - func() - break - except Exception as err_: - if "RetryableError" in str(err_) and retry < 5: - self.log.warning(f"({retry}/5) Ran into RetryableError, retrying in 30 secs: {err_}") - time.sleep(30) - retry += 1 - continue - raise - - try: - for ext_prop in ext_props: - try: - _retry_on_retryable_error(__add_extension) - # Validate success from instance view - _retry_on_retryable_error(self.validate_ext) - except Exception as err: - if continue_on_error: - self.log.exception("Ran into error but 
ignoring it as asked: {0}".format(err)) - continue - else: - raise - finally: - # Always try to delete extensions if asked to remove even on errors - if remove: - _retry_on_retryable_error(__remove_extension) - - def validate_ext(self): - """ - Validate if the extension operation was successful from the Instance View - :raises: Exception if either unable to fetch instance view or if extension not successful - """ - retry = 0 - max_retry = 3 - ext_instance_view = None - status = None - - while retry < max_retry: - try: - ext_instance_view = self.__compute_manager.get_extension_instance_view(self.__extension_data.name) - if ext_instance_view is None: - raise Exception("Extension not found") - elif not ext_instance_view.instance_view: - raise Exception("Instance view not present") - elif not ext_instance_view.instance_view.statuses or len(ext_instance_view.instance_view.statuses) < 1: - raise Exception("Instance view status not present") - else: - status = ext_instance_view.instance_view.statuses[0].code - status_message = ext_instance_view.instance_view.statuses[0].message - self.log.info('Extension Status: \n\tCode: [{0}]\n\tMessage: {1}'.format(status, status_message)) - break - except Exception as err: - self.log.exception(f"Ran into error: {err}") - retry += 1 - if retry < max_retry: - self.log.info("Retrying in 30 secs") - time.sleep(30) - raise - - if 'succeeded' not in status: - raise Exception(f"Extension did not succeed. Last Instance view: {ext_instance_view}") diff --git a/dcr/scenario_utils/extensions/CustomScriptExtension.py b/dcr/scenario_utils/extensions/CustomScriptExtension.py deleted file mode 100644 index 29df35113..000000000 --- a/dcr/scenario_utils/extensions/CustomScriptExtension.py +++ /dev/null @@ -1,29 +0,0 @@ -import uuid - -from dcr.scenario_utils.extensions.BaseExtensionTestClass import BaseExtensionTestClass -from dcr.scenario_utils.models import ExtensionMetaData - - -class CustomScriptExtension(BaseExtensionTestClass): - META_DATA = ExtensionMetaData( - publisher='Microsoft.Azure.Extensions', - ext_type='CustomScript', - version="2.1" - ) - - def __init__(self, extension_name: str): - extension_data = self.META_DATA - extension_data.name = extension_name - super().__init__(extension_data) - - -def add_cse(): - # Install and remove CSE - cse = CustomScriptExtension(extension_name="testCSE") - - ext_props = [ - cse.get_ext_props(settings={'commandToExecute': f"echo \'Hello World! 
{uuid.uuid4()} \'"}), - cse.get_ext_props(settings={'commandToExecute': "echo \'Hello again\'"}) - ] - - cse.run(ext_props=ext_props) \ No newline at end of file diff --git a/dcr/scenario_utils/extensions/GATestExtGoExtension.py b/dcr/scenario_utils/extensions/GATestExtGoExtension.py deleted file mode 100644 index 39eb144fd..000000000 --- a/dcr/scenario_utils/extensions/GATestExtGoExtension.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import List - -from azure.mgmt.compute.models import VirtualMachineExtension - -from dcr.scenario_utils.extensions.BaseExtensionTestClass import BaseExtensionTestClass -from dcr.scenario_utils.models import ExtensionMetaData - - -class GATestExtGoExtension(BaseExtensionTestClass): - def __init__(self, extension_name: str): - extension_data = ExtensionMetaData( - publisher='Microsoft.Azure.Extensions.Edp', - ext_type='GATestExtGo', - version="1.0", - ext_name=extension_name - ) - super().__init__(extension_data) - - def run(self, ext_props: List[VirtualMachineExtension], remove: bool = True, continue_on_error: bool = False): - for ext_prop in ext_props: - if 'name' not in ext_prop.settings: - # GATestExtGo expects name to always be there, making sure we send it always - ext_prop.settings['name'] = "Enabling GA Test Extension" - - super().run(ext_props, remove, continue_on_error) - diff --git a/dcr/scenario_utils/extensions/RunCommandExtension.py b/dcr/scenario_utils/extensions/RunCommandExtension.py deleted file mode 100644 index 0a059bd39..000000000 --- a/dcr/scenario_utils/extensions/RunCommandExtension.py +++ /dev/null @@ -1,27 +0,0 @@ -import uuid - -from dcr.scenario_utils.extensions.BaseExtensionTestClass import BaseExtensionTestClass -from dcr.scenario_utils.models import ExtensionMetaData - - -class RunCommandExtension(BaseExtensionTestClass): - def __init__(self, extension_name: str): - extension_data = ExtensionMetaData( - publisher='Microsoft.CPlat.Core', - ext_type='RunCommandLinux', - version="1.0", - ext_name=extension_name - ) - super().__init__(extension_data) - - -def add_rc(): - # Install and remove RC - rc = RunCommandExtension(extension_name="testRC") - - ext_props = [ - rc.get_ext_props(settings={'commandToExecute': f"echo \'Hello World! 
{uuid.uuid4()} \'"}), - rc.get_ext_props(settings={'commandToExecute': "echo \'Hello again\'"}) - ] - - rc.run(ext_props=ext_props) diff --git a/dcr/scenario_utils/extensions/VMAccessExtension.py b/dcr/scenario_utils/extensions/VMAccessExtension.py deleted file mode 100644 index c84ae1205..000000000 --- a/dcr/scenario_utils/extensions/VMAccessExtension.py +++ /dev/null @@ -1,38 +0,0 @@ -import asyncio -import os - -from dcr.scenario_utils.common_utils import random_alphanum, execute_commands_concurrently_on_test_vms -from dcr.scenario_utils.crypto import generate_ssh_key_pair -from dcr.scenario_utils.extensions.BaseExtensionTestClass import BaseExtensionTestClass -from dcr.scenario_utils.models import ExtensionMetaData - - -class VMAccessExtension(BaseExtensionTestClass): - META_DATA = ExtensionMetaData( - publisher='Microsoft.OSTCExtensions', - ext_type='VMAccessForLinux', - version="1.5" - ) - - def __init__(self, extension_name: str): - extension_data = self.META_DATA - extension_data.name = extension_name - super().__init__(extension_data) - self.public_key, self.private_key_file = generate_ssh_key_pair('dcr_py') - self.user_name = f'dcr{random_alphanum(length=8)}' - - def verify(self): - os.chmod(self.private_key_file, 0o600) - ssh_cmd = f'ssh -o StrictHostKeyChecking=no -i {self.private_key_file} {self.user_name}@{{ip}} ' \ - f'"echo script was executed successfully on remote vm"' - print(asyncio.run(execute_commands_concurrently_on_test_vms([ssh_cmd]))) - - -def add_and_verify_vmaccess(): - vmaccess = VMAccessExtension(extension_name="testVmAccessExt") - ext_props = [ - vmaccess.get_ext_props(protected_settings={'username': vmaccess.user_name, 'ssh_key': vmaccess.public_key, - 'reset_ssh': 'false'}) - ] - vmaccess.run(ext_props=ext_props) - vmaccess.verify() diff --git a/dcr/scenario_utils/logging_utils.py b/dcr/scenario_utils/logging_utils.py deleted file mode 100644 index 462f6a957..000000000 --- a/dcr/scenario_utils/logging_utils.py +++ /dev/null @@ -1,33 +0,0 @@ -# Create a base class -import logging - - -def get_logger(name): - return LoggingHandler(name).log - - -class LoggingHandler: - """ - Base class for Logging - """ - def __init__(self, name=None): - self.log = self.__setup_and_get_logger(name) - - def __setup_and_get_logger(self, name): - logger = logging.getLogger(name if name is not None else self.__class__.__name__) - if logger.hasHandlers(): - # Logging module inherits from base loggers if already setup, if a base logger found, reuse that - return logger - - # No handlers found for logger, set it up - # This logging format is easier to read on the DevOps UI - - # https://docs.microsoft.com/en-us/azure/devops/pipelines/scripts/logging-commands?view=azure-devops&tabs=bash#formatting-commands - log_formatter = logging.Formatter("##[%(levelname)s] [%(asctime)s] [%(module)s] {%(pathname)s:%(lineno)d} %(message)s", - datefmt="%Y-%m-%dT%H:%M:%S%z") - console_handler = logging.StreamHandler() - console_handler.setFormatter(log_formatter) - logger.addHandler(console_handler) - logger.setLevel(logging.INFO) - - return logger - diff --git a/dcr/scenario_utils/models.py b/dcr/scenario_utils/models.py deleted file mode 100644 index 806c830c1..000000000 --- a/dcr/scenario_utils/models.py +++ /dev/null @@ -1,137 +0,0 @@ -import os -from enum import Enum, auto -from typing import List - -from dotenv import load_dotenv - - -class VMModelType(Enum): - VM = auto() - VMSS = auto() - - -class ExtensionMetaData: - def __init__(self, publisher: str, ext_type: str, version: str, ext_name: 
str = ""): - self.__publisher = publisher - self.__ext_type = ext_type - self.__version = version - self.__ext_name = ext_name - - @property - def publisher(self) -> str: - return self.__publisher - - @property - def ext_type(self) -> str: - return self.__ext_type - - @property - def version(self) -> str: - return self.__version - - @property - def name(self): - return self.__ext_name - - @name.setter - def name(self, ext_name): - self.__ext_name = ext_name - - @property - def handler_name(self): - return f"{self.publisher}.{self.ext_type}" - - -class VMMetaData: - - def __init__(self, vm_name: str, rg_name: str, sub_id: str, location: str, admin_username: str, - ips: List[str] = None): - self.__vm_name = vm_name - self.__rg_name = rg_name - self.__sub_id = sub_id - self.__location = location - self.__admin_username = admin_username - - vm_ips, vmss_ips = _get_ips(admin_username) - # By default assume the test is running on a VM - self.__type = VMModelType.VM - self.__ips = vm_ips - if any(vmss_ips): - self.__type = VMModelType.VMSS - self.__ips = vmss_ips - - if ips is not None: - self.__ips = ips - - print(f"IPs: {self.__ips}") - - @property - def name(self) -> str: - return self.__vm_name - - @property - def rg_name(self) -> str: - return self.__rg_name - - @property - def location(self) -> str: - return self.__location - - @property - def sub_id(self) -> str: - return self.__sub_id - - @property - def admin_username(self): - return self.__admin_username - - @property - def ips(self) -> List[str]: - return self.__ips - - @property - def model_type(self): - return self.__type - - -def _get_ips(username) -> (list, list): - """ - Try fetching Ips from the files that we create via az-cli. - We do a best effort to fetch this from both orchestrator or the test VM. Its located in different locations on both - scenarios. - Returns: Tuple of (VmIps, VMSSIps). 
- """ - - vms, vmss = [], [] - orchestrator_path = os.path.join(os.environ['BUILD_SOURCESDIRECTORY'], "dcr") - test_vm_path = os.path.join("/home", username, "dcr") - - for ip_path in [orchestrator_path, test_vm_path]: - - vm_ip_path = os.path.join(ip_path, ".vm_ips") - if os.path.exists(vm_ip_path): - with open(vm_ip_path, 'r') as vm_ips: - vms.extend(ip.strip() for ip in vm_ips.readlines()) - - vmss_ip_path = os.path.join(ip_path, ".vmss_ips") - if os.path.exists(vmss_ip_path): - with open(vmss_ip_path, 'r') as vmss_ips: - vmss.extend(ip.strip() for ip in vmss_ips.readlines()) - - return vms, vmss - - -def get_vm_data_from_env() -> VMMetaData: - if get_vm_data_from_env.__instance is None: - load_dotenv() - get_vm_data_from_env.__instance = VMMetaData(vm_name=os.environ["VMNAME"], - rg_name=os.environ['RGNAME'], - sub_id=os.environ["SUBID"], - location=os.environ['LOCATION'], - admin_username=os.environ['ADMINUSERNAME']) - - return get_vm_data_from_env.__instance - - -get_vm_data_from_env.__instance = None - diff --git a/dcr/scenario_utils/test_orchestrator.py b/dcr/scenario_utils/test_orchestrator.py deleted file mode 100644 index 014531164..000000000 --- a/dcr/scenario_utils/test_orchestrator.py +++ /dev/null @@ -1,97 +0,0 @@ -import os -import time -import traceback -from typing import List - -from dotenv import load_dotenv -from junitparser import TestCase, Skipped, Failure, TestSuite, JUnitXml - -from dcr.scenario_utils.logging_utils import LoggingHandler -from dcr.scenario_utils.models import get_vm_data_from_env - - -class TestFuncObj: - def __init__(self, test_name, test_func, raise_on_error=False, retry=1): - self.name = test_name - self.func = test_func - self.raise_on_error = raise_on_error - self.retry = retry - - -class TestOrchestrator(LoggingHandler): - def __init__(self, name: str, tests: List[TestFuncObj]): - super().__init__() - self.name = name - self.__tests: List[TestFuncObj] = tests - self.__test_suite = TestSuite(name) - - def run_tests(self): - load_dotenv() - skip_due_to = None - for test in self.__tests: - tc = TestCase(test.name, classname=os.environ['SCENARIONAME']) - if skip_due_to is not None: - tc.result = [Skipped(message=f"Skipped due to failing test: {skip_due_to}")] - else: - attempt = 1 - while attempt <= test.retry: - print(f"##[group][{test.name}] - Attempts ({attempt}/{test.retry})") - tc = self.run_test_and_get_tc(test.name, test.func) - if isinstance(tc.result, Failure): - attempt += 1 - if attempt > test.retry and test.raise_on_error: - self.log.warning(f"Breaking test case failed: {test.name}; Skipping remaining tests") - skip_due_to = test.name - else: - self.log.warning(f"(Attempt {attempt-1}/Total {test.retry}) Test {test.name} failed") - if attempt <= test.retry: - self.log.warning("retrying in 10 secs") - time.sleep(10) - print("##[endgroup]") - else: - print("##[endgroup]") - break - self.__test_suite.add_testcase(tc) - - def __generate_report(self, test_file_path): - xml_junit = JUnitXml() - xml_junit.add_testsuite(self.__test_suite) - xml_junit.write(filepath=test_file_path, pretty=True) - - def generate_report_on_orchestrator(self, file_name: str): - """ - Use this function to generate Junit XML report on the orchestrator. 
- The report is dropped in `$(Build.ArtifactStagingDirectory)/harvest` directory - """ - assert file_name.startswith("test-result"), "File name is invalid, it should start with test-result*" - self.__generate_report(os.path.join(os.environ['BUILD_ARTIFACTSTAGINGDIRECTORY'], file_name)) - - def generate_report_on_vm(self, file_name): - """ - Use this function to generate Junit XML report on the Test VM. - The report is dropped in `/home/$(adminUsername)/` directory - """ - assert file_name.startswith("test-result"), "File name is invalid, it should start with test-result*" - admin_username = get_vm_data_from_env().admin_username - self.__generate_report(os.path.join("/home", admin_username, file_name)) - - @property - def failed(self) -> bool: - return (self.__test_suite.failures + self.__test_suite.errors) > 0 - - def run_test_and_get_tc(self, test_name, test_func) -> TestCase: - tc = TestCase(test_name, classname=os.environ['SCENARIONAME']) - start_time = time.time() - self.log.info("Execute Test: {0}".format(test_name)) - try: - stdout = test_func() - self.log.debug("[{0}] Debug Output: {1}".format(test_name, stdout)) - except Exception as err: - self.log.exception("Error: {1}".format(test_name, err)) - stdout = str(err) - tc.result = [Failure(f"Failure: {err}", type_=f"Stack: {traceback.format_exc()}")] - - tc.system_out = stdout - tc.time = (time.time() - start_time) - return tc - diff --git a/dcr/scenarios/__init__.py b/dcr/scenarios/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/dcr/scenarios/agent-bvt/__init__.py b/dcr/scenarios/agent-bvt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/dcr/scenarios/agent-bvt/check_extension_timing.py b/dcr/scenarios/agent-bvt/check_extension_timing.py deleted file mode 100644 index 97c6b61af..000000000 --- a/dcr/scenarios/agent-bvt/check_extension_timing.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import print_function - -import re - -from dcr.scenario_utils.agent_log_parser import parse_agent_log_file, GOAL_STATE_COMPLETED - -extension_name_pattern = r'\[(\S*)\]' - -# 2018/05/22 21:23:32.888949 INFO [Microsoft.EnterpriseCloud.Monitoring.OmsAgentForLinux-1.6.42.0] Target handler state: enabled -handle_extensions_starting_pattern = r'Target handler state:\s(\S*)' - -extension_cycle = {0: '', 1: ''} - -cycle_completed = False - - -def __update_cycle(pos, name, when, info): - global extension_cycle - global cycle_completed - - extension_cycle[pos] = '@trace {0} {1} [{2}]'.format(when, name, info) - - for i in range(pos+1, 2): - extension_cycle[i] = '' - - if all(i != '' for i in extension_cycle.values()): - for key in extension_cycle.keys(): - print(extension_cycle[key]) - extension_cycle = {} - cycle_completed = True - - -def verify_extension_timing(): - for agent_log_line in parse_agent_log_file(): - match = re.match(handle_extensions_starting_pattern, agent_log_line.message) - if match: - op = match.groups()[0] - match = re.match(extension_name_pattern, agent_log_line.who) - ext_name = match.groups()[0] if match else "invalid.extension.name.syntax" - trans_op = "add/update" if op == "enabled" else "remove" - info = "{0}: {1}".format(ext_name, trans_op) - __update_cycle(0, 'handle_extension_started', agent_log_line.when, info) - continue - - match = re.match(GOAL_STATE_COMPLETED, agent_log_line.message) - if match: - duration = match.group('duration') - __update_cycle(1, 'handle_extension_duration', agent_log_line.when, duration) - - if not cycle_completed: - raise Exception('full 
cycle not completed') - - return "Extension cycle complete" - - diff --git a/dcr/scenarios/agent-bvt/check_firewall.py b/dcr/scenarios/agent-bvt/check_firewall.py deleted file mode 100644 index df7261a7f..000000000 --- a/dcr/scenarios/agent-bvt/check_firewall.py +++ /dev/null @@ -1,64 +0,0 @@ -import os -import pwd -import re -import subprocess -import sys - -if sys.version_info[0] == 3: - import http.client as httpclient -elif sys.version_info[0] == 2: - import httplib as httpclient - -WIRESERVER_ENDPOINT_FILE = '/var/lib/waagent/WireServerEndpoint' -VERSIONS_PATH = '/?comp=versions' - -AGENT_CONFIG_FILE = '/etc/waagent.conf' -OS_ENABLE_FIREWALL_RX = r'OS.EnableFirewall\s*=\s*(\S+)' - - -def __is_firewall_enabled(): - with open(AGENT_CONFIG_FILE, 'r') as config_fh: - for line in config_fh.readlines(): - if not line.startswith('#'): - update_match = re.match(OS_ENABLE_FIREWALL_RX, line, re.IGNORECASE) - if update_match: - return update_match.groups()[0].lower() == 'y' - - # The firewall is disabled by default. - return False - - -def run(*args): - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - rc = p.wait() - if rc != 0: - return False, None - else: - o = list(map(lambda s: s.decode('utf-8').strip(), p.stdout.read())) - return True, o - - -def check_firewall(username): - if not __is_firewall_enabled(): - return "The firewall is not enabled, skipping checks" - - with open(WIRESERVER_ENDPOINT_FILE, 'r') as f: - wireserver_ip = f.read() - - uid = pwd.getpwnam(username)[2] - os.seteuid(uid) - - client = httpclient.HTTPConnection(wireserver_ip, timeout=1) - - try: - client.request('GET', VERSIONS_PATH) - success = True - except Exception as err: - print(err) - success = False - - if success: - raise Exception("Error -- user could connect to wireserver") - - return "Success -- user access to wireserver is blocked" - diff --git a/dcr/scenarios/agent-bvt/run.host.py b/dcr/scenarios/agent-bvt/run.host.py deleted file mode 100644 index c5ce45883..000000000 --- a/dcr/scenarios/agent-bvt/run.host.py +++ /dev/null @@ -1,23 +0,0 @@ -from dcr.scenario_utils.common_utils import execute_py_script_over_ssh_on_test_vms -from dcr.scenario_utils.extensions.CustomScriptExtension import add_cse -from dcr.scenario_utils.extensions.VMAccessExtension import add_and_verify_vmaccess -from dcr.scenario_utils.test_orchestrator import TestOrchestrator, TestFuncObj - -if __name__ == '__main__': - # Execute run1.py first - execute_py_script_over_ssh_on_test_vms(command="dcr/scenarios/agent-bvt/run1.py") - - # Add extensions from the Host - tests = [ - TestFuncObj("Add Cse", add_cse, raise_on_error=True), - TestFuncObj("Add VMAccess", add_and_verify_vmaccess, raise_on_error=True) - ] - - test_orchestrator = TestOrchestrator("AgentBVT-Host", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_orchestrator("test-results-bvt-host.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" - - # Execute run2.py finally - execute_py_script_over_ssh_on_test_vms(command="dcr/scenarios/agent-bvt/run2.py") - diff --git a/dcr/scenarios/agent-bvt/run1.py b/dcr/scenarios/agent-bvt/run1.py deleted file mode 100644 index 7c17b0ce9..000000000 --- a/dcr/scenarios/agent-bvt/run1.py +++ /dev/null @@ -1,16 +0,0 @@ -from dcr.scenario_utils.test_orchestrator import TestFuncObj, TestOrchestrator -from test_agent_basics import test_agent_version, check_hostname, check_ns_lookup, check_root_login - -if __name__ == '__main__': - tests = [ - 
TestFuncObj("check_agent_version", test_agent_version), - TestFuncObj("Check hostname", check_hostname), - TestFuncObj("Check NSLookup", check_ns_lookup), - TestFuncObj("Check Root Login", check_root_login) - ] - - test_orchestrator = TestOrchestrator("AgentBVTs-VM", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_vm("test-result-bvt-run1.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" - diff --git a/dcr/scenarios/agent-bvt/run2.py b/dcr/scenarios/agent-bvt/run2.py deleted file mode 100644 index a563b62c3..000000000 --- a/dcr/scenarios/agent-bvt/run2.py +++ /dev/null @@ -1,21 +0,0 @@ -from check_extension_timing import verify_extension_timing -from check_firewall import check_firewall -from dcr.scenario_utils.check_waagent_log import check_waagent_log_for_errors -from dcr.scenario_utils.models import get_vm_data_from_env -from dcr.scenario_utils.test_orchestrator import TestFuncObj, TestOrchestrator -from test_agent_basics import check_agent_processes, check_sudoers - -if __name__ == '__main__': - admin_username = get_vm_data_from_env().admin_username - tests = [ - TestFuncObj("check agent processes", check_agent_processes), - TestFuncObj("check agent log", check_waagent_log_for_errors), - TestFuncObj("verify extension timing", verify_extension_timing), - TestFuncObj("Check Firewall", lambda: check_firewall(admin_username)), - TestFuncObj("Check Sudoers", lambda: check_sudoers(admin_username)) - ] - - test_orchestrator = TestOrchestrator("AgentBVTs-VM", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_vm("test-result-bvt-run2.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" diff --git a/dcr/scenarios/agent-bvt/test_agent_basics.py b/dcr/scenarios/agent-bvt/test_agent_basics.py deleted file mode 100644 index b8c9483c8..000000000 --- a/dcr/scenarios/agent-bvt/test_agent_basics.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -import re -import socket - -from dotenv import load_dotenv - -from dcr.scenario_utils.common_utils import execute_command_and_raise_on_error -from dcr.scenario_utils.models import get_vm_data_from_env - - -def test_agent_version(): - stdout, _ = execute_command_and_raise_on_error(['waagent', '-version'], timeout=30) - - # release_file contains: - # AGENT_VERSION = 'x.y.z' - load_dotenv() - expected_version = os.environ.get("AGENTVERSION") - - if "Goal state agent: {0}".format(expected_version) not in stdout: - raise Exception("expected version {0} not found".format(expected_version)) - - return stdout - - -def check_hostname(): - vm_name = get_vm_data_from_env().name - stdout, _ = execute_command_and_raise_on_error(['hostname'], timeout=30) - - if vm_name.lower() != stdout.lower(): - raise Exception("Hostname does not match! 
Expected: {0}, found: {1}".format(vm_name, stdout.strip())) - - return stdout - - def check_ns_lookup(): - hostname, _ = execute_command_and_raise_on_error(['hostname'], timeout=30) - - ip = socket.gethostbyname(hostname) - msg = "Resolved IP: {0}".format(ip) - print(msg) - - return msg - - def check_root_login(): - stdout, _ = execute_command_and_raise_on_error(['cat', '/etc/shadow'], timeout=30) - root_passwd_line = next(line for line in stdout.splitlines() if 'root' in line) - print(root_passwd_line) - root_passwd = root_passwd_line.split(":")[1] - - if any(val in root_passwd for val in ("!", "*", "x")): - return 'root login disabled' - else: - raise Exception('root login appears to be enabled: {0}'.format(root_passwd)) - - def check_agent_processes(): - daemon_pattern = r'.*python.*waagent -daemon$' - handler_pattern = r'.*python.*-run-exthandlers' - status_pattern = r'^(\S+)\s+' - - std_out, _ = execute_command_and_raise_on_error(['ps', 'axo', 'stat,args'], timeout=30) - - daemon = False - ext_handler = False - agent_processes = [line for line in std_out.splitlines() if 'python' in line] - - for process in agent_processes: - if re.match(daemon_pattern, process): - daemon = True - elif re.match(handler_pattern, process): - ext_handler = True - else: - continue - - status = re.match(status_pattern, process).groups(1)[0] - if not(status.startswith('S') or status.startswith('R')): - raise Exception('process is not running: {0}'.format(process)) - - if not daemon: - raise Exception('daemon process not found:\n\n{0}'.format(std_out)) - if not ext_handler: - raise Exception('extension handler process not found:\n\n{0}'.format(std_out)) - - return 'expected processes found running' - - def check_sudoers(user): - found = False - root = '/etc/sudoers.d/' - - for f in os.listdir(root): - sudoers = os.path.join(root, f) - with open(sudoers) as fh: - for entry in fh.readlines(): - if entry.startswith(user) and 'ALL=(ALL)' in entry: - print('entry found: {0}'.format(entry)) - found = True - - if not found: - raise Exception('user {0} not found'.format(user)) - - return "Found user {0} in list of sudoers".format(user) diff --git a/dcr/scenarios/agent-persist-firewall/access_wire_ip.sh b/dcr/scenarios/agent-persist-firewall/access_wire_ip.sh deleted file mode 100644 index 40d99ee9f..000000000 --- a/dcr/scenarios/agent-persist-firewall/access_wire_ip.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - - # Helper file which tries to access Wireserver on system reboot. Also prints out iptables rules if non-root and still - # able to access Wireserver - - # Args: 0 1 - # Usage ./access_wire_ip.sh <path-to-iptables> - - USER=$(whoami) - echo "$(date --utc +%FT%T.%3NZ): Running as user: $USER" - - function check_online - { - ping 8.8.8.8 -c 1 -i .2 -t 30 > /dev/null 2>&1 && echo 0 || echo 1 - } - - # Check more, sleep less - MAX_CHECKS=10 - # Initial starting value for checks - CHECKS=0 - IS_ONLINE=$(check_online) - - # Loop while we're not online. - while [ "$IS_ONLINE" -eq 1 ]; do - - CHECKS=$((CHECKS + 1)) - if [ $CHECKS -gt $MAX_CHECKS ]; then - break - fi - - echo "$(date --utc +%FT%T.%3NZ): Network still not accessible" - # We're offline. Sleep for a bit, then check again - sleep 1; - IS_ONLINE=$(check_online) - - done - - if [ "$IS_ONLINE" -eq 1 ]; then - # We will never be able to get online. Kill script.
- echo "Unable to connect to network, exiting now" - echo "ExitCode: 1" - exit 1 -fi - -echo "Finally online, Time: $(date --utc +%FT%T.%3NZ)" -echo "Trying to contact Wireserver as $USER to see if accessible" - -echo "" -echo "IPTables before accessing Wireserver" -sudo "$1" -t security -L -nxv -echo "" - -file_name="/var/tmp/wire-versions-root.xml" -if [[ "$USER" != "root" ]]; then - file_name="/var/tmp/wire-versions-non-root.xml" -fi - -WIRE_IP=$(cat /var/lib/waagent/WireServerEndpoint 2>/dev/null || echo '168.63.129.16' | tr -d '[:space:]') -wget --tries=3 "http://$WIRE_IP/?comp=versions" --timeout=5 -O "$file_name" -WIRE_EC=$? -echo "ExitCode: $WIRE_EC" - -if [[ "$USER" != "root" && "$WIRE_EC" == 0 ]]; then - echo "Wireserver should not be accessible for non-root user ($USER), IPTable rules -" - sudo "$1" -t security -L -nxv -fi \ No newline at end of file diff --git a/dcr/scenarios/agent-persist-firewall/persist_firewall_helpers.py b/dcr/scenarios/agent-persist-firewall/persist_firewall_helpers.py deleted file mode 100644 index ff7df4a44..000000000 --- a/dcr/scenarios/agent-persist-firewall/persist_firewall_helpers.py +++ /dev/null @@ -1,294 +0,0 @@ -import os -import re -import shutil -import subprocess -import time -from datetime import datetime - -from dcr.scenario_utils.common_utils import execute_with_retry, read_file, get_current_agent_name - -__ROOT_CRON_LOG = "/var/tmp/reboot-cron-root.log" -__NON_ROOT_CRON_LOG = "/var/tmp/reboot-cron-non-root.log" -__NON_ROOT_WIRE_XML = "/var/tmp/wire-versions-non-root.xml" -__ROOT_WIRE_XML = "/var/tmp/wire-versions-root.xml" - -SVG_DIR = os.path.join("/var", "log", "svgs") - - -def get_wire_ip(): - wireserver_endpoint_file = '/var/lib/waagent/WireServerEndpoint' - try: - with open(wireserver_endpoint_file, 'r') as f: - wireserver_ip = f.read() - except Exception as e: - print("unable to read wireserver ip: {0}".format(e)) - wireserver_ip = '168.63.129.16' - print("In the meantime -- Using the well-known WireServer address.") - - return wireserver_ip - - -def get_iptables_rules(): - pipe = subprocess.Popen(["iptables", "-t", "security", "-L", "-nxv"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = pipe.communicate() - exit_code = pipe.returncode - - return exit_code, stdout.strip().decode(), stderr.strip().decode() - - -def __move_file_with_date_suffix(file_name): - # Copy it over to /var/log/ for future debugging - try: - shutil.move(src=file_name, dst=os.path.join("/var", "log", - "{0}.{1}".format(os.path.basename(file_name), - datetime.utcnow().isoformat()))) - except: - pass - - -def __read_and_get_wire_versions_file(wire_version_file): - print("\nCheck Output of wire-versions file") - if not os.path.exists(wire_version_file): - print("\tFile: {0} not found".format(wire_version_file)) - return None - - lines = None - if os.stat(wire_version_file).st_size > 0: - print("\n\t{0} not empty".format(wire_version_file)) - with open(wire_version_file) as f: - lines = f.readlines() - else: - print("\n\t{0} is empty".format(wire_version_file)) - - return lines - - -def __verify_data_in_cron_logs(cron_log, verify, err_msg): - print("\nVerify Cron logs - ") - - def op(): - cron_logs_lines = read_file(cron_log) - if not cron_logs_lines: - raise Exception("Empty cron file, looks like cronjob didnt run") - - if not any("ExitCode" in line for line in cron_logs_lines): - raise Exception("Cron logs still incomplete, will try again in a minute") - - if not any(verify(line) for line in cron_logs_lines): - raise 
Exception("Verification failed! (UNEXPECTED): {0}".format(err_msg)) - - print("Verification succeeded. Cron logs as expected") - - execute_with_retry(op, sleep=60, max_retry=7) - - -def verify_wire_ip_reachable_for_root(): - # For root logs - - # Ensure the /var/log/wire-versions-root.xml is not-empty (generated by the cron job) - # Ensure the exit code in the /var/log/reboot-cron-root.log file is 0 - print("\nVerifying WireIP is reachable from root user - ") - - def check_exit_code(line): - pattern = "ExitCode:\\s(\\d+)" - return re.match(pattern, line) is not None and int(re.match(pattern, line).groups()[0]) == 0 - - __verify_data_in_cron_logs(cron_log=__ROOT_CRON_LOG, verify=check_exit_code, - err_msg="Exit Code should be 0 for root based cron job!") - - if __read_and_get_wire_versions_file(__ROOT_WIRE_XML) is None: - raise Exception("Wire version file should not be empty for root!") - - -def verify_wire_ip_unreachable_for_non_root(): - # For non-root - - # Ensure the /var/log/wire-versions-non-root.xml is empty (generated by the cron job) - # Ensure the exit code in the /var/log/reboot-cron-non-root.log file is non-0 - print("\nVerifying WireIP is unreachable from non-root users - ") - - def check_exit_code(line): - match = re.match("ExitCode:\\s(\\d+)", line) - return match is not None and int(match.groups()[0]) != 0 - - __verify_data_in_cron_logs(cron_log=__NON_ROOT_CRON_LOG, verify=check_exit_code, - err_msg="Exit Code should be non-0 for non-root cron job!") - - if __read_and_get_wire_versions_file(__NON_ROOT_WIRE_XML) is not None: - raise Exception("Wire version file should be empty for non-root!") - - -def verify_wire_ip_in_iptables(max_retry=5): - expected_wire_ip = get_wire_ip() - stdout, stderr = "", "" - expected_regexes = [ - r"DROP.*{0}\s+ctstate\sINVALID,NEW.*".format(expected_wire_ip), - r"ACCEPT.*{0}\s+owner UID match 0.*".format(expected_wire_ip) - ] - retry = 0 - found = False - while retry < max_retry and not found: - ec, stdout, stderr = get_iptables_rules() - if not all(re.search(regex, stdout, re.MULTILINE) is not None for regex in expected_regexes): - # Some distros take some time for iptables to setup, sleeping a bit to give it enough time - time.sleep(30) - retry += 1 - continue - found = True - - print("\nIPTABLES RULES:\n\tSTDOUT: {0}".format(stdout)) - if stderr: - print("\tSTDERR: {0}".format(stderr)) - - if not found: - raise Exception("IPTables NOT set properly - WireIP not found in IPTables") - else: - print("IPTables set properly") - - -def verify_system_rebooted(): - - # This is primarily a fail safe mechanism to ensure tests don't run if the VM didnt reboot properly - signal_file = "/var/log/reboot_time.txt" - if not os.path.exists(signal_file): - print("Signal file not found, checking uptime") - __execute_and_print_cmd(["uptime", "-s"]) - raise Exception("Signal file {0} not found! 
Reboot didnt work as expected!".format(signal_file)) - - try: - with open(signal_file) as sig: - reboot_time_str = sig.read().strip() - - reboot_time = datetime.strptime(reboot_time_str, "%Y-%m-%dT%H:%M:%S.%fZ") - now = datetime.utcnow() - print("\nCron file Reboot time: {0}; Current Time: {1}\n".format(reboot_time_str, now.isoformat())) - if now <= reboot_time: - raise Exception( - "The reboot time {0} is somehow greater than current time {1}".format(reboot_time_str, now.isoformat())) - finally: - # Finally delete file to keep state clean - os.rename(signal_file, "{0}-{1}".format(signal_file, datetime.utcnow().isoformat())) - - -def __execute_and_print_cmd(cmd): - pipe = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=False) - stdout, stderr = pipe.communicate() - exit_code = pipe.returncode - - print( - "\n\tCommand: {0}, ExitCode: {1}\n\tStdout: {2}\n\tStderr: {3}".format(' '.join(cmd), exit_code, stdout.strip(), - stderr.strip())) - return exit_code, stdout, stderr - - -def run_systemctl_command(service_name, command="is-enabled"): - cmd = ["systemctl", command, service_name] - return __execute_and_print_cmd(cmd) - - -def get_firewalld_rules(): - cmd = ["firewall-cmd", "--permanent", "--direct", "--get-all-passthroughs"] - return __execute_and_print_cmd(cmd) - - -def get_firewalld_running_state(): - cmd = ["firewall-cmd", "--state"] - return __execute_and_print_cmd(cmd) - - -def get_logs_from_journalctl(unit_name): - cmd = ["journalctl", "-u", unit_name, "-b", "-o", "short-precise"] - return __execute_and_print_cmd(cmd) - - -def generate_svg(svg_name): - # This is a good to have, but not must have. Not failing tests if we're unable to generate a SVG - print("Running systemd-analyze plot command to get the svg for boot execution order") - dest_dir = SVG_DIR - if not os.path.exists(dest_dir): - os.makedirs(dest_dir) - retry = 0 - ec = 1 - while ec > 0 and retry < 3: - cmd = "systemd-analyze plot > {0}".format(os.path.join(dest_dir, svg_name)) - print("\tCommand for Svg: {0}".format(cmd)) - pipe = subprocess.Popen(cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = pipe.communicate() - ec = pipe.returncode - if stdout or stderr: - print("\n\tSTDOUT: {0}\n\tSTDERR: {1}".format(stdout.strip(), stderr.strip())) - - if ec > 0: - retry += 1 - print("Failed with exit-code: {0}, retrying again in 60secs. Retry Attempt: {1}".format(ec, retry)) - time.sleep(60) - - -def firewalld_service_enabled(): - try: - exit_code, _, __ = get_firewalld_running_state() - return exit_code == 0 - except Exception as error: - print("\nFirewall service not running: {0}".format(error)) - - return False - - -def print_stateful_debug_data(): - """ - This function is used to print all debug data that we can capture to debug the scenario (which might not be - available on the log file). 
It would print the following if available (else just print the error) - - - The agent.service status - - Agent-network-setup.service status - - Agent-network-setup.service logs - - Firewall rules set using firewalld.service - - Output of Cron-logs for the current boot - - The state of iptables currently - """ - print("\n\n\nAll possible stateful Debug data (capturing before reboot) : ") - - agent_name = get_current_agent_name() - # - The agent.service status - run_systemctl_command("{0}.service".format(agent_name), "status") - - if firewalld_service_enabled(): - # - Firewall rules set using firewalld.service - get_firewalld_rules() - - # - Firewalld.service status - run_systemctl_command("firewalld.service", "status") - - else: - # - Agent-network-setup.service status - run_systemctl_command("{0}-network-setup.service".format(agent_name), "status") - - # - Agent-network-setup.service logs - # Sometimes the service status does not return logs, calling journalctl explicitly for fetching service logs - get_logs_from_journalctl(unit_name="{0}-network-setup.service".format(agent_name)) - - # - Print both Cron-logs contents (root and non-root) and if file is empty or not for Wire-version file - def _print_log_data(log_file): - try: - log_lines = read_file(log_file) - print("\nLogs for {0}: \n".format(log_file)) - for line in log_lines: - print("\t{0}".format(line)) - except Exception as error: - print("\nUnable to print logs for: {0}; Error: {1}".format(log_file, error)) - - for test_file in [__NON_ROOT_CRON_LOG, __NON_ROOT_WIRE_XML, __ROOT_CRON_LOG, __ROOT_WIRE_XML]: - # Move files over to the /var/log/ directory for bookkeeping - _print_log_data(test_file) - __move_file_with_date_suffix(test_file) - - # - The state of iptables currently - ec, stdout, stderr = get_iptables_rules() - print("\nIPTABLES RULES:\n\tSTDOUT: {0}".format(stdout)) - if stderr: - print("\tSTDERR: {0}".format(stderr)) diff --git a/dcr/scenarios/agent-persist-firewall/run.host.py b/dcr/scenarios/agent-persist-firewall/run.host.py deleted file mode 100644 index 2b2ef9073..000000000 --- a/dcr/scenarios/agent-persist-firewall/run.host.py +++ /dev/null @@ -1,41 +0,0 @@ -import asyncio -import os -import time - -from dcr.scenario_utils.azure_models import ComputeManager -from dcr.scenario_utils.common_utils import execute_py_script_over_ssh_on_test_vms, \ - execute_commands_concurrently_on_test_vms - -from persist_firewall_helpers import SVG_DIR - - -def get_svg_files(): - harvest_dir = os.path.join(os.environ['BUILD_ARTIFACTSTAGINGDIRECTORY'], "harvest") - scp_cmd = f"scp -o StrictHostKeyChecking=no {{username}}@{{ip}}:{SVG_DIR} {harvest_dir}" - asyncio.run(execute_commands_concurrently_on_test_vms([scp_cmd])) - - -if __name__ == '__main__': - # Execute run1.py first - try: - execute_py_script_over_ssh_on_test_vms(command="dcr/scenarios/agent-persist-firewall/run1.py") - - compute_manager = ComputeManager().compute_manager - # Restart VM and wait for it to come back up - compute_manager.restart() - - # Execute suite 2 - # Since the VM just restarted, wait for 10 secs before executing the script - time.sleep(10) - execute_py_script_over_ssh_on_test_vms(command="dcr/scenarios/agent-persist-firewall/run2.py") - - compute_manager.restart() - - # Execute suite 3 - # Since the VM just restarted, wait for 10 secs before executing the script - time.sleep(10) - execute_py_script_over_ssh_on_test_vms(command="dcr/scenarios/agent-persist-firewall/run3.py") - finally: - # Always try to fetch svg files off of the VM - get_svg_files() - 
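Aside: the run.host.py scripts above drive everything through execute_commands_concurrently_on_test_vms, which this diff deletes from common_utils.py. Stripped of the retry and log-grouping logic, the asyncio fan-out it implements reduces to the sketch below; the loopback addresses and the echo command are placeholders for illustration, not part of the deleted code.

import asyncio

async def run_on_host(command_template: str, ip: str) -> int:
    # Fill the target IP into the template, then run the command in a shell,
    # capturing stdout/stderr the way _execute_commands_on_vm_async did.
    cmd = command_template.format(ip=ip)
    proc = await asyncio.create_subprocess_shell(
        cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
    stdout, _ = await proc.communicate()
    print("[{0}] rc={1} stdout={2!r}".format(ip, proc.returncode, stdout.decode().strip()))
    return proc.returncode

async def run_on_all_hosts(command_template: str, ips, timeout_minutes=5):
    # One task per VM; gather() propagates the first failure, and wait_for()
    # bounds the whole fan-out, mirroring execute_commands_concurrently_on_test_vms.
    tasks = [asyncio.create_task(run_on_host(command_template, ip)) for ip in ips]
    return await asyncio.wait_for(asyncio.gather(*tasks), timeout=timeout_minutes * 60)

if __name__ == '__main__':
    # Placeholder hosts; the real helper substituted an SSH command template here.
    asyncio.run(run_on_all_hosts("echo hello from {ip}", ["127.0.0.1", "127.0.0.2"]))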
diff --git a/dcr/scenarios/agent-persist-firewall/run1.py b/dcr/scenarios/agent-persist-firewall/run1.py deleted file mode 100644 index cac9b43aa..000000000 --- a/dcr/scenarios/agent-persist-firewall/run1.py +++ /dev/null @@ -1,13 +0,0 @@ -from dcr.scenario_utils.test_orchestrator import TestFuncObj, TestOrchestrator -from persist_firewall_helpers import verify_wire_ip_in_iptables - -if __name__ == '__main__': - tests = [ - TestFuncObj("Verify_Wire_IP_IPTables", verify_wire_ip_in_iptables) - ] - - test_orchestrator = TestOrchestrator("PersistFirewall-VM1", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_vm("test-result-pf-run1.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" - diff --git a/dcr/scenarios/agent-persist-firewall/run2.py b/dcr/scenarios/agent-persist-firewall/run2.py deleted file mode 100644 index 42a420f76..000000000 --- a/dcr/scenarios/agent-persist-firewall/run2.py +++ /dev/null @@ -1,58 +0,0 @@ -from dcr.scenario_utils.common_utils import get_current_agent_name -from dcr.scenario_utils.test_orchestrator import TestFuncObj, TestOrchestrator -from persist_firewall_helpers import verify_wire_ip_in_iptables, verify_system_rebooted, generate_svg, \ - verify_wire_ip_unreachable_for_non_root, verify_wire_ip_reachable_for_root, run_systemctl_command, \ - firewalld_service_enabled, print_stateful_debug_data - - -def check_external_service_status(): - agent_name = get_current_agent_name() - # Check if firewall active on the Vm - if firewalld_service_enabled(): - # If yes, then print its status - ec, _, __ = run_systemctl_command("firewalld.service", command="status") - if ec != 0: - raise Exception("Something wrong with firewalld.service!") - - # Else print status of our custom service - else: - service_name = "{0}-network-setup.service".format(agent_name) - - # Check if enabled, if not then raise Error - ec, stdout, stderr = run_systemctl_command(service_name, command="is-enabled") - if ec != 0: - raise Exception("Service should be enabled!") - - # Check if failed, if so then raise Error - ec, stdout, stderr = run_systemctl_command(service_name, command="is-failed") - if ec == 0: - raise Exception("The service should not be in a failed state!") - - # Finally print the status of the service - run_systemctl_command(service_name, command="status") - - print("\nDisable Guest Agent service for more verbose testing") - ec, _, __ = run_systemctl_command(service_name="{0}.service".format(agent_name), command="disable") - if ec != 0: - raise Exception("Agent not disabled properly!") - - -if __name__ == '__main__': - tests = [ - TestFuncObj("Verify system rebooted", verify_system_rebooted, raise_on_error=True), - TestFuncObj("Generate SVG", lambda: generate_svg(svg_name="agent_running.svg")), - TestFuncObj("Verify wireIP unreachable for non-root", verify_wire_ip_unreachable_for_non_root), - TestFuncObj("Verify wireIP reachable for root", verify_wire_ip_reachable_for_root), - TestFuncObj("Verify_Wire_IP_IPTables", lambda: verify_wire_ip_in_iptables(max_retry=1)), - TestFuncObj("Verify External services", check_external_service_status) - ] - - test_orchestrator = TestOrchestrator("PersistFirewall-VM2", tests=tests) - test_orchestrator.run_tests() - - # Print stateful debug data before reboot because the state might be lost after - print_stateful_debug_data() - - test_orchestrator.generate_report_on_vm("test-result-pf-run2.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" - 
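Aside: run2.py and run3.py both lean on verify_wire_ip_in_iptables, deleted above in persist_firewall_helpers.py. Its core is two regexes matched against the output of iptables -t security -L -nxv: a DROP rule for INVALID/NEW connections to the wire server and an ACCEPT rule for root (UID 0). A minimal standalone illustration follows; the iptables dump is made up for the example.

import re

# Well-known wire server address that the deleted get_wire_ip() falls back to.
WIRE_IP = "168.63.129.16"

# Same expectations as verify_wire_ip_in_iptables above.
expected_regexes = [
    r"DROP.*{0}\s+ctstate\sINVALID,NEW.*".format(WIRE_IP),
    r"ACCEPT.*{0}\s+owner UID match 0.*".format(WIRE_IP),
]

# Fabricated `iptables -t security -L -nxv` output, trimmed to the two relevant rules.
sample_dump = (
    "Chain OUTPUT (policy ACCEPT 0 packets, 0 bytes)\n"
    "    0    0 ACCEPT  tcp -- * * 0.0.0.0/0  168.63.129.16  owner UID match 0\n"
    "    0    0 DROP    all -- * * 0.0.0.0/0  168.63.129.16  ctstate INVALID,NEW\n"
)

# The deleted helper passed re.MULTILINE; '.' still stops at newlines,
# so each pattern must match within a single rule line of the dump.
assert all(re.search(rx, sample_dump, re.MULTILINE) for rx in expected_regexes)
print("both wire server rules present")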
diff --git a/dcr/scenarios/agent-persist-firewall/run3.py b/dcr/scenarios/agent-persist-firewall/run3.py deleted file mode 100644 index 6a7963ebc..000000000 --- a/dcr/scenarios/agent-persist-firewall/run3.py +++ /dev/null @@ -1,36 +0,0 @@ -from dcr.scenario_utils.check_waagent_log import check_waagent_log_for_errors -from dcr.scenario_utils.common_utils import get_current_agent_name -from dcr.scenario_utils.test_orchestrator import TestFuncObj, TestOrchestrator -from persist_firewall_helpers import verify_wire_ip_in_iptables, run_systemctl_command, verify_system_rebooted, \ - generate_svg, verify_wire_ip_unreachable_for_non_root, verify_wire_ip_reachable_for_root - - -def ensure_agent_not_running(): - print("Verifying agent not running") - agent_service_name = "{0}.service".format(get_current_agent_name()) - ec, _, __ = run_systemctl_command(agent_service_name, "is-enabled") - if ec == 0: - raise Exception("{0} is enabled!".format(agent_service_name)) - - ec, _, __ = run_systemctl_command(agent_service_name, "is-active") - if ec == 0: - raise Exception("{0} should not be active!".format(agent_service_name)) - - -if __name__ == '__main__': - tests = [ - TestFuncObj("Verify system rebooted", verify_system_rebooted, raise_on_error=True), - TestFuncObj("Ensure agent not running", ensure_agent_not_running), - TestFuncObj("Generate SVG", lambda: generate_svg(svg_name="agent_not_running.svg")), - TestFuncObj("Verify wire IP unreachable for non-root", verify_wire_ip_unreachable_for_non_root), - TestFuncObj("Verify wire IP reachable for root", verify_wire_ip_reachable_for_root), - # Considering the rules should be set on reboot, not adding a retry check - TestFuncObj("Verify wire IP in IPTables", lambda: verify_wire_ip_in_iptables(max_retry=1)), - TestFuncObj("Check agent log", check_waagent_log_for_errors) - ] - - test_orchestrator = TestOrchestrator("PersistFirewall-VM3", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_vm("test-result-pf-run3.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" - diff --git a/dcr/scenarios/agent-persist-firewall/setup.sh b/dcr/scenarios/agent-persist-firewall/setup.sh deleted file mode 100644 index 73ef46419..000000000 --- a/dcr/scenarios/agent-persist-firewall/setup.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -# 1 2 3 -# Usage: -set -euxo pipefail - -d=$(which date) -ipt=$(which iptables) -username="dcr" -script_dir=$(dirname "$0") -cp "$script_dir/access_wire_ip.sh" "/usr/bin/" -chmod 700 "/usr/bin/access_wire_ip.sh" -mkdir -p /home/$username || echo "this is only needed for Suse VMs for running cron jobs as non-root" -# Setup Cron jobs -echo "@reboot ($d --utc +\\%FT\\%T.\\%3NZ && /usr/bin/access_wire_ip.sh $ipt) > /var/tmp/reboot-cron-root.log 2>&1" | crontab -u root - -echo "@reboot ($d --utc +\\%FT\\%T.\\%3NZ && /usr/bin/access_wire_ip.sh $ipt) > /var/tmp/reboot-cron-non-root.log 2>&1" | crontab -u $username - -(crontab -l 2>/dev/null; echo "@reboot ($d --utc +\%FT\%T.\%3NZ) > /var/log/reboot_time.txt 2>&1") | crontab -u root - -s=$(which systemctl) -(crontab -l 2>/dev/null; echo "@reboot ($s status walinuxagent-network-setup.service || $s status waagent-network-setup.service) > /var/log/reboot_network_setup.txt 2>&1)") | crontab -u root - - -# Enable Firewall for all distros -sed -i 's/OS.EnableFirewall=n/OS.EnableFirewall=y/g' /etc/waagent.conf - -# Restart agent to pick up the new conf -systemctl restart waagent || systemctl restart walinuxagent - -# Ensure that the 
setup file exists -file="wa*-network-setup.service" -[ "$(ls /usr/lib/systemd/system/$file /lib/systemd/system/$file 2>/dev/null | wc -w)" -gt 0 ] && echo "agent-network-setup file exists" || echo "agent-network-setup file does not exists" \ No newline at end of file diff --git a/dcr/scenarios/ext-seq-multiple-dependencies/config.json b/dcr/scenarios/ext-seq-multiple-dependencies/config.json deleted file mode 100644 index ad89d1397..000000000 --- a/dcr/scenarios/ext-seq-multiple-dependencies/config.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "location": "West Central US" -} diff --git a/dcr/scenarios/ext-seq-multiple-dependencies/ext_seq.py b/dcr/scenarios/ext-seq-multiple-dependencies/ext_seq.py deleted file mode 100644 index 61c648afc..000000000 --- a/dcr/scenarios/ext-seq-multiple-dependencies/ext_seq.py +++ /dev/null @@ -1,171 +0,0 @@ -import re -import uuid -from datetime import datetime -from time import sleep - -from azure.mgmt.resource.resources.models import DeploymentMode, DeploymentProperties, Deployment -from msrestazure.azure_exceptions import CloudError - -from dcr.scenario_utils.azure_models import ComputeManager -from dcr.scenario_utils.logging_utils import LoggingHandler -from dcr.scenario_utils.models import get_vm_data_from_env - - -class ExtensionSequencingTestClass(LoggingHandler): - - # This is the base ARM template that's used for deploying extensions for this scenario. These templates build on - # top of each other. i.e., 01_test runs first, then 02_test builds on top of it and so on and so forth. - extension_template = { - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json", - "contentVersion": "1.0.0.0", - "resources": [ - { - "type": "Microsoft.Compute/virtualMachineScaleSets", - "name": "", - "location": "[resourceGroup().location]", - "apiVersion": "2018-06-01", - "properties": { - "virtualMachineProfile": { - "extensionProfile": { - "extensions": [] - } - } - } - } - ] - } - - def __init__(self): - super().__init__() - self.__vm_data = get_vm_data_from_env() - self.__compute_manager = ComputeManager().compute_manager - - # Update the VMSS name - ExtensionSequencingTestClass.extension_template['resources'][0]['name'] = self.__vm_data.name - - def deploy_extensions(self, ext_json): - self.log.info(f"Deploying extension template: {ext_json}") - - retry = 0 - max_retry = 5 - while retry < max_retry: - try: - props = DeploymentProperties(template=ext_json, - mode=DeploymentMode.incremental) - poller = self.__compute_manager.resource_client.deployments.begin_create_or_update( - self.__vm_data.rg_name, 'TestDeployment', Deployment(properties=props)) - # Wait a max of 10 mins - poller.wait(timeout=10 * 60) - if poller.done(): - break - else: - raise TimeoutError("Extension deployment timed out after 10 mins") - except CloudError as ce: - self.log.warning(f"Cloud Error: {ce}", exc_info=True) - retry += 1 - err_msg = str(ce) - if "'code': 'Conflict'" in err_msg and retry < max_retry: - self.log.warning( - "({0}/{1}) Conflict Error when deploying extension in VMSS, trying again in 1 sec (Error: {2})".format( - retry, max_retry, ce)) - # Since this was a conflicting operation, sleeping for a second before retrying - sleep(1) - else: - raise - - self.log.info("Successfully deployed extensions") - - @staticmethod - def get_dependency_map(ext_json) -> dict: - dependency_map = dict() - - vmss = ext_json['resources'][0] - extensions = vmss['properties']['virtualMachineProfile']['extensionProfile']['extensions'] - - for ext in extensions: - 
ext_name = ext['name'] - provisioned_after = ext['properties'].get('provisionAfterExtensions') - dependency_map[ext_name] = provisioned_after - - return dependency_map - - @staticmethod - def __get_time(ext, test_guid): - if ext.statuses[0].time is not None: - # This is populated if `configurationAppliedTime` is provided in the status file of extension - return ext.statuses[0].time - - if ext.statuses[0].message is not None: - # In our tests, for CSE and RunCommand, we would execute this command to get the time when it was enabled - - # echo 'GUID: $(date +%Y-%m-%dT%H:%M:%S.%3NZ)' - match = re.search(r"{0}: ([\d-]+T[\d:.]+Z)".format(test_guid), ext.statuses[0].message) - if match is not None: - return datetime.strptime(match.group(1), "%Y-%m-%dT%H:%M:%S.%fZ") - - # If nothing else works, just return the minimum datetime - return datetime.min - - def get_sorted_extension_names(self, test_guid): - # Retrieve the VMSS extension instances - vmss_vm_extensions = self.__compute_manager.get_vm_instance_view().extensions - - # Log the extension enabled datetime - for ext in vmss_vm_extensions: - ext.time = self.__get_time(ext, test_guid) - self.log.info("Extension {0} Status: {1}".format(ext.name, ext.statuses[0])) - - # sort the extensions based on their enabled datetime - sorted_extensions = sorted(vmss_vm_extensions, key=lambda ext_: ext_.time) - self.log.info("Sorted extension names with time: {0}".format( - ', '.join(["{0}: {1}".format(ext.name, ext.time) for ext in sorted_extensions]))) - return [ext.name for ext in sorted_extensions] - - def validate_extension_sequencing(self, dependency_map, sorted_extension_names): - installed_ext = dict() - - # Iterate through the extensions in the enabled order and validate if their depending - # extensions are already enabled prior to that. - for ext in sorted_extension_names: - # Check if the depending extension are already installed - if ext not in dependency_map: - # Some extensions might be installed by policy, continue in this case - self.log.info("Unwanted extension found in Instance view: {0}".format(ext)) - continue - if dependency_map[ext] is not None: - for dep in dependency_map[ext]: - if installed_ext.get(dep) is None: - # The depending extension is not installed prior to the current extension - raise Exception("{0} is not installed prior to {1}".format(dep, ext)) - - # Mark the current extension as installed - installed_ext[ext] = ext - - self.log.info("Validated extension sequencing") - - def run(self, extension_template): - - # Update the settings for each extension to make sure they're always unique to force CRP to generate a new - # sequence number each time - ext_json = ExtensionSequencingTestClass.extension_template.copy() - test_guid = str(uuid.uuid4()) - for ext in extension_template: - ext["properties"]["settings"].update({ - "commandToExecute": "echo \"{0}: $(date +%Y-%m-%dT%H:%M:%S.%3NZ)\"".format(test_guid) - }) - - # We update the extensions here, they are specific to the scenario that we want to test out (01_test, 02_test..) 
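Aside on the timestamp trick used by `__get_time` above: each extension's settings are stamped with a per-run GUID via `echo "<guid>: $(date +%Y-%m-%dT%H:%M:%S.%3NZ)"`, and the enable time is later recovered from the reported status message. A condensed, standalone restatement of that parsing step (the GUID and message below are illustrative):

```python
import re
from datetime import datetime


def parse_enable_time(status_message, test_guid):
    # Recover the timestamp that the stamped commandToExecute echoed into
    # the extension's status message.
    match = re.search(r"{0}: ([\d-]+T[\d:.]+Z)".format(test_guid), status_message)
    if match is not None:
        return datetime.strptime(match.group(1), "%Y-%m-%dT%H:%M:%S.%fZ")
    # Fall back to the minimum datetime, as __get_time does, so extensions
    # without a parsable timestamp sort first.
    return datetime.min


print(parse_enable_time("abc123: 2022-01-02T03:04:05.678Z", "abc123"))
```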
- ext_json['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][ - 'extensions'] = extension_template - - # Deploy VMSS extensions with sequence - self.deploy_extensions(ext_json) - - # Build the dependency map from the list of extensions in the extension profile - dependency_map = self.get_dependency_map(ext_json) - self.log.info("Dependency map: {0}".format(dependency_map)) - - # Get the extensions sorted based on their enabled datetime - sorted_extension_names = self.get_sorted_extension_names(test_guid) - self.log.info("Sorted extensions: {0}".format(sorted_extension_names)) - - self.validate_extension_sequencing(dependency_map, sorted_extension_names) diff --git a/dcr/scenarios/ext-seq-multiple-dependencies/ext_seq_tests.py b/dcr/scenarios/ext-seq-multiple-dependencies/ext_seq_tests.py deleted file mode 100644 index b1be968da..000000000 --- a/dcr/scenarios/ext-seq-multiple-dependencies/ext_seq_tests.py +++ /dev/null @@ -1,196 +0,0 @@ -def add_extensions_with_dependency_template(): - return [ - { - "name": "GATestExt", - "properties": { - "publisher": "Microsoft.Azure.Extensions.Edp", - "type": "GATestExtGo", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": { - "name": "Enabling GA Test Extension" - } - } - }, - { - "name": "RunCommand", - "properties": { - "provisionAfterExtensions": ["GATestExt"], - "publisher": "Microsoft.CPlat.Core", - "type": "RunCommandLinux", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": {} - } - }, - { - "name": "CSE", - "properties": { - "provisionAfterExtensions": ["RunCommand", "GATestExt"], - "publisher": "Microsoft.Azure.Extensions", - "type": "CustomScript", - "typeHandlerVersion": "2.1", - "autoUpgradeMinorVersion": True, - "settings": {} - } - } - ] - - -def remove_dependent_extension_template(): - return [ - { - "name": "GATestExt", - "properties": { - "publisher": "Microsoft.Azure.Extensions.Edp", - "type": "GATestExtGo", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": { - "name": "Enabling GA Test Extension" - } - } - }, - { - "name": "CSE", - "properties": { - "provisionAfterExtensions": ["GATestExt"], - "publisher": "Microsoft.Azure.Extensions", - "type": "CustomScript", - "typeHandlerVersion": "2.1", - "autoUpgradeMinorVersion": True, - "settings": {} - } - } - ] - - -def remove_all_dependencies_template(): - return [ - { - "name": "GATestExt", - "properties": { - "publisher": "Microsoft.Azure.Extensions.Edp", - "type": "GATestExtGo", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": { - "name": "Enabling GA Test Extension" - } - } - }, - { - "name": "RunCommand", - "properties": { - "publisher": "Microsoft.CPlat.Core", - "type": "RunCommandLinux", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": {} - } - }, - { - "name": "CSE", - "properties": { - "publisher": "Microsoft.Azure.Extensions", - "type": "CustomScript", - "typeHandlerVersion": "2.1", - "autoUpgradeMinorVersion": True, - "settings": {} - } - } - ] - - -def add_more_dependencies_template(): - return [ - { - "name": "GATestExt", - "properties": { - "publisher": "Microsoft.Azure.Extensions.Edp", - "type": "GATestExtGo", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": { - "name": "Enabling GA Test Extension" - } - } - }, - { - "name": "RunCommand", - "properties": { - "publisher": "Microsoft.CPlat.Core", - "type": "RunCommandLinux", - "typeHandlerVersion": "1.0", - 
"autoUpgradeMinorVersion": True, - "settings": {} - } - }, - { - "name": "CSE", - "properties": { - "provisionAfterExtensions": ["RunCommand", "GATestExt"], - "publisher": "Microsoft.Azure.Extensions", - "type": "CustomScript", - "typeHandlerVersion": "2.1", - "autoUpgradeMinorVersion": True, - "settings": {} - } - } - ] - - -def single_dependencies_template(): - return [ - { - "name": "GATestExt", - "properties": { - "publisher": "Microsoft.Azure.Extensions.Edp", - "type": "GATestExtGo", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": { - "name": "Enabling GA Test Extension" - } - } - }, - { - "name": "RunCommand", - "properties": { - "provisionAfterExtensions": ["CSE"], - "publisher": "Microsoft.CPlat.Core", - "type": "RunCommandLinux", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": {} - } - }, - { - "name": "CSE", - "properties": { - "provisionAfterExtensions": ["GATestExt"], - "publisher": "Microsoft.Azure.Extensions", - "type": "CustomScript", - "typeHandlerVersion": "2.1", - "autoUpgradeMinorVersion": True, - "settings": {} - } - } - ] - - -def delete_extensions_template(): - return [ - { - "name": "GATestExt", - "properties": { - "publisher": "Microsoft.Azure.Extensions.Edp", - "type": "GATestExtGo", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": { - "name": "Enabling GA Test Extension" - } - } - } - ] diff --git a/dcr/scenarios/ext-seq-multiple-dependencies/run.host.py b/dcr/scenarios/ext-seq-multiple-dependencies/run.host.py deleted file mode 100644 index d75acb805..000000000 --- a/dcr/scenarios/ext-seq-multiple-dependencies/run.host.py +++ /dev/null @@ -1,27 +0,0 @@ -from dcr.scenario_utils.test_orchestrator import TestFuncObj, TestOrchestrator -from ext_seq import ExtensionSequencingTestClass -from ext_seq_tests import add_extensions_with_dependency_template, remove_dependent_extension_template, \ - remove_all_dependencies_template, add_more_dependencies_template, single_dependencies_template, \ - delete_extensions_template - - -def main(): - ext_seq = ExtensionSequencingTestClass() - - tests = [ - TestFuncObj("Add Extensions with dependencies", lambda: ext_seq.run(add_extensions_with_dependency_template()), raise_on_error=True), - TestFuncObj("Remove dependent extension", lambda: ext_seq.run(remove_dependent_extension_template())), - TestFuncObj("Remove all dependencies", lambda: ext_seq.run(remove_all_dependencies_template())), - TestFuncObj("Add more dependencies", lambda: ext_seq.run(add_more_dependencies_template())), - TestFuncObj("single dependencies", lambda: ext_seq.run(single_dependencies_template())), - TestFuncObj("Delete extensions", lambda: ext_seq.run(delete_extensions_template())) - ] - - test_orchestrator = TestOrchestrator("ExtSeqDependency-Host", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_orchestrator("test-results-ext-seq-host.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" - - -if __name__ == '__main__': - main() diff --git a/dcr/scenarios/ext-seq-multiple-dependencies/run.py b/dcr/scenarios/ext-seq-multiple-dependencies/run.py deleted file mode 100644 index bd28188ce..000000000 --- a/dcr/scenarios/ext-seq-multiple-dependencies/run.py +++ /dev/null @@ -1,15 +0,0 @@ -import socket - -from dcr.scenario_utils.check_waagent_log import check_waagent_log_for_errors -from dcr.scenario_utils.test_orchestrator import TestFuncObj, TestOrchestrator - - -if __name__ == '__main__': - tests = 
[ - TestFuncObj("check agent log", check_waagent_log_for_errors) - ] - - test_orchestrator = TestOrchestrator("ExtSeqDependency-VM", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_vm(f"test-result-ext-seq-vm-{socket.gethostname()}.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" diff --git a/dcr/scenarios/ext-seq-multiple-dependencies/template.json b/dcr/scenarios/ext-seq-multiple-dependencies/template.json deleted file mode 100644 index 358b264a3..000000000 --- a/dcr/scenarios/ext-seq-multiple-dependencies/template.json +++ /dev/null @@ -1,304 +0,0 @@ -{ - "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "adminUsername": { - "type": "string" - }, - "adminPasswordOrKey": { - "type": "string" - }, - "vmSize": { - "type": "string", - "defaultValue": "Standard_B2s" - }, - "vmName": { - "type": "string" - }, - "scenarioPrefix": { - "type": "string", - "defaultValue": "dcr" - }, - "imagePublisher": { - "type": "string", - "defaultValue": "Canonical" - }, - "imageOffer": { - "type": "string", - "defaultValue": "UbuntuServer" - }, - "imageVersion": { - "type": "string", - "defaultValue": "latest" - }, - "imageSku": { - "type": "string", - "defaultValue": "18.04-LTS" - }, - "location": { - "type": "string", - "defaultValue": "[resourceGroup().location]", - "metadata": { - "description": "Location for all resources." - } - }, - "dnsLabelPrefix": { - "type": "string", - "defaultValue": "[toLower(format('simplelinuxvm-{0}', uniqueString(resourceGroup().id)))]", - "metadata": { - "description": "Unique DNS Name for the Public IP used to access the Virtual Machine." - } - } - }, - "variables": { - "nicName": "[concat(parameters('scenarioPrefix'),'Nic')]", - "vnetAddressPrefix": "10.130.0.0/16", - "subnetName": "[concat(parameters('scenarioPrefix'),'Subnet')]", - "subnetPrefix": "10.130.0.0/24", - "publicIPAddressName": "[concat(parameters('scenarioPrefix'),'PublicIp')]", - "lbIpName": "[concat(parameters('scenarioPrefix'),'PublicLbIp')]", - "lbIpId": "[resourceId('Microsoft.Network/publicIPAddresses', variables('lbIpName'))]", - "virtualNetworkName": "[concat(parameters('scenarioPrefix'),'Vnet')]", - "lbName": "[concat(parameters('scenarioPrefix'),'lb')]", - "bepoolName": "[concat(variables('lbName'), 'bepool')]", - "natpoolName": "[concat(variables('lbName'), 'natpool')]", - "feIpConfigName": "[concat(variables('lbName'), 'fepool', 'IpConfig')]", - "sshProbeName": "[concat(variables('lbName'), 'probe')]", - "vnetID": "[resourceId('Microsoft.Network/virtualNetworks',variables('virtualNetworkName'))]", - "subnetRef": "[concat(variables('vnetID'),'/subnets/',variables('subnetName'))]", - "lbId": "[resourceId('Microsoft.Network/loadBalancers', variables('lbName'))]", - "bepoolID": "[concat(variables('lbId'), '/backendAddressPools/', variables('bepoolName'))]", - "natpoolID": "[concat(variables('lbId'), '/inboundNatPools/', variables('natpoolName'))]", - "feIpConfigId": "[concat(variables('lbId'), '/frontendIPConfigurations/', variables('feIpConfigName'))]", - "sshProbeId": "[concat(variables('lbId'), '/probes/', variables('sshProbeName'))]", - "sshKeyPath": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]", - "networkSecurityGroupName": "networkSecurityGroup1" - }, - "resources": [ - { - "apiVersion": "2017-06-01", - "type": "Microsoft.Network/networkSecurityGroups", - "name": 
"[variables('networkSecurityGroupName')]", - "location": "[parameters('location')]", - "properties": { - "securityRules": [ - { - "name": "SSH", - "properties": { - "description": "Locks inbound down to jenkins ip range.", - "protocol": "Tcp", - "sourcePortRange": "*", - "destinationPortRange": "22", - "sourceAddressPrefix": "*", - "destinationAddressPrefix": "*", - "access": "Allow", - "priority": 100, - "direction": "Inbound" - } - } - ] - } - }, - { - "apiVersion": "2016-12-01", - "type": "Microsoft.Network/virtualNetworks", - "name": "[variables('virtualNetworkName')]", - "location": "[parameters('location')]", - "dependsOn": [ - "[concat('Microsoft.Network/networkSecurityGroups/', variables('networkSecurityGroupName'))]" - ], - "properties": { - "addressSpace": { - "addressPrefixes": [ - "[variables('vnetAddressPrefix')]" - ] - }, - "subnets": [ - { - "name": "[variables('subnetName')]", - "properties": { - "addressPrefix": "[variables('subnetPrefix')]", - "networkSecurityGroup": { - "id": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]" - } - } - } - ] - } - }, - { - "type": "Microsoft.Network/publicIPAddresses", - "name": "[variables('lbIpName')]", - "location": "[parameters('location')]", - "apiVersion": "2017-04-01", - "properties": { - "publicIPAllocationMethod": "Dynamic", - "dnsSettings": { - "domainNameLabel": "[parameters('dnsLabelPrefix')]" - } - } - }, - { - "type": "Microsoft.Network/loadBalancers", - "name": "[variables('lbName')]", - "location": "[parameters('location')]", - "apiVersion": "2016-03-30", - "dependsOn": [ - "[concat('Microsoft.Network/virtualNetworks/', variables('virtualNetworkName'))]", - "[concat('Microsoft.Network/publicIPAddresses/', variables('lbIpName'))]" - ], - "properties": { - "frontendIPConfigurations": [ - { - "name": "[variables('feIpConfigName')]", - "properties": { - "PublicIpAddress": { - "id": "[variables('lbIpId')]" - } - } - } - ], - "backendAddressPools": [ - { - "name": "[variables('bepoolName')]" - } - ], - "inboundNatPools": [ - { - "name": "[variables('natpoolName')]", - "properties": { - "FrontendIPConfiguration": { - "Id": "[variables('feIpConfigId')]" - }, - "BackendPort": 22, - "Protocol": "tcp", - "FrontendPortRangeStart": 3500, - "FrontendPortRangeEnd": 4500 - } - } - ], - "loadBalancingRules": [ - { - "name": "ProbeRule", - "properties": { - "frontendIPConfiguration": { - "id": "[variables('feIpConfigId')]" - }, - "backendAddressPool": { - "id": "[variables('bepoolID')]" - }, - "protocol": "Tcp", - "frontendPort": 80, - "backendPort": 80, - "idleTimeoutInMinutes": 5, - "probe": { - "id": "[variables('sshProbeId')]" - } - } - } - ], - "probes": [ - { - "name": "[variables('sshProbeName')]", - "properties": { - "protocol": "tcp", - "port": 22, - "intervalInSeconds": 5, - "numberOfProbes": 2 - } - } - ] - } - }, - { - "apiVersion": "2018-06-01", - "type": "Microsoft.Compute/virtualMachineScaleSets", - "name": "[parameters('vmName')]", - "location": "[parameters('location')]", - "dependsOn": [ - "[concat('Microsoft.Network/virtualNetworks/', variables('virtualNetworkName'))]", - "[concat('Microsoft.Network/loadBalancers/', variables('lbName'))]" - ], - "sku": { - "name": "[parameters('vmSize')]", - "tier": "Standard", - "capacity": 3 - }, - "properties": { - "virtualMachineProfile": { - "extensionProfile": {}, - "osProfile": { - "computerNamePrefix": "[parameters('vmName')]", - "adminUsername": "[parameters('adminUsername')]", - "linuxConfiguration": { - 
"disablePasswordAuthentication": true, - "ssh": { - "publicKeys": [ - { - "path": "[variables('sshKeyPath')]", - "keyData": "[parameters('adminPasswordOrKey')]" - } - ] - } - } - }, - "storageProfile": { - "osDisk": { - "osType": "Linux", - "createOption": "FromImage" - }, - "imageReference": { - "publisher": "[parameters('imagePublisher')]", - "offer": "[parameters('imageOffer')]", - "sku": "[parameters('imageSku')]", - "version": "[parameters('imageVersion')]" - } - }, - "networkProfile": { - "healthProbe": { - "id": "[variables('sshProbeId')]" - }, - "networkInterfaceConfigurations": [ - { - "name": "[variables('nicName')]", - "properties": { - "primary": true, - "ipConfigurations": [ - { - "name": "ipconfig1", - "properties": { - "primary": true, - "publicIPAddressConfiguration": { - "name": "[variables('publicIPAddressName')]", - "properties": { - "idleTimeoutInMinutes": 15 - } - }, - "subnet": { - "id": "[variables('subnetRef')]" - }, - "loadBalancerBackendAddressPools": [ - { - "id": "[variables('bepoolID')]" - } - ], - "loadBalancerInboundNatPools": [ - { - "id": "[variables('natpoolID')]" - } - ] - } - } - ] - } - } - ] - } - }, - "upgradePolicy": { - "mode": "Automatic" - } - } - } - ] -} \ No newline at end of file diff --git a/dcr/scenarios/extension-telemetry-pipeline/etp_helpers.py b/dcr/scenarios/extension-telemetry-pipeline/etp_helpers.py deleted file mode 100644 index 9fd06587e..000000000 --- a/dcr/scenarios/extension-telemetry-pipeline/etp_helpers.py +++ /dev/null @@ -1,180 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import print_function - -import glob -import json -import os -import time -import uuid -from datetime import datetime, timedelta -from random import choice - - -def get_collect_telemetry_thread_name(): - return "TelemetryEventsCollector" - - -def wait_for_extension_events_dir_empty(timeout=timedelta(minutes=2)): - # By ensuring events dir to be empty, we verify that the telemetry events collector has completed its run - event_dirs = glob.glob(os.path.join("/var/log/azure/", "*", "events")) - start_time = datetime.now() - - assert event_dirs, "No extension event directories exist!" - - while (start_time + timeout) >= datetime.now(): - all_dir_empty = True - for event_dir in event_dirs: - if not os.path.exists(event_dir) or len(os.listdir(event_dir)) != 0: - print("Dir: {0} not empty".format(event_dir)) - all_dir_empty = False - break - - if all_dir_empty: - return - - time.sleep(5) - - raise AssertionError("Extension events dir not empty!") - - -def add_extension_events_and_get_count(bad_event_count=0, no_of_events_per_extension=50, extension_names=None): - print("Creating random extension events now. 
No of Good Events: {0}, No of Bad Events: {1}".format( - no_of_events_per_extension - bad_event_count, bad_event_count)) - - def missing_key(make_bad_event): - key = choice(list(make_bad_event.keys())) - del make_bad_event[key] - return "MissingKeyError: {0}".format(key) - - def oversize_error(make_bad_event): - make_bad_event["EventLevel"] = "ThisIsAnOversizeErrorOnSteroids\n" * 300 - return "OversizeEventError" - - def empty_message(make_bad_event): - make_bad_event["Message"] = "" - return "EmptyMessageError" - - def oversize_file_limit(make_bad_event): - make_bad_event["EventLevel"] = "MakeThisFileGreatAgain\n" * 30000 - return "OversizeEventFileSize" - - sample_ext_event = { - "EventLevel": "INFO", - "Message": "Starting IaaS ScriptHandler Extension v1", - "Version": "1.0", - "TaskName": "Extension Info", - "EventPid": "3228", - "EventTid": "1", - "OperationId": "519e4beb-018a-4bd9-8d8e-c5226cf7f56e", - "TimeStamp": "2019-12-12T01:20:05.0950244Z" - } - - sample_messages = [ - "Starting IaaS ScriptHandler Extension v1", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", - "The quick brown fox jumps over the lazy dog", - "Cursus risus at ultrices mi.", - "Doing Something", - "Iaculis eu non diam phasellus.", - "Doing other thing", - "Look ma, lemons", - "Pretium quam vulputate dignissim suspendisse.", - "Man this is insane", - "I wish it worked as it should and not as it ain't", - "Ut faucibus pulvinar elementum integer enim neque volutpat ac tincidunt." - "Did you get any of that?", - ] - - # Currently the GA cant send special chars in telemetry as the unicode changes were reverted. - # Once its enabled again, we would add these messages back to our tests. - # Should be enabled when this task is completed - https://msazure.visualstudio.com/One/_workitems/edit/8733946 - non_english_messages = [ - "Non-English message - 此文字不是英文的" - "κόσμε", - "�", - "Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Wolther spillede på xylofon.", - "Falsches Üben von Xylophonmusik quält jeden größeren Zwerg", - "Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich", - "Heizölrückstoßabdämpfung", - "Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο", - "Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία", - "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba a su querido cachorro.", - "Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à côté de l'alcôve ovoïde, où les bûches", - "se consument dans l'âtre, ce qui lui permet de penser à la cænogenèse de l'être dont il est question", - "dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui, pense-t-il, diminue çà et là la qualité de son œuvre.", - "D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh", - "Árvíztűrő tükörfúrógép", - "Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa", - "Sævör grét áðan því úlpan var ónýt", - "いろはにほへとちりぬるを わかよたれそつねならむ うゐのおくやまけふこえて あさきゆめみしゑひもせす", - "イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン", - "? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה" - "Pchnąć w tę łódź jeża lub ośm skrzyń fig", - "В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!", - "๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน", - "Pijamalı hasta, yağız şoföre çabucak güvendi." 
- ] - - last_err = -1 - error_map = { - 0: missing_key, - 1: oversize_error, - 2: empty_message - } - - ext_log_dir = "/var/log/azure/" - - total_counts = {} - - for ext_dir in os.listdir(ext_log_dir): - events_dir = os.path.join(ext_log_dir, ext_dir, "events") - # If specific extensions are provided, only add the events for them - if not os.path.isdir(events_dir) or (extension_names is not None and ext_dir not in extension_names): - continue - - new_opr_id = str(uuid.uuid4()) - event_list = [] - good_count = 0 - bad_count = 0 - - for _ in range(no_of_events_per_extension): - event = sample_ext_event.copy() - event["OperationId"] = new_opr_id - event["TimeStamp"] = datetime.utcnow().strftime(u'%Y-%m-%dT%H:%M:%S.%fZ') - event["Message"] = choice(sample_messages) - - if bad_count < bad_event_count: - # Make this event a bad event by cycling through the possible errors - last_err += 1 - reason = error_map[last_err % len(error_map)](event) - bad_count += 1 - - # Missing key error might delete the TaskName key from the event - if "TaskName" in event: - event["TaskName"] = "{0}. BTW a bad event: {1}".format(event["TaskName"], reason) - else: - event["EventLevel"] = "{0}. BTW a bad event: {1}".format(event["EventLevel"], reason) - else: - good_count += 1 - event_list.append(event) - - file_name = os.path.join(events_dir, '{0}.json'.format(int(time.time() * 1000000))) - with open("{0}.tmp".format(file_name), 'w+') as f: - json.dump(event_list, f) - - os.rename("{0}.tmp".format(file_name), file_name) - - counts = { - "good": good_count, - "bad": bad_count - } - - print("OperationId: {0}; Extension: {1}; Count: {2}".format(new_opr_id, ext_dir, counts)) - - if ext_dir in total_counts: - total_counts[ext_dir]['good'] += good_count - total_counts[ext_dir]['bad'] += bad_count - else: - total_counts[ext_dir] = counts - - return total_counts diff --git a/dcr/scenarios/extension-telemetry-pipeline/run.host.py b/dcr/scenarios/extension-telemetry-pipeline/run.host.py deleted file mode 100644 index e7f0d4800..000000000 --- a/dcr/scenarios/extension-telemetry-pipeline/run.host.py +++ /dev/null @@ -1,19 +0,0 @@ -from dcr.scenario_utils.extensions.CustomScriptExtension import add_cse -from dcr.scenario_utils.extensions.VMAccessExtension import add_and_verify_vmaccess -from dcr.scenario_utils.test_orchestrator import TestFuncObj, TestOrchestrator - - -def main(): - tests = [ - TestFuncObj("Add Cse", lambda: add_cse(), raise_on_error=True), - TestFuncObj("Add VMAccess", lambda: add_and_verify_vmaccess(), raise_on_error=True) - ] - - test_orchestrator = TestOrchestrator("ETPTests-Host", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_orchestrator("test-results-etp-host.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" - - -if __name__ == '__main__': - main() diff --git a/dcr/scenarios/extension-telemetry-pipeline/run.py b/dcr/scenarios/extension-telemetry-pipeline/run.py deleted file mode 100644 index 3bff11e11..000000000 --- a/dcr/scenarios/extension-telemetry-pipeline/run.py +++ /dev/null @@ -1,102 +0,0 @@ -import glob -import os -import random -import time - -from dcr.scenario_utils.agent_log_parser import parse_agent_log_file -from dcr.scenario_utils.check_waagent_log import is_data_in_waagent_log, check_waagent_log_for_errors -from dcr.scenario_utils.extensions.CustomScriptExtension import CustomScriptExtension -from dcr.scenario_utils.extensions.VMAccessExtension import VMAccessExtension -from dcr.scenario_utils.test_orchestrator 
import TestFuncObj -from dcr.scenario_utils.test_orchestrator import TestOrchestrator -from etp_helpers import add_extension_events_and_get_count, wait_for_extension_events_dir_empty, \ - get_collect_telemetry_thread_name - - -def add_good_extension_events_and_verify(extension_names): - max_events = random.randint(10, 50) - print("Creating a total of {0} events".format(max_events)) - ext_event_count = add_extension_events_and_get_count(no_of_events_per_extension=max_events, - extension_names=extension_names) - - # Ensure that the event collector ran after adding the events - wait_for_extension_events_dir_empty() - - # Sleep for a min to ensure that the TelemetryService has enough time to send events and report errors if any - time.sleep(60) - telemetry_event_collector_name = get_collect_telemetry_thread_name() - errors_reported = False - for agent_log_line in parse_agent_log_file(): - if agent_log_line.thread == telemetry_event_collector_name and agent_log_line.is_error: - if not errors_reported: - print( - f"waagent.log contains the following errors emitted by the {telemetry_event_collector_name} thread (none expected):") - errors_reported = True - print(agent_log_line.text.rstrip()) - - for ext_name in ext_event_count: - good_count = ext_event_count[ext_name]['good'] - is_data_in_waagent_log("Collected {0} events for extension: {1}".format(good_count, ext_name)) - - -def add_bad_events_and_verify_count(extension_names): - max_events = random.randint(15, 50) - print("Creating a total of {0} events".format(max_events)) - extension_event_count = add_extension_events_and_get_count(bad_event_count=random.randint(5, max_events - 5), - no_of_events_per_extension=max_events, - extension_names=extension_names) - - # Ensure that the event collector ran after adding the events - wait_for_extension_events_dir_empty() - - # Sleep for a min to ensure that the TelemetryService has enough time to send events and report errors if any - time.sleep(60) - - for ext_name in extension_event_count: - good_count = extension_event_count[ext_name]['good'] - is_data_in_waagent_log("Dropped events for Extension: {0}".format(ext_name)) - is_data_in_waagent_log("Collected {0} events for extension: {1}".format(good_count, ext_name)) - - -def verify_etp_enabled(): - # Assert from log if ETP is enabled - is_data_in_waagent_log('Extension Telemetry pipeline enabled: True') - - # Since ETP is enabled, events dir should have been created for all extensions - event_dirs = glob.glob(os.path.join("/var/log/azure/", "*", "events")) - assert event_dirs, "No extension event directories exist!" - - if not all(os.path.exists(event_dir) for event_dir in event_dirs): - raise AssertionError("Event directory not found for all extensions!") - - -def check_agent_log(): - # Since we're injecting bad events in the add_bad_events_and_verify_count() function test, - # we expect some warnings to be emitted by the agent. - # We're already verifying if these warnings are being emitted properly in the specified test, so ignoring those here. 
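The event files these tests inject are written by `add_extension_events_and_get_count` in `etp_helpers.py` above, which stages each batch through a `.tmp` file and renames it into place so the agent's telemetry collector never reads a partial file. A minimal sketch of that write pattern (directory and payload here are hypothetical):

```python
import json
import os
import time


def write_event_file(events_dir, event_list):
    # Stage through a .tmp file, then rename: rename() is atomic on the same
    # filesystem, so a reader never observes a half-written event file.
    file_name = os.path.join(events_dir, "{0}.json".format(int(time.time() * 1000000)))
    with open("{0}.tmp".format(file_name), "w") as f:
        json.dump(event_list, f)
    os.rename("{0}.tmp".format(file_name), file_name)


write_event_file("/tmp", [{"EventLevel": "INFO", "Message": "example event"}])
```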
- ignore = [ - { - 'message': r"Dropped events for Extension: Microsoft\.(OSTCExtensions.VMAccessForLinux|Azure.Extensions.CustomScript); Details:", - 'if': lambda log_line: log_line.level == "WARNING" and log_line.thread == get_collect_telemetry_thread_name() - } - ] - check_waagent_log_for_errors(ignore=ignore) - - -if __name__ == '__main__': - - extensions_to_verify = [CustomScriptExtension.META_DATA.handler_name, VMAccessExtension.META_DATA.handler_name] - tests = [ - TestFuncObj("Verify ETP enabled", verify_etp_enabled, raise_on_error=True, retry=3), - TestFuncObj("Add Good extension events and verify", - lambda: add_good_extension_events_and_verify(extensions_to_verify)), - TestFuncObj("Add Bad extension events and verify", - lambda: add_bad_events_and_verify_count(extensions_to_verify)), - TestFuncObj("Verify all events processed", wait_for_extension_events_dir_empty), - TestFuncObj("Check Agent log", check_agent_log), - ] - - test_orchestrator = TestOrchestrator("ETPTests-VM", tests=tests) - test_orchestrator.run_tests() - test_orchestrator.generate_report_on_vm("test-result-etp-vm.xml") - assert not test_orchestrator.failed, f"Test Suite: {test_orchestrator.name} failed" diff --git a/dcr/scenarios/extension-telemetry-pipeline/setup.sh b/dcr/scenarios/extension-telemetry-pipeline/setup.sh deleted file mode 100644 index f9aa67118..000000000 --- a/dcr/scenarios/extension-telemetry-pipeline/setup.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# 1 2 3 -# Usage: -set -euxo pipefail - -if systemctl status walinuxagent;then - agent="walinuxagent" -else - agent="waagent" -fi - -systemctl stop $agent -# Change ETP collection period for faster testing and turn on verbose -echo 'Debug.EtpCollectionPeriod=30' >> /etc/waagent.conf -sed -i 's/Logs.Verbose=n/Logs.Verbose=y/g' /etc/waagent.conf -# Moving the log to create a new fresh log for testing -mv /var/log/waagent.log /var/log/waagent.old.log -systemctl start $agent -systemctl status $agent diff --git a/dcr/scripts/__init__.py b/dcr/scripts/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/dcr/scripts/build_agent_zip.sh b/dcr/scripts/build_agent_zip.sh deleted file mode 100755 index a9747049b..000000000 --- a/dcr/scripts/build_agent_zip.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -# https://linuxcommand.org/lc3_man_pages/seth.html -# -e Exit immediately if a command exits with a non-zero status. -# -u Treat unset variables as an error when substituting. -# -x Print commands and their arguments as they are executed. 
-# -o pipefail the return value of a pipeline is the status of the last command to exit with a non-zero status, -# or zero if no command exited with a non-zero status -set -euxo pipefail - -version=$(grep '^AGENT_VERSION' "$BUILD_SOURCESDIRECTORY/azurelinuxagent/common/version.py" | sed "s/.*'\([^']\+\)'.*/\1/") -# Azure Pipelines adds an extra quote at the end of the variable if we enable bash debugging as it prints an extra line - https://developercommunity.visualstudio.com/t/pipeline-variable-incorrectly-inserts-single-quote/375679 -set +x; echo "##vso[task.setvariable variable=agentVersion]$version"; set -x -sudo ./makepkg.py -sudo cp ./eggs/WALinuxAgent-$version.zip "$BUILD_SOURCESDIRECTORY/dcr" -sudo cp -r ./eggs/WALinuxAgent-$version "$BUILD_SOURCESDIRECTORY/dcr" \ No newline at end of file diff --git a/dcr/scripts/get_pypy.sh b/dcr/scripts/get_pypy.sh deleted file mode 100755 index f61dbe89d..000000000 --- a/dcr/scripts/get_pypy.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash - -# https://linuxcommand.org/lc3_man_pages/seth.html -# -e Exit immediately if a command exits with a non-zero status. -# -u Treat unset variables as an error when substituting. -# -x Print commands and their arguments as they are executed. -# -o pipefail the return value of a pipeline is the status of the last command to exit with a non-zero status, -# or zero if no command exited with a non-zero status -set -euxo pipefail - -pushd "$BUILD_SOURCESDIRECTORY/dcr" -curl "https://downloads.python.org/pypy/pypy3.7-v7.3.5-linux64.tar.bz2" -o "pypy.tar.bz2" -mkdir "pypy" -tar xf "$BUILD_SOURCESDIRECTORY/dcr/pypy.tar.bz2" -C "pypy" -pypy_path=$(ls -d pypy/*/bin/pypy3) -rm -rf "pypy.tar.bz2" -popd - -# Azure Pipelines adds an extra quote at the end of the variable if we enable bash debugging as it prints an extra line - https://developercommunity.visualstudio.com/t/pipeline-variable-incorrectly-inserts-single-quote/375679 -set +x -echo "##vso[task.setvariable variable=pypyPath]/home/$ADMINUSERNAME/dcr/$pypy_path" \ No newline at end of file diff --git a/dcr/scripts/install_pip_packages.sh b/dcr/scripts/install_pip_packages.sh deleted file mode 100644 index 5fd563bbf..000000000 --- a/dcr/scripts/install_pip_packages.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -# 1 2 3 -# Usage: - -# https://linuxcommand.org/lc3_man_pages/seth.html -# -e Exit immediately if a command exits with a non-zero status. -# -u Treat unset variables as an error when substituting. -# -x Print commands and their arguments as they are executed. -# -o pipefail the return value of a pipeline is the status of the last command to exit with a non-zero status, -# or zero if no command exited with a non-zero status -set -euxo pipefail - -$PYPYPATH -m ensurepip -$PYPYPATH -m pip install -r "$1" diff --git a/dcr/scripts/move_scenario.sh b/dcr/scripts/move_scenario.sh deleted file mode 100755 index 0ec6c73b5..000000000 --- a/dcr/scripts/move_scenario.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# https://linuxcommand.org/lc3_man_pages/seth.html -# -e Exit immediately if a command exits with a non-zero status. -# -u Treat unset variables as an error when substituting. -# -x Print commands and their arguments as they are executed. 
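Both `build_agent_zip.sh` and `get_pypy.sh` above publish values back to the pipeline with the same `##vso[task.setvariable ...]` logging command that `set_environment.py` uses below: Azure DevOps watches stdout for this format and turns it into a variable for subsequent steps. A minimal Python illustration (the variable name and value are hypothetical):

```python
# Azure DevOps logging command: writing this exact format to stdout from any
# script step sets a pipeline variable for later steps in the same job.
add_variable_to_pipeline = "##vso[task.setvariable variable={name};]{value}"
print(add_variable_to_pipeline.format(name="agentVersion", value="9.9.9.9"))
```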
-# -o pipefail the return value of a pipeline is the status of the last command to exit with a non-zero status, -# or zero if no command exited with a non-zero status -set -euxo pipefail - -# Delete all scenarios except for the one we're running in this VM -shopt -s extglob -pushd "$BUILD_SOURCESDIRECTORY/dcr/scenarios" -rm -rf !("$SCENARIONAME") -popd - -# Move contents of the remaining scenario to a directory called scenario -# This is done to be able to import the yml easily as importing a yml template can only be static, it cant be dynamic -mkdir "$BUILD_SOURCESDIRECTORY/dcr/scenario" -cp -r "$BUILD_SOURCESDIRECTORY/dcr/scenarios/$SCENARIONAME"/* "$BUILD_SOURCESDIRECTORY/dcr/scenario/" diff --git a/dcr/scripts/orchestrator/__init__.py b/dcr/scripts/orchestrator/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/dcr/scripts/orchestrator/execute_ssh_on_vm.py b/dcr/scripts/orchestrator/execute_ssh_on_vm.py deleted file mode 100644 index f10f5e4df..000000000 --- a/dcr/scripts/orchestrator/execute_ssh_on_vm.py +++ /dev/null @@ -1,61 +0,0 @@ -import asyncio -import os -import sys -import time - -from enum import Enum - -from dcr.scenario_utils.common_utils import execute_commands_concurrently_on_test_vms -from dcr.scenario_utils.logging_utils import get_logger - -logger = get_logger("dcr.scripts.orchestrator.execute_ssh_on_vm") - - -class SetupCommands: - setup_vm = "setup_vm" - fetch_results = "fetch_results" - harvest = "harvest" - - -async def run_tasks(command: str): - ssh_cmd = f'ssh -o StrictHostKeyChecking=no {{username}}@{{ip}}' - sources_dir = os.environ.get('BUILD_SOURCESDIRECTORY') - artifact_dir = os.environ.get('BUILD_ARTIFACTSTAGINGDIRECTORY') - - if command == SetupCommands.setup_vm: - dcr_root_dir = f"/home/{{username}}/dcr" - pypy_path = os.environ.get("PYPYPATH") - agent_version = os.environ.get("AGENTVERSION") - - setup_commands = [ - f"scp -o StrictHostKeyChecking=no -r {sources_dir}/dcr/ {{username}}@{{ip}}:~/", - f'{ssh_cmd} "sudo PYPYPATH={pypy_path} bash {dcr_root_dir}/scripts/install_pip_packages.sh {dcr_root_dir}/requirements.txt"', - f'{ssh_cmd} "sudo bash {dcr_root_dir}/scripts/setup_agent.sh {agent_version}"' - ] - return await execute_commands_concurrently_on_test_vms(commands=setup_commands, timeout=15) - elif command == SetupCommands.fetch_results: - commands = [ - f"scp -o StrictHostKeyChecking=no {{username}}@{{ip}}:~/test-result*.xml {artifact_dir}" - ] - try: - # Try fetching test results in a best effort scenario, if unable to fetch, dont throw an error - return await execute_commands_concurrently_on_test_vms(commands=commands, timeout=15) - except Exception as err: - logger.warning(f"Unable to fetch test results; Error: {err}", exc_info=True) - elif command == SetupCommands.harvest: - commands = [ - f"bash {sources_dir}/dcr/scripts/test-vm/harvest.sh {{username}} {{ip}} {artifact_dir}/harvest" - ] - return await execute_commands_concurrently_on_test_vms(commands=commands, timeout=15) - else: - cmd = f'{ssh_cmd} "{command}"' - return await execute_commands_concurrently_on_test_vms(commands=[cmd], timeout=15) - - -if __name__ == '__main__': - start_time = time.time() - print(f"Start Time: {start_time}") - try: - print(asyncio.run(run_tasks(command=sys.argv[1]))) - finally: - print(f"End time: {time.time()}; Duration: {time.time() - start_time} secs") diff --git a/dcr/scripts/orchestrator/generate_test_files.py b/dcr/scripts/orchestrator/generate_test_files.py deleted file mode 100644 index 527f7d8b9..000000000 --- 
a/dcr/scripts/orchestrator/generate_test_files.py +++ /dev/null @@ -1,36 +0,0 @@ -import glob -import os -import shutil -import sys - -from junitparser import JUnitXml - -from dcr.scenario_utils.logging_utils import get_logger - -logger = get_logger("dcr.scripts.orchestrator.generate_test_files") - - -def merge_xml_files(test_file_pattern): - xml_data = JUnitXml() - staging_dir = os.environ['BUILD_ARTIFACTSTAGINGDIRECTORY'] - - for test_file in glob.glob(test_file_pattern): - xml_data += JUnitXml.fromfile(test_file) - # Move file to harvest dir to save state and not publish the same test twice - shutil.move(test_file, os.path.join(staging_dir, "harvest", os.path.basename(test_file))) - - if xml_data.tests > 0: - # Merge all files into a single file for cleaner output - output_file_name = f"test-results-{os.environ['SCENARIONAME']}-{os.environ['DISTRONAME']}.xml" - xml_data.write(os.path.join(staging_dir, output_file_name)) - else: - logger.info(f"No test files found for pattern: {test_file_pattern}") - - -if __name__ == "__main__": - try: - merge_xml_files(test_file_pattern=sys.argv[1]) - except Exception as err: - logger.exception( - f"Ran into error when trying to merge test cases. Ignoring the rest: {err}") - diff --git a/dcr/scripts/orchestrator/set_environment.py b/dcr/scripts/orchestrator/set_environment.py deleted file mode 100644 index ff511f168..000000000 --- a/dcr/scripts/orchestrator/set_environment.py +++ /dev/null @@ -1,64 +0,0 @@ -import json -import os.path - -from dcr.scenario_utils.logging_utils import get_logger - -logger = get_logger("dcr.script.orchestrator.set_environment") -add_variable_to_pipeline = '##vso[task.setvariable variable={name};]{value}' - - -def _check_if_file_in_scenario_and_set_variable(file_name: str, name: str, true_value: str, false_val: str = None): - """ - We have certain scenarios in the tests where we determine what type of test to run based on the availability of the file. - Check if file is present in the current scenario, and if so, set the variable name. - Syntax for setting the variable : https://docs.microsoft.com/en-us/azure/devops/pipelines/scripts/logging-commands?view=azure-devops&tabs=bash#setvariable-initialize-or-modify-the-value-of-a-variable - Eg: echo "##vso[task.setvariable variable=;]" - """ - file_path = os.path.join(scenario_path, file_name) - if os.path.exists(file_path): - logger.info(f"Found file: {file_path}, setting variable: {name}") - print(add_variable_to_pipeline.format(name=name, value=true_value)) - elif false_val is not None: - print(add_variable_to_pipeline.format(name=name, value=false_val)) - - -def _override_config(): - """ - This function reads the config.json file present in the scenario and makes all the variables available to the whole - job as environment variables. - It also overrides existing variables with the same name if available. - Note: This function expects config.json to be a flat JSON - """ - config_path = os.path.join(scenario_path, "config.json") - if not os.path.exists(config_path): - logger.info(f"Config file: {config_path} not available") - return - - with open(config_path, encoding="utf-8") as config_fh: - config_data = json.load(config_fh) - for key, val in config_data.items(): - print(add_variable_to_pipeline.format(name=key, value=val)) - - -if __name__ == '__main__': - """ - This script sets the environment for the current job. - It determines what files to run and what not. 
- Eg: If we're supposed to run run.host.py or run.py - """ - __dcr_dir = os.path.join(os.environ.get("BUILD_SOURCESDIRECTORY"), "dcr") - scenario_path = os.path.join(__dcr_dir, "scenario") - template_dir = os.path.join(__dcr_dir, "templates") - - _check_if_file_in_scenario_and_set_variable(file_name="run.py", name="runPy", true_value="true") - _check_if_file_in_scenario_and_set_variable(file_name="run.host.py", name="runHost", true_value="true") - _check_if_file_in_scenario_and_set_variable(file_name="setup.sh", name="runScenarioSetup", true_value="true") - _check_if_file_in_scenario_and_set_variable(file_name="template.json", name="templateFile", - true_value=os.path.join(scenario_path, "template.json"), - false_val=os.path.join(template_dir, "deploy-linux-vm.json")) - _check_if_file_in_scenario_and_set_variable(file_name="parameters.json", name="parametersFile", - true_value=os.path.join(scenario_path, "parameters.json"), - false_val=os.path.join(template_dir, "deploy-linux-vm-params.json")) - - # Check if config.json exists and add to environment - _override_config() diff --git a/dcr/scripts/setup_agent.sh b/dcr/scripts/setup_agent.sh deleted file mode 100644 index 1e5769472..000000000 --- a/dcr/scripts/setup_agent.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -# https://linuxcommand.org/lc3_man_pages/seth.html -# -e Exit immediately if a command exits with a non-zero status. -# -u Treat unset variables as an error when substituting. -# -x Print commands and their arguments as they are executed. -# -o pipefail the return value of a pipeline is the status of the last command to exit with a non-zero status, -# or zero if no command exited with a non-zero status -set -euxo pipefail - -# $1 $2 $3 $4 $5 $6 $7 -# Usage: AgentVersion - -# Copy agent zip file to /var/lib/waagent to force it to auto-update -[ -z "$1" ] && version="9.9.9.9" || version=$1 - -if systemctl status walinuxagent;then - agent="walinuxagent" -else - agent="waagent" -fi - -sudo systemctl stop $agent - -# We need to force the agent to AutoUpdate to enable our testing -sed -i 's/AutoUpdate.Enabled=n/AutoUpdate.Enabled=y/g' /etc/waagent.conf -# Move the older agent log file to ensure we have a clean slate when testing agent logs -mv /var/log/waagent.log /var/log/waagent.old.log - -sudo cp -r ./dcr/*-$version /var/lib/waagent -sudo systemctl daemon-reload && sudo systemctl start $agent - -sudo systemctl status $agent --no-pager -waagent --version \ No newline at end of file diff --git a/dcr/scripts/test-vm/harvest.sh b/dcr/scripts/test-vm/harvest.sh deleted file mode 100644 index d1715ee59..000000000 --- a/dcr/scripts/test-vm/harvest.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash -# 1 2 3 -# Usage: - -# https://linuxcommand.org/lc3_man_pages/seth.html -# -e Exit immediately if a command exits with a non-zero status. -# -u Treat unset variables as an error when substituting. -# -x Print commands and their arguments as they are executed. 
-# -o pipefail the return value of a pipeline is the status of the last command to exit with a non-zero status, -# or zero if no command exited with a non-zero status -set -euxo pipefail - -ssh -o "StrictHostKeyChecking no" "$1"@"$2" "sudo tar --exclude='journal/*' --exclude='omsbundle' --exclude='omsagent' --exclude='mdsd' --exclude='scx*' --exclude='*.so' --exclude='*__LinuxDiagnostic__*' --exclude='*.zip' --exclude='*.deb' --exclude='*.rpm' -czf logs-$2.tgz /var/log /var/lib/waagent/ /etc/waagent.conf" -# Some distros do not have "other" permissions (e.g., mariner1.0), so change the -# owning user so we can grab them below (during the scp command). -ssh -o "StrictHostKeyChecking no" "$1"@"$2" "sudo chown $1 logs-$2.tgz" - -# Create directory if doesn't exist -mkdir -p "$3" -scp -o "StrictHostKeyChecking no" "$1@$2:logs-$2.tgz" "$3/logs-$2.tgz" \ No newline at end of file diff --git a/dcr/templates/arm-delete.yml b/dcr/templates/arm-delete.yml deleted file mode 100644 index 047b34548..000000000 --- a/dcr/templates/arm-delete.yml +++ /dev/null @@ -1,33 +0,0 @@ -parameters: - - name: scenarios - type: object - - - name: distros - type: object - - - name: rgPrefix - type: string - -jobs: - - job: "DeleteRG" - dependsOn: "Wait" - condition: always() - strategy: - matrix: - ${{ each distro in parameters.distros }}: - ${{ each scenario in parameters.scenarios }}: - ${{ format('{0}-{1}', distro.name, scenario) }}: - scenarioName: ${{ scenario }} - distroName: ${{ distro.name }} - rgName: ${{ format('{0}-{1}-{2}', parameters.rgPrefix, scenario, distro.name) }} - maxParallel: 50 - - steps: - - task: AzureResourceManagerTemplateDeployment@3 - displayName: "Delete test RG" - inputs: - deploymentScope: 'Resource Group' - azureResourceManagerConnection: '$(azureConnection)' - subscriptionId: '$(subId)' - action: 'DeleteRG' - resourceGroupName: '$(rgName)' \ No newline at end of file diff --git a/dcr/templates/deploy-linux-vm-params.json b/dcr/templates/deploy-linux-vm-params.json deleted file mode 100644 index aa29215e3..000000000 --- a/dcr/templates/deploy-linux-vm-params.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "vmName": { - "value": "simpleLinuxVM" - } - } -} \ No newline at end of file diff --git a/dcr/templates/deploy-linux-vm.json b/dcr/templates/deploy-linux-vm.json deleted file mode 100644 index ecff1d62a..000000000 --- a/dcr/templates/deploy-linux-vm.json +++ /dev/null @@ -1,280 +0,0 @@ -{ - "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "metadata": { - "_generator": { - "name": "bicep", - "version": "0.4.1.14562", - "templateHash": "16607361201936431976" - } - }, - "parameters": { - "vmName": { - "type": "string", - "defaultValue": "simpleLinuxVM", - "metadata": { - "description": "The name of your Virtual Machine." - } - }, - "adminUsername": { - "type": "string", - "metadata": { - "description": "Username for the Virtual Machine." - } - }, - "authenticationType": { - "type": "string", - "defaultValue": "sshPublicKey", - "allowedValues": [ - "sshPublicKey", - "password" - ], - "metadata": { - "description": "Type of authentication to use on the Virtual Machine. SSH key is recommended." - } - }, - "adminPasswordOrKey": { - "type": "secureString", - "metadata": { - "description": "SSH Key or password for the Virtual Machine. SSH key is recommended." 
- } - }, - "dnsLabelPrefix": { - "type": "string", - "defaultValue": "[toLower(format('simplelinuxvm-{0}', uniqueString(resourceGroup().id)))]", - "metadata": { - "description": "Unique DNS Name for the Public IP used to access the Virtual Machine." - } - }, - "imagePublisher": { - "type": "string", - "defaultValue": "Canonical" - }, - "imageOffer": { - "type": "string", - "defaultValue": "UbuntuServer" - }, - "imageVersion": { - "type": "string", - "defaultValue": "latest" - }, - "imageSku": { - "type": "string", - "defaultValue": "18.04-LTS" - }, - "location": { - "type": "string", - "defaultValue": "[resourceGroup().location]", - "metadata": { - "description": "Location for all resources." - } - }, - "vmSize": { - "type": "string", - "defaultValue": "Standard_B2s", - "metadata": { - "description": "The size of the VM" - } - }, - "virtualNetworkName": { - "type": "string", - "defaultValue": "vNet", - "metadata": { - "description": "Name of the VNET" - } - }, - "subnetName": { - "type": "string", - "defaultValue": "Subnet", - "metadata": { - "description": "Name of the subnet in the virtual network" - } - }, - "networkSecurityGroupName": { - "type": "string", - "defaultValue": "SecGroupNet", - "metadata": { - "description": "Name of the Network Security Group" - } - } - }, - "functions": [], - "variables": { - "publicIPAddressName": "[format('{0}PublicIP', parameters('vmName'))]", - "networkInterfaceName": "[format('{0}NetInt', parameters('vmName'))]", - "osDiskType": "Standard_LRS", - "subnetAddressPrefix": "10.1.0.0/24", - "addressPrefix": "10.1.0.0/16", - "linuxConfiguration": { - "disablePasswordAuthentication": true, - "ssh": { - "publicKeys": [ - { - "path": "[format('/home/{0}/.ssh/authorized_keys', parameters('adminUsername'))]", - "keyData": "[parameters('adminPasswordOrKey')]" - } - ] - } - } - }, - "resources": [ - { - "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2020-06-01", - "name": "[variables('networkInterfaceName')]", - "location": "[parameters('location')]", - "properties": { - "ipConfigurations": [ - { - "name": "ipconfig1", - "properties": { - "subnet": { - "id": "[resourceId('Microsoft.Network/virtualNetworks/subnets', parameters('virtualNetworkName'), parameters('subnetName'))]" - }, - "privateIPAllocationMethod": "Dynamic", - "publicIPAddress": { - "id": "[resourceId('Microsoft.Network/publicIPAddresses', variables('publicIPAddressName'))]" - } - } - } - ], - "networkSecurityGroup": { - "id": "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('networkSecurityGroupName'))]" - } - }, - "dependsOn": [ - "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('networkSecurityGroupName'))]", - "[resourceId('Microsoft.Network/publicIPAddresses', variables('publicIPAddressName'))]", - "[resourceId('Microsoft.Network/virtualNetworks/subnets', parameters('virtualNetworkName'), parameters('subnetName'))]" - ] - }, - { - "type": "Microsoft.Network/networkSecurityGroups", - "apiVersion": "2020-06-01", - "name": "[parameters('networkSecurityGroupName')]", - "location": "[parameters('location')]", - "properties": { - "securityRules": [ - { - "name": "SSH_service_tag", - "properties": { - "priority": 100, - "protocol": "Tcp", - "access": "Allow", - "direction": "Inbound", - "sourceAddressPrefix": "*", - "sourcePortRange": "*", - "destinationAddressPrefix": "*", - "destinationPortRange": "22" - } - } - ] - } - }, - { - "type": "Microsoft.Network/virtualNetworks", - "apiVersion": "2020-06-01", - "name": 
"[parameters('virtualNetworkName')]", - "location": "[parameters('location')]", - "dependsOn": [ - "[concat('Microsoft.Network/networkSecurityGroups/', parameters('networkSecurityGroupName'))]" - ], - "properties": { - "addressSpace": { - "addressPrefixes": [ - "[variables('addressPrefix')]" - ] - } - } - }, - { - "type": "Microsoft.Network/virtualNetworks/subnets", - "apiVersion": "2020-06-01", - "name": "[format('{0}/{1}', parameters('virtualNetworkName'), parameters('subnetName'))]", - "properties": { - "addressPrefix": "[variables('subnetAddressPrefix')]", - "privateEndpointNetworkPolicies": "Enabled", - "privateLinkServiceNetworkPolicies": "Enabled", - "networkSecurityGroup": { - "id": "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('networkSecurityGroupName'))]" - } - }, - "dependsOn": [ - "[resourceId('Microsoft.Network/virtualNetworks', parameters('virtualNetworkName'))]" - ] - }, - { - "type": "Microsoft.Network/publicIPAddresses", - "apiVersion": "2020-06-01", - "name": "[variables('publicIPAddressName')]", - "location": "[parameters('location')]", - "sku": { - "name": "Basic" - }, - "properties": { - "publicIPAllocationMethod": "Dynamic", - "publicIPAddressVersion": "IPv4", - "dnsSettings": { - "domainNameLabel": "[parameters('dnsLabelPrefix')]" - }, - "idleTimeoutInMinutes": 4 - } - }, - { - "type": "Microsoft.Compute/virtualMachines", - "apiVersion": "2020-06-01", - "name": "[parameters('vmName')]", - "location": "[parameters('location')]", - "properties": { - "hardwareProfile": { - "vmSize": "[parameters('vmSize')]" - }, - "storageProfile": { - "osDisk": { - "createOption": "FromImage", - "managedDisk": { - "storageAccountType": "[variables('osDiskType')]" - }, - "diskSizeGB": 32 - }, - "imageReference": { - "publisher": "[parameters('imagePublisher')]", - "offer": "[parameters('imageOffer')]", - "sku": "[parameters('imageSku')]", - "version": "[parameters('imageVersion')]" - } - }, - "networkProfile": { - "networkInterfaces": [ - { - "id": "[resourceId('Microsoft.Network/networkInterfaces', variables('networkInterfaceName'))]" - } - ] - }, - "osProfile": { - "computerName": "[parameters('vmName')]", - "adminUsername": "[parameters('adminUsername')]", - "adminPassword": "[parameters('adminPasswordOrKey')]", - "linuxConfiguration": "[if(equals(parameters('authenticationType'), 'password'), null(), variables('linuxConfiguration'))]" - } - }, - "dependsOn": [ - "[resourceId('Microsoft.Network/networkInterfaces', variables('networkInterfaceName'))]" - ] - } - ], - "outputs": { - "adminUsername": { - "type": "string", - "value": "[parameters('adminUsername')]" - }, - "hostname": { - "type": "string", - "value": "[reference(resourceId('Microsoft.Network/publicIPAddresses', variables('publicIPAddressName'))).dnsSettings.fqdn]" - }, - "sshCommand": { - "type": "string", - "value": "[format('ssh {0}@{1}', parameters('adminUsername'), reference(resourceId('Microsoft.Network/publicIPAddresses', variables('publicIPAddressName'))).dnsSettings.fqdn)]" - } - } -} \ No newline at end of file diff --git a/dcr/templates/setup-vm-and-execute-tests.yml b/dcr/templates/setup-vm-and-execute-tests.yml deleted file mode 100644 index b5779d7db..000000000 --- a/dcr/templates/setup-vm-and-execute-tests.yml +++ /dev/null @@ -1,207 +0,0 @@ -parameters: - - name: scenarios - type: object - - - name: distros - type: object - - - name: rgPrefix - type: string - -jobs: - - job: "CreateVM" - displayName: "Setup VM and Run Test" - - strategy: - matrix: - ${{ each distro in parameters.distros 
}}: - ${{ each scenario in parameters.scenarios }}: - ${{ format('{0}-{1}', distro.name, scenario) }}: - scenarioName: ${{ scenario }} - imagePublisher: ${{ distro.publisher }} - imageOffer: ${{ distro.offer }} - imageSku: ${{ distro.sku }} - imageVersion: ${{ distro.version }} - distroName: ${{ distro.name }} - distroSetupPath: ${{ distro.setupPath }} - rgName: ${{ format('{0}-{1}-{2}', parameters.rgPrefix, scenario, distro.name) }} - maxParallel: 50 - - steps: - - task: InstallSSHKey@0 - displayName: 'Install SSH Key to agent' - name: "InstallKey" - inputs: - knownHostsEntry: 'github.com $(SSH_PUBLIC)' # Adding a dummy known host for github.com as leaving it empty is not allowed by this task - sshPublicKey: '$(SSH_PUBLIC)' - sshKeySecureFile: 'id_rsa' - - - task: AzureKeyVault@2 - displayName: "Fetch secrets from KV" - inputs: - azureSubscription: '$(azureConnection)' - KeyVaultName: 'dcrV2SPs' - SecretsFilter: '*' - RunAsPreJob: true - - - task: UsePythonVersion@0 - displayName: "Set host python version" - inputs: - versionSpec: '3.7' - addToPath: true - architecture: 'x64' - - - script: | - mkdir -p "$(Build.ArtifactStagingDirectory)/harvest" - displayName: "Create harvest directories" - - - bash: $(Build.SourcesDirectory)/dcr/scripts/build_agent_zip.sh - displayName: "Build Agent Zip" - - - bash: $(Build.SourcesDirectory)/dcr/scripts/get_pypy.sh - displayName: "Get PyPy" - - - bash: $(Build.SourcesDirectory)/dcr/scripts/move_scenario.sh - displayName: "Move scenarios" - - - script: pip install -r $(Build.SourcesDirectory)/dcr/requirements.txt - displayName: "Install pip modules on orchestrator" - - - task: PythonScript@0 - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scripts/orchestrator/set_environment.py' - env: - PYTHONPATH: $(Build.SourcesDirectory) - displayName: "Set Environment" - - - task: AzureResourceManagerTemplateDeployment@3 - name: "deployVM" - timeoutInMinutes: 10 - inputs: - deploymentScope: 'Resource Group' - azureResourceManagerConnection: '$(azureConnection)' - subscriptionId: '$(subId)' - action: 'Create Or Update Resource Group' - resourceGroupName: '$(rgName)' - location: '$(location)' - templateLocation: 'Linked artifact' - csmFile: '$(templateFile)' - csmParametersFile: '$(parametersFile)' - overrideParameters: '-vmName "$(vmName)" -adminUsername "$(adminUsername)" -adminPasswordOrKey "$(SSH_PUBLIC)" -imagePublisher "$(imagePublisher)" -imageOffer "$(imageOffer)" -imageSku $(imageSku) -imageVersion $(imageVersion)' - deploymentMode: 'Complete' - deploymentOutputs: 'armDeploymentOutput' - - - task: AzureCLI@2 - displayName: "Get VMIp" - inputs: - azureSubscription: '$(azureConnection)' - scriptType: 'bash' - scriptLocation: 'inlineScript' - inlineScript: | - az vm list-ip-addresses --resource-group $(rgName) --name $(vmName) --query "[].virtualMachine.network.publicIpAddresses[0].ipAddress" --output tsv > $(Build.SourcesDirectory)/dcr/.vm_ips || echo "No VM Ips" - az vmss list-instance-public-ips --name $(vmName) --resource-group $(rgName) --query "[].ipAddress" --output tsv > $(Build.SourcesDirectory)/dcr/.vmss_ips || echo "No VMSS IPs" - - - script: | - printenv > $(Build.SourcesDirectory)/dcr/.env - displayName: 'Get all environment variables' - name: 'setOutputVars' - - - task: PythonScript@0 - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scripts/orchestrator/execute_ssh_on_vm.py' - arguments: 'setup_vm' - env: - PYTHONPATH: $(Build.SourcesDirectory) - displayName: "Setup test VM" - - - 
task: PythonScript@0 - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scripts/orchestrator/execute_ssh_on_vm.py' - arguments: '"sudo bash /home/$(adminUsername)/$(distroSetupPath)"' - env: - PYTHONPATH: $(Build.SourcesDirectory) - condition: and(succeeded(), not(eq(variables.distroSetupPath, ''))) - displayName: 'Execute Distro Setup on test VM' - - - task: PythonScript@0 - name: "runScenarioSetup" - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scripts/orchestrator/execute_ssh_on_vm.py' - arguments: '"sudo bash /home/$(adminUsername)/dcr/scenario/setup.sh"' - env: - PYTHONPATH: $(Build.SourcesDirectory) - condition: and(succeeded(), eq(variables.runScenarioSetup, 'true')) - displayName: "Execute Scenario Setup on test VM" - - # This task is needed to ensure we execute the following tasks even if a single one of them fails - - bash: echo "##vso[task.setvariable variable=executeTests]true" - displayName: "Start executing tests" - - - task: PythonScript@0 - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scenario/run.host.py' - env: - PYTHONPATH: $(Build.SourcesDirectory) - # Add all KeyVault secrets explicitly as they're not added by default to the environment vars - AZURE_CLIENT_ID: $(AZURE-CLIENT-ID) - AZURE_CLIENT_SECRET: $(AZURE-CLIENT-SECRET) - AZURE_TENANT_ID: $(AZURE-TENANT-ID) - displayName: "Run the test file on the Orchestrator" - condition: and(eq(variables.executeTests, 'true'), eq(variables.runHost, 'true')) - - - task: PythonScript@0 - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scripts/orchestrator/execute_ssh_on_vm.py' - arguments: '"sudo PYTHONPATH=. $(pypyPath) dcr/scenario/run.py"' - env: - PYTHONPATH: $(Build.SourcesDirectory) - condition: and(eq(variables.executeTests, 'true'), eq(variables.runPy, 'true')) - displayName: "Execute test suite on VM" - - - task: PythonScript@0 - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scripts/orchestrator/execute_ssh_on_vm.py' - arguments: 'fetch_results' - env: - PYTHONPATH: $(Build.SourcesDirectory) - condition: eq(variables.executeTests, 'true') - displayName: 'Fetch test results' - - - task: PythonScript@0 - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scripts/orchestrator/generate_test_files.py' - arguments: '"$(Build.ArtifactStagingDirectory)/test-result*.xml"' - env: - PYTHONPATH: $(Build.SourcesDirectory) - condition: eq(variables.executeTests, 'true') - displayName: 'Merge test results' - - - task: PublishTestResults@2 - condition: eq(variables.executeTests, 'true') - inputs: - testResultsFormat: 'JUnit' - testResultsFiles: '$(Build.ArtifactStagingDirectory)/test-result*.xml' - testRunTitle: 'Publish test results for $(scenarioName)-$(distroName)' - - - task: PythonScript@0 - inputs: - scriptSource: 'filePath' - scriptPath: '$(Build.SourcesDirectory)/dcr/scripts/orchestrator/execute_ssh_on_vm.py' - arguments: 'harvest' - env: - PYTHONPATH: $(Build.SourcesDirectory) - condition: and(failed(), eq(variables.executeTests, 'true')) - displayName: 'Fetch Harvest results' - - - publish: $(Build.ArtifactStagingDirectory)/harvest - artifact: $(rgName)-harvest - condition: and(failed(), eq(variables.executeTests, 'true')) - displayName: 'Publish Harvest logs' diff --git a/dcr/templates/vars.yml b/dcr/templates/vars.yml deleted file mode 100644 index 6b41ac6a1..000000000 --- a/dcr/templates/vars.yml +++ /dev/null @@ -1,21 +0,0 @@ -# 
Template file for the common variables between the 2 pipelines - -variables: - rgPrefix: 'dcr-v2-test' - - vmName: 'dcrLinuxVM' - adminUsername: 'dcr' - - # Public Cloud Data - azureConnection: 'AzLinux DCR Public (8e037ad4-618f-4466-8bc8-5099d41ac15b)' - subId: '8e037ad4-618f-4466-8bc8-5099d41ac15b' - location: 'East US 2' - - # ToDo: Create new pipelines for Fairfax and Mooncake - fairfaxConn: 'VMGuestAgentAndExtensionsFairfax (8e5abcac-74f0-4955-9dfb-fe3fe36f8d19)' - fairfaxSub: '8e5abcac-74f0-4955-9dfb-fe3fe36f8d19' - fairfaxLocation: 'usgovarizona' - - mooncakeConn: 'Guest Agent Mooncake ( 557a8daa-8ac8-4caa-88e4-3b6f939978b9 )' - mooncakeSub: '557a8daa-8ac8-4caa-88e4-3b6f939978b9' - mooncakeLocation: 'china north 2' \ No newline at end of file diff --git a/makepkg.py b/makepkg.py index 11e90b95a..5ec04d5d8 100755 --- a/makepkg.py +++ b/makepkg.py @@ -1,14 +1,14 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +import argparse import glob -import os +import logging import os.path import shutil import subprocess import sys -from azurelinuxagent.common.version import AGENT_NAME, AGENT_VERSION, \ - AGENT_LONG_VERSION +from azurelinuxagent.common.version import AGENT_NAME, AGENT_VERSION, AGENT_LONG_VERSION from azurelinuxagent.ga.update import AGENT_MANIFEST_FILE MANIFEST = '''[{{ @@ -48,62 +48,77 @@ PUBLISH_MANIFEST_FILE = 'manifest.xml' -output_path = os.path.join(os.getcwd(), "eggs") # pylint: disable=invalid-name -target_path = os.path.join(output_path, AGENT_LONG_VERSION) # pylint: disable=invalid-name -bin_path = os.path.join(target_path, "bin") # pylint: disable=invalid-name -egg_path = os.path.join(bin_path, AGENT_LONG_VERSION + ".egg") # pylint: disable=invalid-name -manifest_path = os.path.join(target_path, AGENT_MANIFEST_FILE) # pylint: disable=invalid-name -publish_manifest_path = os.path.join(target_path, PUBLISH_MANIFEST_FILE) # pylint: disable=invalid-name -pkg_name = os.path.join(output_path, AGENT_LONG_VERSION + ".zip") # pylint: disable=invalid-name -family = 'Test' # pylint: disable=C0103 -if len(sys.argv) > 1: - family = sys.argv[1] # pylint: disable=invalid-name - -def do(*args): # pylint: disable=C0103,W0621 +def do(*args): try: - subprocess.check_output(args, stderr=subprocess.STDOUT) + return subprocess.check_output(args, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: # pylint: disable=C0103 - print("ERROR: {0}".format(str(e))) - print("\t{0}".format(" ".join(args))) - print(e.output) - sys.exit(1) - + raise Exception("[{0}] failed:\n{1}\n{2}".format(" ".join(args), str(e), e.output)) + + +def run(agent_family, output_directory, log): + output_path = os.path.join(output_directory, "eggs") + target_path = os.path.join(output_path, AGENT_LONG_VERSION) + bin_path = os.path.join(target_path, "bin") + egg_path = os.path.join(bin_path, AGENT_LONG_VERSION + ".egg") + manifest_path = os.path.join(target_path, AGENT_MANIFEST_FILE) + publish_manifest_path = os.path.join(target_path, PUBLISH_MANIFEST_FILE) + pkg_name = os.path.join(output_path, AGENT_LONG_VERSION + ".zip") + + if os.path.isdir(target_path): + shutil.rmtree(target_path) + elif os.path.isfile(target_path): + os.remove(target_path) + if os.path.isfile(pkg_name): + os.remove(pkg_name) + os.makedirs(bin_path) + log.info("Created {0} directory".format(target_path)) + + setup_path = os.path.join(os.path.dirname(__file__), "setup.py") + args = ["python3", setup_path, "bdist_egg", "--dist-dir={0}".format(bin_path)] + + log.info("Creating egg {0}".format(egg_path)) + do(*args) + + egg_name = 
os.path.join("bin", os.path.basename( + glob.glob(os.path.join(bin_path, "*"))[0])) + + log.info("Writing {0}".format(manifest_path)) + with open(manifest_path, mode='w') as manifest: + manifest.write(MANIFEST.format(AGENT_NAME, egg_name)) + + log.info("Writing {0}".format(publish_manifest_path)) + with open(publish_manifest_path, mode='w') as publish_manifest: + publish_manifest.write(PUBLISH_MANIFEST.format(AGENT_VERSION, agent_family)) + + cwd = os.getcwd() + os.chdir(target_path) + try: + log.info("Creating package {0}".format(pkg_name)) + do("zip", "-r", pkg_name, egg_name) + do("zip", "-j", pkg_name, AGENT_MANIFEST_FILE) + do("zip", "-j", pkg_name, PUBLISH_MANIFEST_FILE) + finally: + os.chdir(cwd) -if os.path.isdir(target_path): - shutil.rmtree(target_path) -elif os.path.isfile(target_path): - os.remove(target_path) -if os.path.isfile(pkg_name): - os.remove(pkg_name) -os.makedirs(bin_path) -print("Created {0} directory".format(target_path)) + log.info("Package {0} successfully created".format(pkg_name)) -args = ["python", "setup.py", "bdist_egg", "--dist-dir={0}".format(bin_path)] # pylint: disable=invalid-name -print("Creating egg {0}".format(egg_path)) -do(*args) +if __name__ == "__main__": + logging.basicConfig(format='%(message)s', level=logging.INFO) -egg_name = os.path.join("bin", os.path.basename( # pylint: disable=invalid-name - glob.glob(os.path.join(bin_path, "*"))[0])) + parser = argparse.ArgumentParser() + parser.add_argument('family', metavar='family', nargs='?', default='Test', help='Agent family') + parser.add_argument('-o', '--output', default=os.getcwd(), help='Output directory') -print("Writing {0}".format(manifest_path)) -with open(manifest_path, mode='w') as manifest: - manifest.write(MANIFEST.format(AGENT_NAME, egg_name)) + arguments = parser.parse_args() -print("Writing {0}".format(publish_manifest_path)) -with open(publish_manifest_path, mode='w') as publish_manifest: - publish_manifest.write(PUBLISH_MANIFEST.format(AGENT_VERSION, - family)) + try: + run(arguments.family, arguments.output, logging) -cwd = os.getcwd() # pylint: disable=invalid-name -os.chdir(target_path) -print("Creating package {0}".format(pkg_name)) -do("zip", "-r", pkg_name, egg_name) -do("zip", "-j", pkg_name, AGENT_MANIFEST_FILE) -do("zip", "-j", pkg_name, PUBLISH_MANIFEST_FILE) -os.chdir(cwd) + except Exception as exception: + logging.error(str(exception)) + sys.exit(1) -print("Package {0} successfully created".format(pkg_name)) -sys.exit(0) + sys.exit(0) diff --git a/setup.py b/setup.py index 17d130867..8f5d92b42 100755 --- a/setup.py +++ b/setup.py @@ -248,6 +248,12 @@ def get_data_files(name, version, fullname): # pylint: disable=R0912 set_conf_files(data_files, src=["config/photonos/waagent.conf"]) set_systemd_files(data_files, dest=systemd_dir_path, src=["init/photonos/waagent.service"]) + elif name == 'fedora': + set_bin_files(data_files, dest=agent_bin_path) + set_conf_files(data_files) + set_logrotate_files(data_files) + set_udev_files(data_files) + set_systemd_files(data_files, dest=systemd_dir_path) else: # Use default setting set_bin_files(data_files, dest=agent_bin_path) @@ -282,7 +288,10 @@ def initialize_options(self): self.lnx_distro_version = DISTRO_VERSION self.lnx_distro_fullname = DISTRO_FULL_NAME self.register_service = False - self.skip_data_files = False + # All our data files are system-wide files that are not included in the egg; skip them when + # creating an egg. 
+ self.skip_data_files = "bdist_egg" in sys.argv + # pylint: enable=attribute-defined-outside-init def finalize_options(self): diff --git a/test-requirements.txt b/test-requirements.txt index f335db282..3c54ab997 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -13,3 +13,9 @@ wrapt==1.12.0; python_version > '2.6' and python_version < '3.6' pylint; python_version > '2.6' and python_version < '3.6' pylint==2.8.3; python_version >= '3.6' +# Requirements to run pylint on the end-to-end tests source code +assertpy +azure-core +azure-identity +azure-mgmt-compute>=22.1.0 +azure-mgmt-resource>=15.0.0 diff --git a/tests/common/mock_cgroup_environment.py b/tests/common/mock_cgroup_environment.py index 10d499077..e38471060 100644 --- a/tests/common/mock_cgroup_environment.py +++ b/tests/common/mock_cgroup_environment.py @@ -91,6 +91,7 @@ class UnitFilePaths: walinuxagent = "/lib/systemd/system/walinuxagent.service" + logcollector = "/lib/systemd/system/azure-walinuxagent-logcollector.slice" azure = "/lib/systemd/system/azure.slice" vmextensions = "/lib/systemd/system/azure-vmextensions.slice" extensionslice = "/lib/systemd/system/azure-vmextensions-Microsoft.CPlat.Extension.slice" diff --git a/tests/common/osutil/test_default.py b/tests/common/osutil/test_default.py index ac5510243..ab4fa5c99 100644 --- a/tests/common/osutil/test_default.py +++ b/tests/common/osutil/test_default.py @@ -35,7 +35,7 @@ from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.utils.networkutil import AddFirewallRules from tests.common.mock_environment import MockEnvironment -from tests.tools import AgentTestCase, patch, open_patch, load_data, data_dir +from tests.tools import AgentTestCase, patch, open_patch, load_data, data_dir, is_python_version_26_or_34, skip_if_predicate_true actual_get_proc_net_route = 'azurelinuxagent.common.osutil.default.DefaultOSUtil._get_proc_net_route' @@ -950,6 +950,7 @@ def test_remove_firewall_should_not_retry_invalid_rule(self): self.assertFalse(osutil._enable_firewall) + @skip_if_predicate_true(is_python_version_26_or_34, "Disabled on Python 2.6 and 3.4 for now. 
Need to revisit to fix it") def test_get_nic_state(self): state = osutil.DefaultOSUtil().get_nic_state() self.assertNotEqual(state, {}) diff --git a/tests/common/test_cgroupconfigurator.py b/tests/common/test_cgroupconfigurator.py index 62a7211a7..7e2dc45b4 100644 --- a/tests/common/test_cgroupconfigurator.py +++ b/tests/common/test_cgroupconfigurator.py @@ -33,12 +33,13 @@ from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator, DisableCgroups from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.event import WALAEventOperation -from azurelinuxagent.common.exception import CGroupsException, ExtensionError, ExtensionErrorCodes +from azurelinuxagent.common.exception import CGroupsException, ExtensionError, ExtensionErrorCodes, \ + AgentMemoryExceededException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import shellutil, fileutil from tests.common.mock_environment import MockCommand from tests.common.mock_cgroup_environment import mock_cgroup_environment, UnitFilePaths -from tests.tools import AgentTestCase, patch, mock_sleep, i_am_root, data_dir +from tests.tools import AgentTestCase, patch, mock_sleep, i_am_root, data_dir, is_python_version_26_or_34, skip_if_predicate_true from tests.utils.miscellaneous_tools import format_processes, wait_for @@ -187,6 +188,27 @@ def test_initialize_should_create_unit_files_when_the_agent_service_file_is_not_ self.assertTrue(os.path.exists(agent_drop_in_file_cpu_accounting), "{0} was not created".format(agent_drop_in_file_cpu_accounting)) self.assertTrue(os.path.exists(agent_drop_in_file_memory_accounting), "{0} was not created".format(agent_drop_in_file_memory_accounting)) + def test_initialize_should_update_logcollector_memorylimit(self): + with self._get_cgroup_configurator(initialize=False) as configurator: + log_collector_unit_file = configurator.mocks.get_mapped_path(UnitFilePaths.logcollector) + original_memory_limit = "MemoryLimit=30M" + + # The mock creates the slice unit file with memory limit + configurator.mocks.add_data_file(os.path.join(data_dir, 'init', "azure-walinuxagent-logcollector.slice"), + UnitFilePaths.logcollector) + if not os.path.exists(log_collector_unit_file): + raise Exception("{0} should have been created during test setup".format(log_collector_unit_file)) + if not fileutil.findre_in_file(log_collector_unit_file, original_memory_limit): + raise Exception("MemoryLimit was not set correctly. Expected: {0}. Got:\n{1}".format( + original_memory_limit, fileutil.read_file(log_collector_unit_file))) + + configurator.initialize() + + # initialize() should update the unit file to remove the memory limit + self.assertFalse(fileutil.findre_in_file(log_collector_unit_file, original_memory_limit), + "Log collector slice unit file was not updated correctly. Expected no memory limit. Got:\n{0}".format( + fileutil.read_file(log_collector_unit_file))) + def test_setup_extension_slice_should_create_unit_files(self): with self._get_cgroup_configurator() as configurator: # get the paths to the mocked files @@ -504,6 +526,7 @@ def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_d self.assertEqual(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created") + @skip_if_predicate_true(is_python_version_26_or_34, "Disabled on Python 2.6 and 3.4 for now. 
Need to revisit to fix it") @attr('requires_sudo') @patch('time.sleep', side_effect=lambda _: mock_sleep()) def test_start_extension_command_should_not_use_fallback_option_if_extension_fails(self, *args): @@ -541,6 +564,7 @@ def test_start_extension_command_should_not_use_fallback_option_if_extension_fai # wasn't truncated. self.assertIn("Running scope as unit", ustr(context_manager.exception)) + @skip_if_predicate_true(is_python_version_26_or_34, "Disabled on Python 2.6 and 3.4 for now. Need to revisit to fix it") @attr('requires_sudo') @patch('time.sleep', side_effect=lambda _: mock_sleep()) @patch("azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN", 5) @@ -986,3 +1010,15 @@ def test_check_cgroups_should_disable_cgroups_when_a_check_fails(self): finally: for p in patchers: p.stop() + + def test_check_agent_memory_usage_should_raise_a_cgroups_exception_when_the_limit_is_exceeded(self): + metrics = [MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, AGENT_NAME_TELEMETRY, conf.get_agent_memory_quota() + 1), + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, AGENT_NAME_TELEMETRY, conf.get_agent_memory_quota() + 1)] + + with self.assertRaises(AgentMemoryExceededException) as context_manager: + with self._get_cgroup_configurator() as configurator: + with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_tracked_metrics") as tracked_metrics: + tracked_metrics.return_value = metrics + configurator.check_agent_memory_usage() + + self.assertIn("The agent memory limit {0} bytes exceeded".format(conf.get_agent_memory_quota()), ustr(context_manager.exception), "An incorrect exception was raised") \ No newline at end of file diff --git a/tests/common/test_event.py b/tests/common/test_event.py index 7191f3c30..de5ad7353 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -44,7 +44,7 @@ from tests.protocol import mockwiredata from tests.protocol.mocks import mock_wire_protocol, MockHttpResponse from tests.protocol.HttpRequestPredicates import HttpRequestPredicates -from tests.tools import AgentTestCase, data_dir, load_data, patch, skip_if_predicate_true +from tests.tools import AgentTestCase, data_dir, load_data, patch, skip_if_predicate_true, is_python_version_26_or_34 from tests.utils.event_logger_tools import EventLoggerTools @@ -161,12 +161,12 @@ def create_event_and_return_container_id(): # pylint: disable=inconsistent-retu self.assertEqual(contained_id, 'c6d5526c-5ac2-4200-b6e2-56f2b70c5ab2', "Incorrect container ID") protocol.mock_wire_data.set_container_id('AAAAAAAA-BBBB-CCCC-DDDD-EEEEEEEEEEEE') - protocol.update_goal_state() + protocol.client.update_goal_state() contained_id = create_event_and_return_container_id() self.assertEqual(contained_id, 'AAAAAAAA-BBBB-CCCC-DDDD-EEEEEEEEEEEE', "Incorrect container ID") protocol.mock_wire_data.set_container_id('11111111-2222-3333-4444-555555555555') - protocol.update_goal_state() + protocol.client.update_goal_state() contained_id = create_event_and_return_container_id() self.assertEqual(contained_id, '11111111-2222-3333-4444-555555555555', "Incorrect container ID") @@ -414,6 +414,7 @@ def test_collect_events_should_be_able_to_process_events_with_non_ascii_characte self.assertEqual(len(event_list), 1) self.assertEqual(TestEvent._get_event_message(event_list[0]), u'World\u05e2\u05d9\u05d5\u05ea \u05d0\u05d7\u05e8\u05d5\u05ea\u0906\u091c') + @skip_if_predicate_true(is_python_version_26_or_34, "Disabled on Python 2.6 and 3.4 for now. 
Need to revisit to fix it") def test_collect_events_should_ignore_invalid_event_files(self): self._create_test_event_file("custom_script_1.tld") # a valid event self._create_test_event_file("custom_script_utf-16.tld") diff --git a/tests/data/events/extension_events/int_type/1519934744.json b/tests/data/events/extension_events/int_type/1519934744.json new file mode 100644 index 000000000..01773a9ad --- /dev/null +++ b/tests/data/events/extension_events/int_type/1519934744.json @@ -0,0 +1,10 @@ +{ + "EventLevel": "INFO", + "Message": "Accept int value for eventpid and eventtid", + "Version": "1", + "TaskName": "Downloading files", + "EventPid": 3228, + "EventTid": 1, + "OpErAtiOnID": "519e4beb-018a-4bd9-8d8e-c5226cf7f56e", + "TimeStamp": "2023-03-13T01:21:05.1960563Z" +} \ No newline at end of file diff --git a/tests/data/ga/WALinuxAgent-9.9.9.9-no_manifest.zip b/tests/data/ga/WALinuxAgent-9.9.9.9-no_manifest.zip new file mode 100644 index 000000000..8d84af378 Binary files /dev/null and b/tests/data/ga/WALinuxAgent-9.9.9.9-no_manifest.zip differ diff --git a/tests/data/ga/fake_extension.zip b/tests/data/ga/fake_extension.zip new file mode 100644 index 000000000..ae4a38d2e Binary files /dev/null and b/tests/data/ga/fake_extension.zip differ diff --git a/tests/data/init/azure-walinuxagent-logcollector.slice b/tests/data/init/azure-walinuxagent-logcollector.slice new file mode 100644 index 000000000..63c09d431 --- /dev/null +++ b/tests/data/init/azure-walinuxagent-logcollector.slice @@ -0,0 +1,9 @@ +[Unit] +Description=Slice for Azure VM Agent Periodic Log Collector +DefaultDependencies=no +Before=slices.target +[Slice] +CPUAccounting=yes +CPUQuota=5% +MemoryAccounting=yes +MemoryLimit=30M \ No newline at end of file diff --git a/tests/ga/test_collect_telemetry_events.py b/tests/ga/test_collect_telemetry_events.py index f429bd52a..bdd763eff 100644 --- a/tests/ga/test_collect_telemetry_events.py +++ b/tests/ga/test_collect_telemetry_events.py @@ -309,6 +309,16 @@ def test_it_should_parse_special_chars_properly(self): self._assert_handler_data_in_event_list(telemetry_events, extensions_with_count) + def test_it_should_parse_int_type_for_eventpid_or_eventtid_properly(self): + with self._create_extension_telemetry_processor() as extension_telemetry_processor: + extensions_with_count = self._create_random_extension_events_dir_with_events(2, os.path.join( + self._TEST_DATA_DIR, "int_type")) + + extension_telemetry_processor.run() + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) + + self._assert_handler_data_in_event_list(telemetry_events, extensions_with_count) + def _setup_and_assert_tests_for_max_sizes(self, no_of_extensions=2, expected_count=None): with self._create_extension_telemetry_processor() as extension_telemetry_processor: extensions_with_count = self._create_random_extension_events_dir_with_events(no_of_extensions, diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 1f261ee26..2272a1907 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -79,6 +79,7 @@ def raise_ioerror(*args): # pylint: disable=unused-argument e.errno = EIO raise e + class TestExtensionCleanup(AgentTestCase): def setUp(self): @@ -140,8 +141,6 @@ def test_cleanup_leaves_installed_extensions(self): exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() - self.assertEqual(no_of_exts, TestExtensionCleanup._count_packages(), - "No of extensions in config doesn't match the packages") self.assertEqual(no_of_exts, 
TestExtensionCleanup._count_extension_directories(), "No of extension directories doesnt match the no of extensions in GS") self._assert_ext_handler_status(protocol.aggregate_status, "Ready", expected_ext_handler_count=no_of_exts, @@ -151,8 +150,6 @@ def test_cleanup_removes_uninstalled_extensions(self): with self._setup_test_env(mockwiredata.DATA_FILE_MULTIPLE_EXT) as (exthandlers_handler, protocol, no_of_exts): exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() - self.assertEqual(no_of_exts, TestExtensionCleanup._count_packages(), - "No of extensions in config doesn't match the packages") self._assert_ext_handler_status(protocol.aggregate_status, "Ready", expected_ext_handler_count=no_of_exts, version="1.0.0") @@ -242,8 +239,6 @@ def assert_extension_seq_no(expected_seq_no): # Run 1 - GS has no required features and contains 5 extensions exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() - self.assertEqual(orig_no_of_exts, TestExtensionCleanup._count_packages(), - "No of extensions in config doesn't match the packages") self.assertEqual(orig_no_of_exts, TestExtensionCleanup._count_extension_directories(), "No of extension directories doesnt match the no of extensions in GS") self._assert_ext_handler_status(protocol.aggregate_status, "Ready", expected_ext_handler_count=orig_no_of_exts, @@ -261,8 +256,6 @@ def assert_extension_seq_no(expected_seq_no): exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() self.assertGreater(orig_no_of_exts, 1, "No of extensions to check should be > 1") - self.assertEqual(orig_no_of_exts, TestExtensionCleanup._count_packages(), - "No of extensions should not be changed") self.assertEqual(orig_no_of_exts, TestExtensionCleanup._count_extension_directories(), "No of extension directories should not be changed") self._assert_ext_handler_status(protocol.aggregate_status, "Ready", expected_ext_handler_count=orig_no_of_exts, @@ -286,8 +279,6 @@ def assert_extension_seq_no(expected_seq_no): protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() - self.assertEqual(1, TestExtensionCleanup._count_packages(), - "No of extensions should not be changed") self.assertEqual(1, TestExtensionCleanup._count_extension_directories(), "No of extension directories should not be changed") self._assert_ext_handler_status(protocol.aggregate_status, "Ready", expected_ext_handler_count=1, @@ -519,7 +510,7 @@ def _set_up_update_test_and_update_gs(self, patch_command, *args): test_data.set_incarnation(2) test_data.set_extensions_config_version("1.0.1") test_data.set_manifest_version('1.0.1') - protocol.update_goal_state() + protocol.client.update_goal_state() # Ensure the patched command fails patch_command.return_value = "exit 1" @@ -551,7 +542,7 @@ def test_ext_handler(self, *args): # Test goal state changed test_data.set_incarnation(2) test_data.set_extensions_config_sequence_number(1) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -564,7 +555,7 @@ def test_ext_handler(self, *args): test_data.set_incarnation(3) test_data.set_extensions_config_version("1.1.1") test_data.set_extensions_config_sequence_number(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -576,7 +567,7 @@ def test_ext_handler(self, *args): test_data.set_incarnation(4) 
test_data.set_extensions_config_version("1.2.0") test_data.set_extensions_config_sequence_number(3) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -587,7 +578,7 @@ def test_ext_handler(self, *args): # Test disable test_data.set_incarnation(5) test_data.set_extensions_config_state(ExtensionRequestedState.Disabled) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -597,7 +588,7 @@ def test_ext_handler(self, *args): # Test uninstall test_data.set_incarnation(6) test_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -606,7 +597,7 @@ def test_ext_handler(self, *args): # Test uninstall again! test_data.set_incarnation(7) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -629,7 +620,7 @@ def _assert_handler_status_and_manifest_download_count(protocol, test_data, mani # Update Incarnation test_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -682,120 +673,6 @@ def test_it_should_process_valid_extensions_if_present(self, mock_get, mock_cryp expected_handlers.remove(handler.name) self.assertEqual(0, len(expected_handlers), "All handlers not reported status") - def test_ext_zip_file_packages_removed_in_update_case(self, *args): - # Test enable scenario. - test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE) - exthandlers_handler, protocol = self._create_mock(test_data, *args) # pylint: disable=no-value-for-parameter - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_handler_status(protocol.report_vm_status, "Ready", 1, "1.0.0") - self._assert_ext_status(protocol.report_vm_status, "success", 0) - self._assert_ext_pkg_file_status(expected_to_be_present=True, extension_version="1.0.0") - - # Update the package - test_data.set_incarnation(2) - test_data.set_extensions_config_sequence_number(1) - test_data.set_extensions_config_version("1.1.0") - protocol.update_goal_state() - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_handler_status(protocol.report_vm_status, "Ready", 1, "1.1.0") - self._assert_ext_status(protocol.report_vm_status, "success", 1) - self._assert_ext_pkg_file_status(expected_to_be_present=False, extension_version="1.0.0") - self._assert_ext_pkg_file_status(expected_to_be_present=True, extension_version="1.1.0") - - # Update the package second time - test_data.set_incarnation(3) - test_data.set_extensions_config_sequence_number(2) - test_data.set_extensions_config_version("1.2.0") - protocol.update_goal_state() - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_handler_status(protocol.report_vm_status, "Ready", 1, "1.2.0") - self._assert_ext_status(protocol.report_vm_status, "success", 2) - self._assert_ext_pkg_file_status(expected_to_be_present=False, extension_version="1.1.0") - self._assert_ext_pkg_file_status(expected_to_be_present=True, extension_version="1.2.0") - - def test_ext_zip_file_packages_removed_in_uninstall_case(self, *args): - # Test enable 
scenario. - test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE) - exthandlers_handler, protocol = self._create_mock(test_data, *args) # pylint: disable=no-value-for-parameter - extension_version = "1.0.0" - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_handler_status(protocol.report_vm_status, "Ready", 1, extension_version) - self._assert_ext_status(protocol.report_vm_status, "success", 0) - self._assert_ext_pkg_file_status(expected_to_be_present=True, extension_version=extension_version) - - # Test uninstall - test_data.set_incarnation(2) - test_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) - protocol.update_goal_state() - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_no_handler_status(protocol.report_vm_status) - self._assert_ext_pkg_file_status(expected_to_be_present=False, extension_version=extension_version) - - def test_ext_zip_file_packages_removed_in_update_and_uninstall_case(self, *args): - # Test enable scenario. - test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE) - exthandlers_handler, protocol = self._create_mock(test_data, *args) # pylint: disable=no-value-for-parameter - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_handler_status(protocol.report_vm_status, "Ready", 1, "1.0.0") - self._assert_ext_status(protocol.report_vm_status, "success", 0) - self._assert_ext_pkg_file_status(expected_to_be_present=True, extension_version="1.0.0") - - # Update the package - test_data.set_incarnation(2) - test_data.set_extensions_config_sequence_number(1) - test_data.set_extensions_config_version("1.1.0") - protocol.update_goal_state() - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_handler_status(protocol.report_vm_status, "Ready", 1, "1.1.0") - self._assert_ext_status(protocol.report_vm_status, "success", 1) - self._assert_ext_pkg_file_status(expected_to_be_present=False, extension_version="1.0.0") - self._assert_ext_pkg_file_status(expected_to_be_present=True, extension_version="1.1.0") - - # Update the package second time - test_data.set_incarnation(3) - test_data.set_extensions_config_sequence_number(2) - test_data.set_extensions_config_version("1.2.0") - protocol.update_goal_state() - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_handler_status(protocol.report_vm_status, "Ready", 1, "1.2.0") - self._assert_ext_status(protocol.report_vm_status, "success", 2) - self._assert_ext_pkg_file_status(expected_to_be_present=False, extension_version="1.1.0") - self._assert_ext_pkg_file_status(expected_to_be_present=True, extension_version="1.2.0") - - # Test uninstall - test_data.set_incarnation(4) - test_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) - protocol.update_goal_state() - - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - - self._assert_no_handler_status(protocol.report_vm_status) - self._assert_ext_pkg_file_status(expected_to_be_present=False, extension_version="1.2.0") def test_it_should_ignore_case_when_parsing_plugin_settings(self, mock_get, mock_crypt_util, *args): test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE_CASE_MISMATCH_EXT) @@ -843,7 +720,7 @@ def test_ext_handler_no_settings(self, *args): # Uninstall the Plugin and make sure Disable called test_data.set_incarnation(2) 
test_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) - protocol.update_goal_state() + protocol.client.update_goal_state() with enable_invocations(test_ext) as invocation_record: exthandlers_handler.run() @@ -922,7 +799,7 @@ def test_ext_handler_sequencing(self, *args): dep_ext_level_4 = extension_emulator(name="OSTCExtensions.OtherExampleHandlerLinux") test_data.ext_conf = test_data.ext_conf.replace("dependencyLevel=\"2\"", "dependencyLevel=\"3\"") test_data.ext_conf = test_data.ext_conf.replace("dependencyLevel=\"1\"", "dependencyLevel=\"4\"") - protocol.update_goal_state() + protocol.client.update_goal_state() with enable_invocations(dep_ext_level_3, dep_ext_level_4) as invocation_record: exthandlers_handler.run() @@ -949,7 +826,7 @@ def test_ext_handler_sequencing(self, *args): # the last one disabled. test_data.set_incarnation(3) test_data.set_extensions_config_state(ExtensionRequestedState.Disabled) - protocol.update_goal_state() + protocol.client.update_goal_state() with enable_invocations(dep_ext_level_3, dep_ext_level_4) as invocation_record: exthandlers_handler.run() @@ -981,7 +858,7 @@ def test_ext_handler_sequencing(self, *args): dep_ext_level_6 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux") test_data.ext_conf = test_data.ext_conf.replace("dependencyLevel=\"3\"", "dependencyLevel=\"6\"") test_data.ext_conf = test_data.ext_conf.replace("dependencyLevel=\"4\"", "dependencyLevel=\"5\"") - protocol.update_goal_state() + protocol.client.update_goal_state() with enable_invocations(dep_ext_level_5, dep_ext_level_6) as invocation_record: exthandlers_handler.run() @@ -1061,7 +938,7 @@ def mock_fail_extension_commands(args, **kwargs): _assert_event_reported_only_on_incarnation_change(expected_count=1) test_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1072,7 +949,7 @@ def mock_fail_extension_commands(args, **kwargs): # Test it recovers on a new goal state if Handler succeeds test_data.set_incarnation(3) test_data.set_extensions_config_sequence_number(1) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1084,7 +961,7 @@ def mock_fail_extension_commands(args, **kwargs): # Update incarnation to confirm extension invocation order test_data.set_incarnation(4) - protocol.update_goal_state() + protocol.client.update_goal_state() dep_ext_level_2 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux") dep_ext_level_1 = extension_emulator(name="OSTCExtensions.OtherExampleHandlerLinux") @@ -1146,7 +1023,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test goal state changed test_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1157,7 +1034,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test minor version bump test_data.set_incarnation(3) test_data.set_extensions_config_version("1.1.0") - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1168,7 +1045,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test hotfix version bump test_data.set_incarnation(4) test_data.set_extensions_config_version("1.1.1") - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() 
exthandlers_handler.report_ext_handlers_status() @@ -1179,7 +1056,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test disable test_data.set_incarnation(5) test_data.set_extensions_config_state(ExtensionRequestedState.Disabled) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1189,7 +1066,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test uninstall test_data.set_incarnation(6) test_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1198,7 +1075,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test uninstall again! test_data.set_incarnation(7) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1208,7 +1085,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test re-install test_data.set_incarnation(8) test_data.set_extensions_config_state(ExtensionRequestedState.Enabled) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1219,7 +1096,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test version bump post-re-install test_data.set_incarnation(9) test_data.set_extensions_config_version("1.2.0") - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1230,7 +1107,7 @@ def test_ext_handler_rollingupgrade(self, *args): # Test rollback test_data.set_incarnation(10) test_data.set_extensions_config_version("1.1.0") - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1293,7 +1170,7 @@ def test_it_should_not_delete_extension_events_directory_on_extension_uninstall( # Uninstall extensions now test_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) test_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1315,7 +1192,7 @@ def test_it_should_uninstall_unregistered_extensions_properly(self, *args): # Since the installed version is not in PIR anymore, we need to also remove it from manifest file test_data.manifest = test_data.manifest.replace("1.0.0", "9.9.9") test_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1789,7 +1666,7 @@ def test_extensions_deleted(self, *args): test_data.set_incarnation(2) test_data.set_extensions_config_version("1.0.1") test_data.set_manifest_version('1.0.1') - protocol.update_goal_state() + protocol.client.update_goal_state() # Ensure new extension can be enabled exthandlers_handler.run() @@ -1876,7 +1753,7 @@ def test_disable_failure_with_exception_handling(self, patch_get_disable_command # Next incarnation, disable extension test_data.set_incarnation(2) test_data.set_extensions_config_state(ExtensionRequestedState.Disabled) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1907,7 +1784,7 @@ def test_uninstall_failure(self, 
patch_get_uninstall_command, *args): # Next incarnation, disable extension test_data.set_incarnation(2) test_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -1964,7 +1841,7 @@ def mock_popen(*args, **kwargs): # If the incarnation number changes (there's a new goal state), ensure we go through the entire upgrade # process again. test_data.set_incarnation(3) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -2011,7 +1888,7 @@ def test_extension_upgrade_failure_when_prev_version_disable_fails_and_recovers_ # Force a new goal state incarnation, only then will we attempt the upgrade again test_data.set_incarnation(3) - protocol.update_goal_state() + protocol.client.update_goal_state() # Ensure disable won't fail by making launch_command a no-op with patch('azurelinuxagent.ga.exthandlers.ExtHandlerInstance.launch_command') as patch_launch_command: # pylint: disable=unused-variable @@ -2234,7 +2111,7 @@ def test_uninstall_rc_env_var_should_report_not_run_for_non_update_calls_to_exth # Initiating another run which shouldn't have any failed env variables in it if no failures # Updating Incarnation test_data.set_incarnation(3) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -3478,7 +3355,7 @@ def http_get_handler(url, *_, **kwargs): # Update GoalState protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() with patch.object(conf, 'get_extensions_enabled', return_value=False): assert_extensions_called(exthandlers_handler, expected_call_count=0) @@ -3492,7 +3369,7 @@ def http_get_handler(url, *_, **kwargs): # Enabled on_hold property in artifact_blob mock_in_vm_artifacts_profile_response = MockHttpResponse(200, body='{ "onHold": true }'.encode('utf-8')) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() with patch.object(conf, 'get_extensions_enabled', return_value=True): with patch.object(conf, "get_enable_overprovisioning", return_value=True): @@ -3500,7 +3377,7 @@ def http_get_handler(url, *_, **kwargs): # Disabled on_hold property in artifact_blob mock_in_vm_artifacts_profile_response = MockHttpResponse(200, body='{ "onHold": false }'.encode('utf-8')) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() with patch.object(conf, 'get_extensions_enabled', return_value=True): with patch.object(conf, "get_enable_overprovisioning", return_value=True): @@ -3533,7 +3410,7 @@ def http_get_handler(url, *_, **kwargs): return None protocol.set_http_handlers(http_get_handler=http_get_handler) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() diff --git a/tests/ga/test_exthandlers.py b/tests/ga/test_exthandlers.py index 69079cf7f..67b077177 100644 --- a/tests/ga/test_exthandlers.py +++ b/tests/ga/test_exthandlers.py @@ -31,9 +31,8 @@ from azurelinuxagent.common.utils.extensionprocessutil import TELEMETRY_MESSAGE_MAX_LEN, format_stdout_stderr, \ read_output from azurelinuxagent.ga.exthandlers import parse_ext_status, ExtHandlerInstance, ExtCommandEnvVariable, \ - ExtensionStatusError, _DEFAULT_SEQ_NO -from 
tests.protocol import mockwiredata -from tests.protocol.mocks import mock_wire_protocol + ExtensionStatusError, _DEFAULT_SEQ_NO, get_exthandlers_handler, ExtHandlerState +from tests.protocol.mocks import mock_wire_protocol, mockwiredata from tests.tools import AgentTestCase, patch, mock_sleep, clear_singleton_instances @@ -288,6 +287,29 @@ def test_command_extension_log_truncates_correctly(self, mock_log_dir): with open(log_file_path) as truncated_log_file: self.assertEqual(truncated_log_file.read(), "{second_line}\n".format(second_line=second_line)) + def test_it_should_report_the_message_in_the_heartbeat(self): + def heartbeat_with_message(): + return {'code': 0, 'formattedMessage': {'lang': 'en-US', 'message': 'This is a heartbeat message'}, + 'status': 'ready'} + + with mock_wire_protocol(mockwiredata.DATA_FILE) as protocol: + with patch("azurelinuxagent.common.protocol.wire.WireProtocol.report_vm_status", return_value=None): + with patch("azurelinuxagent.ga.exthandlers.ExtHandlerInstance.collect_heartbeat", + side_effect=heartbeat_with_message): + with patch("azurelinuxagent.ga.exthandlers.ExtHandlerInstance.get_handler_state", + return_value=ExtHandlerState.Enabled): + with patch("azurelinuxagent.ga.exthandlers.ExtHandlerInstance.collect_ext_status", + return_value=None): + exthandlers_handler = get_exthandlers_handler(protocol) + exthandlers_handler.run() + vm_status = exthandlers_handler.report_ext_handlers_status() + ext_handler = vm_status.vmAgent.extensionHandlers[0] + self.assertEqual(ext_handler.message, + heartbeat_with_message().get('formattedMessage').get('message'), + "Extension handler messages don't match") + self.assertEqual(ext_handler.status, heartbeat_with_message().get('status'), + "Extension handler statuses don't match") + class LaunchCommandTestCase(AgentTestCase): """ Test cases for launch_command diff --git a/tests/ga/test_exthandlers_download_extension.py b/tests/ga/test_exthandlers_download_extension.py index 3a9683889..556254fa3 100644 --- a/tests/ga/test_exthandlers_download_extension.py +++ b/tests/ga/test_exthandlers_download_extension.py @@ -96,6 +96,9 @@ def _create_invalid_zip_file(filename): with open(filename, "w") as file: # pylint: disable=redefined-builtin file.write("An invalid ZIP file\n") + def _get_extension_base_dir(self): + return self.extension_dir + def _get_extension_package_file(self): return os.path.join(self.agent_dir, self.ext_handler_instance.get_extension_package_zipfile_name()) @@ -103,7 +106,7 @@ def _get_extension_command_file(self): return os.path.join(self.extension_dir, DownloadExtensionTestCase._extension_command) def _assert_download_and_expand_succeeded(self): - self.assertTrue(os.path.exists(self._get_extension_package_file()), "The extension package was not downloaded to the expected location") + self.assertTrue(os.path.exists(self._get_extension_base_dir()), "The extension package was not downloaded to the expected location") self.assertTrue(os.path.exists(self._get_extension_command_file()), "The extension package was not expanded to the expected location") @staticmethod @@ -246,9 +249,11 @@ def stream(_, destination, **__): self._assert_download_and_expand_succeeded() def test_it_should_raise_an_exception_when_all_downloads_fail(self): - def stream(_, __, **___): - DownloadExtensionTestCase._create_invalid_zip_file(self._get_extension_package_file()) + def stream(_, target_file, **___): + stream.target_file = target_file + DownloadExtensionTestCase._create_invalid_zip_file(target_file) return True + stream.target_file =
None with DownloadExtensionTestCase.create_mock_stream(stream) as mock_stream: with self.assertRaises(ExtensionDownloadError) as context_manager: @@ -260,5 +265,5 @@ def stream(_, __, **___): self.assertEqual(context_manager.exception.code, ExtensionErrorCodes.PluginManifestDownloadError) self.assertFalse(os.path.exists(self.extension_dir), "The extension directory was not removed") - self.assertFalse(os.path.exists(self._get_extension_package_file()), "The extension package was not removed") + self.assertFalse(os.path.exists(stream.target_file), "The extension package was not removed") diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index d5700bc91..5853b23ef 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -188,7 +188,7 @@ def setUp(self): CGroupsTelemetry.reset() clear_singleton_instances(ProtocolUtil) protocol = WireProtocol('endpoint') - protocol.update_goal_state = MagicMock() + protocol.client.update_goal_state = MagicMock() self.get_protocol = patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol', return_value=protocol) self.get_protocol.start() diff --git a/tests/ga/test_multi_config_extension.py b/tests/ga/test_multi_config_extension.py index a9a07bd67..365052f5d 100644 --- a/tests/ga/test_multi_config_extension.py +++ b/tests/ga/test_multi_config_extension.py @@ -253,7 +253,7 @@ def __setup_and_assert_disable_scenario(self, exthandlers_handler, protocol): self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, 'ext_conf_mc_disabled_extensions.xml') protocol.mock_wire_data = WireProtocolData(self.test_data) protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -319,7 +319,7 @@ def test_it_should_execute_and_report_multi_config_extensions_properly(self): # Case 3: Uninstall Multi-config handler (with enabled extensions) and single config extension protocol.mock_wire_data.set_incarnation(3) protocol.mock_wire_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() self.assertEqual(0, len(protocol.aggregate_status['aggregateStatus']['handlerAggregateStatus']), @@ -333,7 +333,7 @@ def test_it_should_report_unregistered_version_error_per_extension(self): failing_version = "19.12.1221" protocol.mock_wire_data.set_extensions_config_version(failing_version) protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() self.assertEqual(no_of_extensions, @@ -411,7 +411,7 @@ def test_it_should_only_disable_enabled_extensions_on_update(self): self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, 'ext_conf_mc_update_extensions.xml') protocol.mock_wire_data = WireProtocolData(self.test_data) protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() new_version = "1.1.0" new_first_ext = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.firstExtension", @@ -460,7 +460,7 @@ def test_it_should_retry_update_sequence_per_extension_if_previous_failed(self): self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, 'ext_conf_mc_update_extensions.xml') protocol.mock_wire_data = WireProtocolData(self.test_data) 
protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() new_version = "1.1.0" _, fail_action = Actions.generate_unique_fail() @@ -529,7 +529,7 @@ def test_it_should_report_disabled_extension_errors_if_update_failed(self): self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, 'ext_conf_mc_update_extensions.xml') protocol.mock_wire_data = WireProtocolData(self.test_data) protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() new_version = "1.1.0" fail_code, fail_action = Actions.generate_unique_fail() @@ -655,7 +655,7 @@ def __assert_state_file(handler_name, handler_version, extensions, state, not_pr self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, 'ext_conf_mc_disabled_extensions.xml') protocol.mock_wire_data = WireProtocolData(self.test_data) protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() ext_handler.run() ext_handler.report_ext_handlers_status() @@ -781,7 +781,7 @@ def mock_popen(cmd, *_, **kwargs): 'ext_conf_mc_update_extensions.xml') protocol.mock_wire_data = WireProtocolData(self.test_data) protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() @@ -961,7 +961,7 @@ def test_it_should_report_status_correctly_for_unsupported_goal_state(self): self.test_data['ext_conf'] = "wire/ext_conf_required_features.xml" protocol.mock_wire_data = WireProtocolData(self.test_data) protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() # Assert the extension status is the same as we reported for Incarnation 1. 
self.__run_and_assert_generic_case(exthandlers_handler, protocol, no_of_extensions=4, with_message=False) @@ -1021,7 +1021,7 @@ def test_it_should_check_every_time_if_handler_supports_mc(self): with self.__setup_generic_test_env() as (exthandlers_handler, protocol, old_exts): protocol.mock_wire_data.set_incarnation(2) - protocol.update_goal_state() + protocol.client.update_goal_state() # Mock manifest to not support multiple extensions with patch('azurelinuxagent.ga.exthandlers.HandlerManifest.supports_multiple_extensions', return_value=False): diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 695b6d578..e5f15fbd0 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -19,8 +19,7 @@ import zipfile from datetime import datetime, timedelta -from threading import currentThread -from azurelinuxagent.common.protocol.imds import ComputeInfo +from threading import current_thread from tests.common.osutil.test_default import TestOSUtil import azurelinuxagent.common.osutil.default as osutil @@ -28,17 +27,16 @@ from azurelinuxagent.common import conf from azurelinuxagent.common.event import EVENTS_DIRECTORY, WALAEventOperation -from azurelinuxagent.common.exception import ProtocolError, UpdateError, ResourceGoneError, HttpError +from azurelinuxagent.common.exception import ProtocolError, UpdateError, HttpError, \ + ExitException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr, httpclient from azurelinuxagent.common.persist_firewall_rules import PersistFirewallRulesHandler -from azurelinuxagent.common.protocol.hostplugin import URI_FORMAT_GET_API_VERSIONS, HOST_PLUGIN_PORT, \ - URI_FORMAT_GET_EXTENSION_ARTIFACT, HostPluginProtocol +from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol from azurelinuxagent.common.protocol.restapi import VMAgentFamily, \ ExtHandlerPackage, ExtHandlerPackageList, Extension, VMStatus, ExtHandlerStatus, ExtensionStatus, \ VMAgentUpdateStatuses from azurelinuxagent.common.protocol.util import ProtocolUtil -from azurelinuxagent.common.protocol.wire import WireProtocol -from azurelinuxagent.common.utils import fileutil, restutil, textutil, timeutil +from azurelinuxagent.common.utils import fileutil, textutil, timeutil from azurelinuxagent.common.utils.archive import ARCHIVE_DIRECTORY_NAME, AGENT_STATUS_FILE from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.utils.networkutil import FirewallCmdDirectCommands, AddFirewallRules @@ -54,7 +52,7 @@ from tests.protocol.mocks import mock_wire_protocol, MockHttpResponse from tests.protocol.mockwiredata import DATA_FILE, DATA_FILE_MULTIPLE_EXT, DATA_FILE_VM_SETTINGS from tests.tools import AgentTestCase, AgentTestCaseWithGetVmSizeMock, data_dir, DEFAULT, patch, load_bin_data, Mock, MagicMock, \ - clear_singleton_instances + clear_singleton_instances, is_python_version_26_or_34, skip_if_predicate_true from tests.protocol import mockwiredata from tests.protocol.HttpRequestPredicates import HttpRequestPredicates @@ -194,7 +192,7 @@ def rename_agent_bin(self, path, dst_v): shutil.move(src_bin, dst_bin) def agents(self): - return [GuestAgent(is_fast_track_goal_state=False, path=path) for path in self.agent_dirs()] + return [GuestAgent.from_installed_agent(path) for path in self.agent_dirs()] def agent_count(self): return len(self.agent_dirs()) @@ -313,7 +311,7 @@ def replicate_agents(self, shutil.copytree(from_path, to_path) self.rename_agent_bin(to_path, dst_v) if not is_available: - 
GuestAgent(is_fast_track_goal_state=False, path=to_path).mark_failure(is_fatal=True) + GuestAgent.from_installed_agent(to_path).mark_failure(is_fatal=True) return dst_v @@ -405,13 +403,15 @@ def setUp(self): self.agent_path = os.path.join(self.tmp_dir, self._get_agent_name()) def test_creation(self): - self.assertRaises(UpdateError, GuestAgent, "A very bad file name") - n = "{0}-a.bad.version".format(AGENT_NAME) - self.assertRaises(UpdateError, GuestAgent, n) + with self.assertRaises(UpdateError): + GuestAgent.from_installed_agent("A very bad file name") + + with self.assertRaises(UpdateError): + GuestAgent.from_installed_agent("{0}-a.bad.version".format(AGENT_NAME)) self.expand_agents() - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) + agent = GuestAgent.from_installed_agent(self.agent_path) self.assertNotEqual(None, agent) self.assertEqual(self._get_agent_name(), agent.name) self.assertEqual(self._get_agent_version(), agent.version) @@ -432,11 +432,10 @@ def test_creation(self): self.assertFalse(agent.is_blacklisted) self.assertTrue(agent.is_available) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - def test_clear_error(self, mock_downloaded): # pylint: disable=unused-argument + def test_clear_error(self): self.expand_agents() - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) + agent = GuestAgent.from_installed_agent(self.agent_path) agent.mark_failure(is_fatal=True) self.assertTrue(agent.error.last_failure > 0.0) @@ -450,25 +449,19 @@ def test_clear_error(self, mock_downloaded): # pylint: disable=unused-argument self.assertFalse(agent.is_blacklisted) self.assertEqual(agent.is_blacklisted, agent.error.is_blacklisted) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_is_available(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) + def test_is_available(self): + self.expand_agents() - self.assertFalse(agent.is_available) - agent._unpack() - self.assertTrue(agent.is_available) + agent = GuestAgent.from_installed_agent(self.agent_path) + self.assertTrue(agent.is_available) agent.mark_failure(is_fatal=True) self.assertFalse(agent.is_available) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_is_blacklisted(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - self.assertFalse(agent.is_blacklisted) + def test_is_blacklisted(self): + self.expand_agents() - agent._unpack() + agent = GuestAgent.from_installed_agent(self.agent_path) self.assertFalse(agent.is_blacklisted) self.assertEqual(agent.is_blacklisted, agent.error.is_blacklisted) @@ -476,42 +469,13 @@ def test_is_blacklisted(self, mock_loaded, mock_downloaded): # pylint: disable= self.assertTrue(agent.is_blacklisted) self.assertEqual(agent.is_blacklisted, agent.error.is_blacklisted) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_resource_gone_error_not_blacklisted(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - try: - mock_downloaded.side_effect = ResourceGoneError() - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - 
self.assertFalse(agent.is_blacklisted) - except ResourceGoneError: - pass - except: # pylint: disable=bare-except - self.fail("Exception was not expected!") - - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_ioerror_not_blacklisted(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - try: - mock_downloaded.side_effect = IOError() - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - self.assertFalse(agent.is_blacklisted) - except IOError: - pass - except: # pylint: disable=bare-except - self.fail("Exception was not expected!") - - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_is_downloaded(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - self.assertFalse(agent.is_downloaded) - agent._unpack() + def test_is_downloaded(self): + self.expand_agents() + agent = GuestAgent.from_installed_agent(self.agent_path) self.assertTrue(agent.is_downloaded) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_mark_failure(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) + def test_mark_failure(self): + agent = GuestAgent.from_installed_agent(self.agent_path) agent.mark_failure() self.assertEqual(1, agent.error.failure_count) @@ -520,59 +484,31 @@ def test_mark_failure(self, mock_loaded, mock_downloaded): # pylint: disable=un self.assertEqual(2, agent.error.failure_count) self.assertTrue(agent.is_blacklisted) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_unpack(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - self.assertFalse(os.path.isdir(agent.get_agent_dir())) - agent._unpack() - self.assertTrue(os.path.isdir(agent.get_agent_dir())) - self.assertTrue(os.path.isfile(agent.get_agent_manifest_path())) - - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_unpack_fail(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - self.assertFalse(os.path.isdir(agent.get_agent_dir())) - os.remove(agent.get_agent_pkg_path()) - self.assertRaises(UpdateError, agent._unpack) - - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_load_manifest(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - agent._unpack() + def test_load_manifest(self): + self.expand_agents() + agent = GuestAgent.from_installed_agent(self.agent_path) agent._load_manifest() self.assertEqual(agent.manifest.get_enable_command(), agent.get_agent_cmd()) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_load_manifest_missing(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent 
= GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - self.assertFalse(os.path.isdir(agent.get_agent_dir())) - agent._unpack() + def test_load_manifest_missing(self): + self.expand_agents() + agent = GuestAgent.from_installed_agent(self.agent_path) os.remove(agent.get_agent_manifest_path()) self.assertRaises(UpdateError, agent._load_manifest) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_load_manifest_is_empty(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - self.assertFalse(os.path.isdir(agent.get_agent_dir())) - agent._unpack() + def test_load_manifest_is_empty(self): + self.expand_agents() + agent = GuestAgent.from_installed_agent(self.agent_path) self.assertTrue(os.path.isfile(agent.get_agent_manifest_path())) with open(agent.get_agent_manifest_path(), "w") as file: # pylint: disable=redefined-builtin json.dump(EMPTY_MANIFEST, file) self.assertRaises(UpdateError, agent._load_manifest) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - def test_load_manifest_is_malformed(self, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) - self.assertFalse(os.path.isdir(agent.get_agent_dir())) - agent._unpack() + def test_load_manifest_is_malformed(self): + self.expand_agents() + agent = GuestAgent.from_installed_agent(self.agent_path) self.assertTrue(os.path.isfile(agent.get_agent_manifest_path())) with open(agent.get_agent_manifest_path(), "w") as file: # pylint: disable=redefined-builtin @@ -580,165 +516,84 @@ def test_load_manifest_is_malformed(self, mock_loaded, mock_downloaded): # pyli self.assertRaises(UpdateError, agent._load_manifest) def test_load_error(self): - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) + agent = GuestAgent.from_installed_agent(self.agent_path) agent.error = None agent._load_error() self.assertTrue(agent.error is not None) - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - @patch("azurelinuxagent.ga.update.restutil.http_get") - def test_download(self, mock_http_get, mock_loaded, mock_downloaded): # pylint: disable=unused-argument + def test_download(self): self.remove_agents() self.assertFalse(os.path.isdir(self.agent_path)) - agent_pkg = load_bin_data(self._get_agent_file_name(), self._agent_zip_dir) - mock_http_get.return_value = ResponseMock(response=agent_pkg) - - pkg = ExtHandlerPackage(version=str(self._get_agent_version())) - pkg.uris.append(None) - agent = GuestAgent(is_fast_track_goal_state=False, pkg=pkg) - agent._download() - - self.assertTrue(os.path.isfile(agent.get_agent_pkg_path())) - - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - @patch("azurelinuxagent.ga.update.restutil.http_get") - def test_download_fail(self, mock_http_get, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - self.remove_agents() - self.assertFalse(os.path.isdir(self.agent_path)) + agent_uri = 'https://foo.blob.core.windows.net/bar/OSTCExtensions.WALinuxAgent__1.0.0' - mock_http_get.return_value = ResponseMock(status=restutil.httpclient.SERVICE_UNAVAILABLE) + def http_get_handler(uri, *_, **__): + 
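# serve the agent package for the direct-channel URI; returning None lets other requests fall through to the default mock handlers +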
if uri == agent_uri: + response = load_bin_data(self._get_agent_file_name(), self._agent_zip_dir) + return MockHttpResponse(status=httpclient.OK, body=response) + return None pkg = ExtHandlerPackage(version=str(self._get_agent_version())) - pkg.uris.append(None) - agent = GuestAgent(is_fast_track_goal_state=False, pkg=pkg) - - self.assertRaises(UpdateError, agent._download) - self.assertFalse(os.path.isfile(agent.get_agent_pkg_path())) - self.assertFalse(agent.is_downloaded) - - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") - @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") - @patch("azurelinuxagent.ga.update.restutil.http_get") - @patch("azurelinuxagent.ga.update.restutil.http_post") - def test_download_fallback(self, mock_http_post, mock_http_get, mock_loaded, mock_downloaded): # pylint: disable=unused-argument - self.remove_agents() - self.assertFalse(os.path.isdir(self.agent_path)) + pkg.uris.append(agent_uri) - mock_http_get.return_value = ResponseMock( - status=restutil.httpclient.SERVICE_UNAVAILABLE, - response="") + with mock_wire_protocol(mockwiredata.DATA_FILE) as protocol: + protocol.set_http_handlers(http_get_handler=http_get_handler) + agent = GuestAgent.from_agent_package(pkg, protocol, False) - ext_uri = 'ext_uri' - host_uri = 'host_uri' - api_uri = URI_FORMAT_GET_API_VERSIONS.format(host_uri, HOST_PLUGIN_PORT) - art_uri = URI_FORMAT_GET_EXTENSION_ARTIFACT.format(host_uri, HOST_PLUGIN_PORT) - mock_host = HostPluginProtocol(host_uri) + self.assertTrue(os.path.isdir(agent.get_agent_dir())) + self.assertTrue(agent.is_downloaded) - pkg = ExtHandlerPackage(version=str(self._get_agent_version())) - pkg.uris.append(ext_uri) - agent = GuestAgent(is_fast_track_goal_state=False, pkg=pkg) - agent.host = mock_host - - # ensure fallback fails gracefully, no http - self.assertRaises(UpdateError, agent._download) - self.assertEqual(mock_http_get.call_count, 2) - self.assertEqual(mock_http_get.call_args_list[0][0][0], ext_uri) - self.assertEqual(mock_http_get.call_args_list[1][0][0], api_uri) - - # ensure fallback fails gracefully, artifact api failure - with patch.object(HostPluginProtocol, - "ensure_initialized", - return_value=True): - self.assertRaises(UpdateError, agent._download) - self.assertEqual(mock_http_get.call_count, 4) - - self.assertEqual(mock_http_get.call_args_list[2][0][0], ext_uri) - - self.assertEqual(mock_http_get.call_args_list[3][0][0], art_uri) - a, k = mock_http_get.call_args_list[3] # pylint: disable=unused-variable - self.assertEqual(False, k['use_proxy']) - - # ensure fallback works as expected - with patch.object(HostPluginProtocol, - "get_artifact_request", - return_value=[art_uri, {}]): - self.assertRaises(UpdateError, agent._download) - self.assertEqual(mock_http_get.call_count, 6) - - a, k = mock_http_get.call_args_list[3] - self.assertEqual(False, k['use_proxy']) - - self.assertEqual(mock_http_get.call_args_list[4][0][0], ext_uri) - a, k = mock_http_get.call_args_list[4] - - self.assertEqual(mock_http_get.call_args_list[5][0][0], art_uri) - a, k = mock_http_get.call_args_list[5] - self.assertEqual(False, k['use_proxy']) - - @patch("azurelinuxagent.ga.update.restutil.http_get") - def test_ensure_downloaded(self, mock_http_get): + def test_download_fail(self): self.remove_agents() self.assertFalse(os.path.isdir(self.agent_path)) - agent_pkg = load_bin_data(self._get_agent_file_name(), self._agent_zip_dir) - mock_http_get.return_value = ResponseMock(response=agent_pkg) + agent_uri = 
'https://foo.blob.core.windows.net/bar/OSTCExtensions.WALinuxAgent__1.0.0' - pkg = ExtHandlerPackage(version=str(self._get_agent_version())) - pkg.uris.append(None) - agent = GuestAgent(is_fast_track_goal_state=False, pkg=pkg) + def http_get_handler(uri, *_, **__): + if uri in (agent_uri, 'http://168.63.129.16:32526/extensionArtifact'): + return MockHttpResponse(status=httpclient.SERVICE_UNAVAILABLE) + return None - self.assertTrue(os.path.isfile(agent.get_agent_manifest_path())) - self.assertTrue(agent.is_downloaded) + agent_version = self._get_agent_version() + pkg = ExtHandlerPackage(version=str(agent_version)) + pkg.uris.append(agent_uri) - @patch("azurelinuxagent.ga.update.GuestAgent._download", side_effect=UpdateError) - def test_ensure_failure_in_download_cleans_up_filesystem(self, _): - self.remove_agents() - self.assertFalse(os.path.isdir(self.agent_path)) + with mock_wire_protocol(mockwiredata.DATA_FILE) as protocol: + protocol.set_http_handlers(http_get_handler=http_get_handler) + with patch("azurelinuxagent.ga.update.add_event") as add_event: + agent = GuestAgent.from_agent_package(pkg, protocol, False) - pkg = ExtHandlerPackage(version=str(self._get_agent_version())) - pkg.uris.append(None) - agent = GuestAgent(is_fast_track_goal_state=False, pkg=pkg) + self.assertFalse(os.path.isfile(self.agent_path)) - self.assertFalse(agent.is_blacklisted, "The agent should not be blacklisted if unable to unpack/download") - self.assertFalse(os.path.exists(agent.get_agent_dir()), "Agent directory should be cleaned up") - self.assertFalse(os.path.exists(agent.get_agent_pkg_path()), "Agent package should be cleaned up") + messages = [kwargs['message'] for _, kwargs in add_event.call_args_list if kwargs['op'] == 'Install' and kwargs['is_success'] is False] + self.assertEqual(1, len(messages), "Expected exactly 1 install error. Got: {0}".format(add_event.call_args_list)) + self.assertIn(str.format('[UpdateError] Unable to download Agent WALinuxAgent-{0}', agent_version), messages[0], "The install error does not include the expected message") - @patch("azurelinuxagent.ga.update.GuestAgent._download") - @patch("azurelinuxagent.ga.update.GuestAgent._unpack", side_effect=UpdateError) - def test_ensure_downloaded_unpack_failure_cleans_file_system(self, *_): - self.assertFalse(os.path.isdir(self.agent_path)) + self.assertFalse(agent.is_blacklisted, "Download failures should not blacklist the Agent") - pkg = ExtHandlerPackage(version=str(self._get_agent_version())) - pkg.uris.append(None) - agent = GuestAgent(is_fast_track_goal_state=False, pkg=pkg) + def test_invalid_agent_package_does_not_blacklist_the_agent(self): + agent_uri = 'https://foo.blob.core.windows.net/bar/OSTCExtensions.WALinuxAgent__9.9.9.9' - self.assertFalse(agent.is_blacklisted, "The agent should not be blacklisted if unable to unpack/download") - self.assertFalse(os.path.exists(agent.get_agent_dir()), "Agent directory should be cleaned up") - self.assertFalse(os.path.exists(agent.get_agent_pkg_path()), "Agent package should be cleaned up") + def http_get_handler(uri, *_, **__): + if uri in (agent_uri, 'http://168.63.129.16:32526/extensionArtifact'): + response = load_bin_data("ga/WALinuxAgent-9.9.9.9-no_manifest.zip") + return MockHttpResponse(status=httpclient.OK, body=response) + return None - @patch("azurelinuxagent.ga.update.GuestAgent._download") - @patch("azurelinuxagent.ga.update.GuestAgent._unpack") - @patch("azurelinuxagent.ga.update.GuestAgent._load_manifest", side_effect=UpdateError) - def 
test_ensure_downloaded_load_manifest_cleans_up_agent_directories(self, *_): - self.assertFalse(os.path.isdir(self.agent_path)) + pkg = ExtHandlerPackage(version="9.9.9.9") + pkg.uris.append(agent_uri) - pkg = ExtHandlerPackage(version=str(self._get_agent_version())) - pkg.uris.append(None) - agent = GuestAgent(is_fast_track_goal_state=False, pkg=pkg) + with mock_wire_protocol(mockwiredata.DATA_FILE) as protocol: + protocol.set_http_handlers(http_get_handler=http_get_handler) + agent = GuestAgent.from_agent_package(pkg, protocol, False) self.assertFalse(agent.is_blacklisted, "The agent should not be blacklisted if unable to unpack/download") self.assertFalse(os.path.exists(agent.get_agent_dir()), "Agent directory should be cleaned up") - self.assertFalse(os.path.exists(agent.get_agent_pkg_path()), "Agent package should be cleaned up") @patch("azurelinuxagent.ga.update.GuestAgent._download") - @patch("azurelinuxagent.ga.update.GuestAgent._unpack") - @patch("azurelinuxagent.ga.update.GuestAgent._load_manifest") - def test_ensure_download_skips_blacklisted(self, mock_manifest, mock_unpack, mock_download): # pylint: disable=unused-argument - agent = GuestAgent(is_fast_track_goal_state=False, path=self.agent_path) + def test_ensure_download_skips_blacklisted(self, mock_download): + agent = GuestAgent.from_installed_agent(self.agent_path) self.assertEqual(0, mock_download.call_count) agent.clear_error() @@ -747,13 +602,13 @@ def test_ensure_download_skips_blacklisted(self, mock_manifest, mock_unpack, moc pkg = ExtHandlerPackage(version=str(self._get_agent_version())) pkg.uris.append(None) - agent = GuestAgent(is_fast_track_goal_state=False, pkg=pkg) + # _download is mocked so there will be no http request; passing a None protocol + agent = GuestAgent.from_agent_package(pkg, None, False) self.assertEqual(1, agent.error.failure_count) self.assertTrue(agent.error.was_fatal) self.assertTrue(agent.is_blacklisted) self.assertEqual(0, mock_download.call_count) - self.assertEqual(0, mock_unpack.call_count) class TestUpdate(UpdateTestCase): @@ -954,7 +809,7 @@ def test_evaluate_agent_health_resets_with_new_agent(self): def test_filter_blacklisted_agents(self): self.prepare_agents() - self.update_handler._set_and_sort_agents([GuestAgent(is_fast_track_goal_state=False, path=path) for path in self.agent_dirs()]) + self.update_handler._set_and_sort_agents([GuestAgent.from_installed_agent(path) for path in self.agent_dirs()]) self.assertEqual(len(self.agent_dirs()), len(self.update_handler.agents)) kept_agents = self.update_handler.agents[::2] @@ -989,15 +844,6 @@ def test_find_agents_sorts(self): self.assertTrue(v > a.version) v = a.version - @patch('azurelinuxagent.common.protocol.wire.WireClient.get_host_plugin') - def test_get_host_plugin_returns_host_for_wireserver(self, mock_get_host): - protocol = WireProtocol('12.34.56.78') - mock_get_host.return_value = "faux host" - host = self.update_handler._get_host_plugin(protocol=protocol) - print("mock_get_host call cound={0}".format(mock_get_host.call_count)) - self.assertEqual(1, mock_get_host.call_count) - self.assertEqual("faux host", host) - def test_get_latest_agent(self): latest_version = self.prepare_agents() @@ -1025,7 +871,7 @@ def test_get_latest_agent_skips_unavailable(self): latest_version = self.prepare_agents(count=self.agent_count() + 1, is_available=False) latest_path = os.path.join(self.tmp_dir, "{0}-{1}".format(AGENT_NAME, latest_version)) - self.assertFalse(GuestAgent(is_fast_track_goal_state=False, path=latest_path).is_available) + 
self.assertFalse(GuestAgent.from_installed_agent(latest_path).is_available) latest_agent = self.update_handler.get_latest_agent_greater_than_daemon() self.assertTrue(latest_agent.version < latest_version) @@ -1300,14 +1146,14 @@ def test_get_latest_agent_should_return_latest_agent_even_on_bad_error_json(self def test_set_agents_sets_agents(self): self.prepare_agents() - self.update_handler._set_and_sort_agents([GuestAgent(is_fast_track_goal_state=False, path=path) for path in self.agent_dirs()]) + self.update_handler._set_and_sort_agents([GuestAgent.from_installed_agent(path) for path in self.agent_dirs()]) self.assertTrue(len(self.update_handler.agents) > 0) self.assertEqual(len(self.agent_dirs()), len(self.update_handler.agents)) def test_set_agents_sorts_agents(self): self.prepare_agents() - self.update_handler._set_and_sort_agents([GuestAgent(is_fast_track_goal_state=False, path=path) for path in self.agent_dirs()]) + self.update_handler._set_and_sort_agents([GuestAgent.from_installed_agent(path) for path in self.agent_dirs()]) v = FlexibleVersion("100000") for a in self.update_handler.agents: @@ -1461,7 +1307,7 @@ def _get_test_ext_handler_instance(protocol, name="OSTCExtensions.ExampleHandler eh = Extension(name=name) eh.version = version return ExtHandlerInstance(eh, protocol) - + def test_update_handler_recovers_from_error_with_no_certs(self): data = DATA_FILE.copy() data['goal_state'] = 'wire/goal_state_no_certs.xml' @@ -1489,7 +1335,7 @@ def match_unexpected_errors(): for (args, _) in filter(lambda a: len(a) > 0, patched_error.call_args_list): if unexpected_msg_fragment in args[0]: matching_errors.append(args[0]) - + if len(matching_errors) > 1: self.fail("Guest Agent did not recover, with new error(s): {}"\ .format(matching_errors[1:])) @@ -1564,6 +1410,7 @@ def _mock_popen(cmd, *args, **kwargs): "Not setting up persistent firewall rules as OS.EnableFirewall=False" == args[0] for (args, _) in patch_info.call_args_list), "Info not logged properly, got: {0}".format(patch_info.call_args_list)) + @skip_if_predicate_true(is_python_version_26_or_34, "Disabled on Python 2.6 and 3.4 for now. 
Need to revisit to fix it") def test_it_should_setup_persistent_firewall_rules_on_startup(self): iterations = 1 executed_commands = [] @@ -1984,7 +1831,7 @@ def get_handler(url, **kwargs): if HttpRequestPredicates.is_agent_package_request(url): agent_pkg = load_bin_data(self._get_agent_file_name(), self._agent_zip_dir) protocol.mock_wire_data.call_counts['agentArtifact'] += 1 - return ResponseMock(response=agent_pkg) + return MockHttpResponse(status=httpclient.OK, body=agent_pkg) return protocol.mock_wire_data.mock_http_get(url, **kwargs) def put_handler(url, *args, **_): @@ -2418,7 +2265,7 @@ class MonitorThreadTest(AgentTestCaseWithGetVmSizeMock): def setUp(self): super(MonitorThreadTest, self).setUp() self.event_patch = patch('azurelinuxagent.common.event.add_event') - currentThread().setName("ExtHandler") + current_thread().name = "ExtHandler" protocol = Mock() self.update_handler = get_update_handler() self.update_handler.protocol_util = Mock() @@ -2588,17 +2435,6 @@ def update_goal_state(self): self.call_counts["update_goal_state"] += 1 -class ResponseMock(Mock): - def __init__(self, status=restutil.httpclient.OK, response=None, reason=None): - Mock.__init__(self) - self.status = status - self.reason = reason - self.response = response - - def read(self): - return self.response - - class TimeMock(Mock): def __init__(self, time_increment=1): Mock.__init__(self) @@ -2894,7 +2730,7 @@ def vm_settings_not_supported(url, *_, **__): if HttpRequestPredicates.is_host_plugin_vm_settings_request(url): return MockHttpResponse(404) return None - + with mock_wire_protocol(data) as protocol: def mock_live_migration(iteration): @@ -2904,7 +2740,7 @@ def mock_live_migration(iteration): elif iteration == 2: protocol.mock_wire_data.set_incarnation(3) protocol.set_http_handlers(http_get_handler=vm_settings_not_supported) - + with mock_update_handler(protocol, 3, on_new_iteration=mock_live_migration) as update_handler: with patch("azurelinuxagent.ga.update.logger.error") as patched_error: def check_for_errors(): @@ -2916,7 +2752,7 @@ def check_for_errors(): update_handler.run(debug=True) check_for_errors() - + timestamp = protocol.client.get_host_plugin()._fast_track_timestamp self.assertEqual(timestamp, timeutil.create_timestamp(datetime.min), "Expected fast track time stamp to be set to {0}, got {1}".format(datetime.min, timestamp)) @@ -2926,57 +2762,56 @@ class HeartbeatTestCase(AgentTestCase): @patch("azurelinuxagent.common.logger.info") @patch("azurelinuxagent.ga.update.add_event") def test_telemetry_heartbeat_creates_event(self, patch_add_event, patch_info, *_): - + with mock_wire_protocol(mockwiredata.DATA_FILE) as mock_protocol: update_handler = get_update_handler() - + update_handler.last_telemetry_heartbeat = datetime.utcnow() - timedelta(hours=1) update_handler._send_heartbeat_telemetry(mock_protocol) self.assertEqual(1, patch_add_event.call_count) self.assertTrue(any(call_args[0] == "[HEARTBEAT] Agent {0} is running as the goal state agent {1}" for call_args in patch_info.call_args), "The heartbeat was not written to the agent's log") - - @patch("azurelinuxagent.ga.update.add_event") - @patch("azurelinuxagent.common.protocol.imds.ImdsClient") - def test_telemetry_heartbeat_retries_failed_vm_size_fetch(self, mock_imds_factory, patch_add_event, *_): - - def validate_single_heartbeat_event_matches_vm_size(vm_size): - heartbeat_event_kwargs = [ - kwargs for _, kwargs in patch_add_event.call_args_list - if kwargs.get('op', None) == WALAEventOperation.HeartBeat - ] - - self.assertEqual(1, 
len(heartbeat_event_kwargs), "Expected exactly one HeartBeat event, got {0}"\ - .format(heartbeat_event_kwargs)) - - telemetry_message = heartbeat_event_kwargs[0].get("message", "") - self.assertTrue(telemetry_message.endswith(vm_size), - "Expected HeartBeat message ('{0}') to end with the test vmSize value, {1}."\ - .format(telemetry_message, vm_size)) - - with mock_wire_protocol(mockwiredata.DATA_FILE) as mock_protocol: - update_handler = get_update_handler() - update_handler.protocol_util.get_protocol = Mock(return_value=mock_protocol) - # Zero out the _vm_size parameter for test resiliency - update_handler._vm_size = None - - mock_imds_client = mock_imds_factory.return_value = Mock() - - # First force a vmSize retrieval failure - mock_imds_client.get_compute.side_effect = HttpError(msg="HTTP Test Failure") - update_handler._last_telemetry_heartbeat = datetime.utcnow() - timedelta(hours=1) - update_handler._send_heartbeat_telemetry(mock_protocol) - validate_single_heartbeat_event_matches_vm_size("unknown") - patch_add_event.reset_mock() +class AgentMemoryCheckTestCase(AgentTestCase): - # Now provide a vmSize - mock_imds_client.get_compute = lambda: ComputeInfo(vmSize="TestVmSizeValue") - update_handler._last_telemetry_heartbeat = datetime.utcnow() - timedelta(hours=1) - update_handler._send_heartbeat_telemetry(mock_protocol) - - validate_single_heartbeat_event_matches_vm_size("TestVmSizeValue") + @patch("azurelinuxagent.common.logger.info") + @patch("azurelinuxagent.ga.update.add_event") + def test_check_agent_memory_usage_raises_exit_exception(self, patch_add_event, patch_info, *_): + with patch("azurelinuxagent.common.cgroupconfigurator.CGroupConfigurator._Impl.check_agent_memory_usage", side_effect=AgentMemoryExceededException()): + with patch('azurelinuxagent.common.conf.get_enable_agent_memory_usage_check', return_value=True): + with self.assertRaises(ExitException) as context_manager: + update_handler = get_update_handler() + + update_handler._check_agent_memory_usage() + self.assertEqual(1, patch_add_event.call_count) + self.assertTrue(any("Check on agent memory usage" in call_args[0] + for call_args, _ in patch_info.call_args_list), + "The memory check was not written to the agent's log") + self.assertIn("Agent {0} is reached memory limit -- exiting".format(CURRENT_AGENT), + ustr(context_manager.exception), "An incorrect exception was raised") + + @patch("azurelinuxagent.common.logger.warn") + @patch("azurelinuxagent.ga.update.add_event") + def test_check_agent_memory_usage_fails(self, patch_add_event, patch_warn, *_): + with patch("azurelinuxagent.common.cgroupconfigurator.CGroupConfigurator._Impl.check_agent_memory_usage", side_effect=Exception()): + with patch('azurelinuxagent.common.conf.get_enable_agent_memory_usage_check', return_value=True): + update_handler = get_update_handler() + + update_handler._check_agent_memory_usage() + self.assertTrue(any("Error checking the agent's memory usage" in call_args[0] + for call_args, _ in patch_warn.call_args_list), + "The memory check error was not written to the agent's log") + self.assertEqual(1, patch_add_event.call_count) + add_events = [kwargs for _, kwargs in patch_add_event.call_args_list if + kwargs["op"] == WALAEventOperation.AgentMemory] + self.assertTrue( + len(add_events) == 1, + "Exactly 1 event should have been emitted when memory usage check fails. 
Got: {0}".format(add_events)) + self.assertIn( + "Error checking the agent's memory usage", + add_events[0]["message"], + "The error message is not correct when memory usage check failed") class GoalStateIntervalTestCase(AgentTestCase): diff --git a/tests/protocol/mockwiredata.py b/tests/protocol/mockwiredata.py index 7ec311af4..196ed32db 100644 --- a/tests/protocol/mockwiredata.py +++ b/tests/protocol/mockwiredata.py @@ -165,6 +165,7 @@ def __init__(self, data_files=None): self.in_vm_artifacts_profile = None self.vm_settings = None self.etag = None + self.prev_etag = None self.imds_info = None self.reload() @@ -242,9 +243,12 @@ def mock_http_get(self, url, *_, **kwargs): elif "/vmSettings" in url: if self.vm_settings is None: resp.status = httpclient.NOT_FOUND + elif self.call_counts["vm_settings"] > 0 and self.prev_etag == self.etag: + resp.status = httpclient.NOT_MODIFIED else: content = self.vm_settings response_headers = [('ETag', self.etag)] + self.prev_etag = self.etag self.call_counts["vm_settings"] += 1 elif '{0}/metadata/compute'.format(IMDS_ENDPOINT) in url: content = json.dumps(self.imds_info.get("compute", "{}")) diff --git a/tests/protocol/test_extensions_goal_state_from_vm_settings.py b/tests/protocol/test_extensions_goal_state_from_vm_settings.py index fb97a075f..1100b05bf 100644 --- a/tests/protocol/test_extensions_goal_state_from_vm_settings.py +++ b/tests/protocol/test_extensions_goal_state_from_vm_settings.py @@ -58,6 +58,7 @@ def test_it_should_parse_requested_version_properly(self): data_file = mockwiredata.DATA_FILE_VM_SETTINGS.copy() data_file["vm_settings"] = "hostgaplugin/vm_settings-requested_version.json" with mock_wire_protocol(data_file) as protocol: + protocol.mock_wire_data.set_etag(888) goal_state = GoalState(protocol.client) families = goal_state.extensions_goal_state.agent_families for family in families: diff --git a/tests/protocol/test_goal_state.py b/tests/protocol/test_goal_state.py index d853363c7..61653b2af 100644 --- a/tests/protocol/test_goal_state.py +++ b/tests/protocol/test_goal_state.py @@ -14,7 +14,8 @@ from azurelinuxagent.common.protocol.extensions_goal_state_from_extensions_config import ExtensionsGoalStateFromExtensionsConfig from azurelinuxagent.common.protocol.extensions_goal_state_from_vm_settings import ExtensionsGoalStateFromVmSettings from azurelinuxagent.common.protocol import hostplugin -from azurelinuxagent.common.protocol.goal_state import GoalState, GoalStateInconsistentError, _GET_GOAL_STATE_MAX_ATTEMPTS +from azurelinuxagent.common.protocol.goal_state import GoalState, GoalStateInconsistentError, \ + _GET_GOAL_STATE_MAX_ATTEMPTS, GoalStateProperties from azurelinuxagent.common.exception import ProtocolError from azurelinuxagent.common.utils import fileutil from azurelinuxagent.common.utils.archive import ARCHIVE_DIRECTORY_NAME @@ -27,6 +28,7 @@ class GoalStateTestCase(AgentTestCase, HttpRequestPredicates): def test_it_should_use_vm_settings_by_default(self): with mock_wire_protocol(mockwiredata.DATA_FILE_VM_SETTINGS) as protocol: + protocol.mock_wire_data.set_etag(888) extensions_goal_state = GoalState(protocol.client).extensions_goal_state self.assertTrue( isinstance(extensions_goal_state, ExtensionsGoalStateFromVmSettings), @@ -155,11 +157,12 @@ def http_get_handler(url, *_, **__): protocol.set_http_handlers(http_get_handler=None) goal_state.update() self._assert_directory_contents( - self._find_history_subdirectory("234-987"), ["VmSettings.json"]) + self._find_history_subdirectory("234-987"), ["VmSettings.json", 
"Certificates.json"]) def test_it_should_redact_the_protected_settings_when_saving_to_the_history_directory(self): with mock_wire_protocol(mockwiredata.DATA_FILE_VM_SETTINGS) as protocol: protocol.mock_wire_data.set_incarnation(888) + protocol.mock_wire_data.set_etag(888) goal_state = GoalState(protocol.client) @@ -172,7 +175,7 @@ def test_it_should_redact_the_protected_settings_when_saving_to_the_history_dire if len(protected_settings) == 0: raise Exception("The test goal state does not include any protected settings") - history_directory = self._find_history_subdirectory("888-1") + history_directory = self._find_history_subdirectory("888-888") extensions_config_file = os.path.join(history_directory, "ExtensionsConfig.xml") vm_settings_file = os.path.join(history_directory, "VmSettings.json") for file_name in extensions_config_file, vm_settings_file: @@ -197,7 +200,6 @@ def test_it_should_save_vm_settings_on_parse_errors(self): data_file = mockwiredata.DATA_FILE_VM_SETTINGS.copy() data_file["vm_settings"] = invalid_vm_settings_file protocol.mock_wire_data = mockwiredata.WireProtocolData(data_file) - protocol.mock_wire_data.set_etag(888) with self.assertRaises(ProtocolError): # the parsing error will cause an exception _ = GoalState(protocol.client) @@ -205,6 +207,7 @@ def test_it_should_save_vm_settings_on_parse_errors(self): # Do an extra call to update the goal state; this should save the vmsettings to the history directory # only once (self._find_history_subdirectory asserts 1 single match) time.sleep(0.1) # add a short delay to ensure that a new timestamp would be saved in the history folder + protocol.mock_wire_data.set_etag(888) with self.assertRaises(ProtocolError): _ = GoalState(protocol.client) @@ -374,6 +377,7 @@ def test_it_should_raise_when_the_tenant_certificate_is_missing(self): with mock_wire_protocol(data_file) as protocol: data_file["vm_settings"] = "hostgaplugin/vm_settings-missing_cert.json" protocol.mock_wire_data.reload() + protocol.mock_wire_data.set_etag(888) with self.assertRaises(GoalStateInconsistentError) as context: _ = GoalState(protocol.client) @@ -381,6 +385,55 @@ def test_it_should_raise_when_the_tenant_certificate_is_missing(self): expected_message = "Certificate 59A10F50FFE2A0408D3F03FE336C8FD5716CF25C needed by Microsoft.OSTCExtensions.VMAccessForLinux is missing from the goal state" self.assertIn(expected_message, str(context.exception)) + def test_it_should_download_certs_on_a_new_fast_track_goal_state(self): + data_file = mockwiredata.DATA_FILE_VM_SETTINGS.copy() + + with mock_wire_protocol(data_file) as protocol: + goal_state = GoalState(protocol.client) + + cert = "BD447EF71C3ADDF7C837E84D630F3FAC22CCD22F" + crt_path = os.path.join(self.tmp_dir, cert + ".crt") + prv_path = os.path.join(self.tmp_dir, cert + ".prv") + + # Check that crt and prv files are downloaded after processing goal state + self.assertTrue(os.path.isfile(crt_path)) + self.assertTrue(os.path.isfile(prv_path)) + + # Remove .crt file + os.remove(crt_path) + if os.path.isfile(crt_path): + raise Exception("{0}.crt was not removed.".format(cert)) + + # Update goal state and check that .crt was downloaded + protocol.mock_wire_data.set_etag(888) + goal_state.update() + self.assertTrue(os.path.isfile(crt_path)) + + def test_it_should_download_certs_on_a_new_fabric_goal_state(self): + data_file = mockwiredata.DATA_FILE_VM_SETTINGS.copy() + + with mock_wire_protocol(data_file) as protocol: + protocol.mock_wire_data.set_vm_settings_source(GoalStateSource.Fabric) + goal_state = 
GoalState(protocol.client) + + cert = "BD447EF71C3ADDF7C837E84D630F3FAC22CCD22F" + crt_path = os.path.join(self.tmp_dir, cert + ".crt") + prv_path = os.path.join(self.tmp_dir, cert + ".prv") + + # Check that crt and prv files are downloaded after processing goal state + self.assertTrue(os.path.isfile(crt_path)) + self.assertTrue(os.path.isfile(prv_path)) + + # Remove .crt file + os.remove(crt_path) + if os.path.isfile(crt_path): + raise Exception("{0}.crt was not removed.".format(cert)) + + # Update goal state and check that .crt was downloaded + protocol.mock_wire_data.set_incarnation(999) + goal_state.update() + self.assertTrue(os.path.isfile(crt_path)) + def test_it_should_refresh_the_goal_state_when_it_is_inconsistent(self): # # Some scenarios can produce inconsistent goal states. For example, during hibernation/resume, the Fabric goal state changes (the @@ -411,7 +464,7 @@ def http_get_handler(url, *_, **__): goal_state = GoalState(protocol.client) self.assertEqual(2, protocol.mock_wire_data.call_counts['goalstate'], "There should have been exactly 2 requests for the goal state (original + refresh)") - self.assertEqual(2, http_get_handler.certificate_requests, "There should have been exactly 2 requests for the goal state certificates (original + refresh)") + self.assertEqual(4, http_get_handler.certificate_requests, "There should have been exactly 4 requests for the goal state certificates (2x original + 2x refresh)") thumbprints = [c.thumbprint for c in goal_state.certs.cert_list.certificates] @@ -419,3 +472,74 @@ def http_get_handler(url, *_, **__): for settings in extension.settings: if settings.protectedSettings is not None: self.assertIn(settings.certificateThumbprint, thumbprints, "Certificate is missing from the goal state.") + + def test_it_should_raise_when_goal_state_properties_not_initialized(self): + with GoalStateTestCase._create_protocol_ws_and_hgap_in_sync() as protocol: + goal_state = GoalState( + protocol.client, + goal_state_properties=~GoalStateProperties.All) + + goal_state.update() + + with self.assertRaises(ProtocolError) as context: + _ = goal_state.container_id + + expected_message = "ContainerId is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + with self.assertRaises(ProtocolError) as context: + _ = goal_state.role_config_name + + expected_message = "RoleConfig is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + with self.assertRaises(ProtocolError) as context: + _ = goal_state.role_instance_id + + expected_message = "RoleInstanceId is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + with self.assertRaises(ProtocolError) as context: + _ = goal_state.extensions_goal_state + + expected_message = "ExtensionsGoalState is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + with self.assertRaises(ProtocolError) as context: + _ = goal_state.hosting_env + + expected_message = "HostingEnvironment is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + with self.assertRaises(ProtocolError) as context: + _ = goal_state.certs + + expected_message = "Certificates is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + with self.assertRaises(ProtocolError) as context: + _ = goal_state.shared_conf + + expected_message = "SharedConfig is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + 
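# RemoteAccessInfo is gated by the same GoalStateProperties flags as the accessors above +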
with self.assertRaises(ProtocolError) as context: + _ = goal_state.remote_access + + expected_message = "RemoteAccessInfo is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + goal_state = GoalState( + protocol.client, + goal_state_properties=GoalStateProperties.All & ~GoalStateProperties.HostingEnv) + + goal_state.update() + + _ = goal_state.container_id, goal_state.role_instance_id, goal_state.role_config_name, \ + goal_state.extensions_goal_state, goal_state.certs, goal_state.shared_conf, goal_state.remote_access + + with self.assertRaises(ProtocolError) as context: + _ = goal_state.hosting_env + + expected_message = "HostingEnvironment is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index 42d8579d5..47e6871be 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -163,7 +163,7 @@ def create_mock_protocol(): yield protocol @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_versions") - @patch("azurelinuxagent.ga.update.restutil.http_get") + @patch("azurelinuxagent.common.protocol.hostplugin.restutil.http_get") @patch("azurelinuxagent.common.protocol.hostplugin.add_event") def assert_ensure_initialized(self, patch_event, patch_http_get, patch_report_health, response_body, @@ -244,7 +244,7 @@ def test_default_channel(self, patch_put, patch_upload, _): with self.create_mock_protocol() as wire_protocol: wire.HostPluginProtocol.is_default_channel = False - wire_protocol.update_goal_state() + wire_protocol.client.update_goal_state() # act wire_protocol.client.upload_status_blob() @@ -277,7 +277,7 @@ def test_fallback_channel_503(self, patch_put, patch_upload, _): with self.create_mock_protocol() as wire_protocol: wire.HostPluginProtocol.is_default_channel = False - wire_protocol.update_goal_state() + wire_protocol.client.update_goal_state() # act wire_protocol.client.upload_status_blob() @@ -311,7 +311,7 @@ def test_fallback_channel_410(self, patch_refresh_host_plugin, patch_put, patch_ with self.create_mock_protocol() as wire_protocol: wire.HostPluginProtocol.is_default_channel = False - wire_protocol.update_goal_state() + wire_protocol.client.update_goal_state() # act wire_protocol.client.upload_status_blob() @@ -345,7 +345,7 @@ def test_fallback_channel_failure(self, patch_put, patch_upload, _): with self.create_mock_protocol() as wire_protocol: wire.HostPluginProtocol.is_default_channel = False - wire_protocol.update_goal_state() + wire_protocol.client.update_goal_state() # act self.assertRaises(wire.ProtocolError, wire_protocol.client.upload_status_blob) @@ -998,7 +998,7 @@ def test_it_should_save_the_timestamp_of_the_most_recent_fast_track_goal_state(s # A fabric goal state should remove the state file protocol.mock_wire_data.set_vm_settings_source(GoalStateSource.Fabric) - + protocol.mock_wire_data.set_etag(888) _ = host_ga_plugin.fetch_vm_settings() self.assertFalse(os.path.exists(state_file), "{0} was not removed by a Fabric goal state".format(state_file)) diff --git a/tests/protocol/test_imds.py b/tests/protocol/test_imds.py index 167fe2bfb..1f8e428c1 100644 --- a/tests/protocol/test_imds.py +++ b/tests/protocol/test_imds.py @@ -20,18 +20,18 @@ import os import unittest -import azurelinuxagent.common.protocol.imds as imds +from azurelinuxagent.common.protocol import imds from azurelinuxagent.common.datacontract import set_properties from 
azurelinuxagent.common.exception import HttpError, ResourceGoneError from azurelinuxagent.common.future import ustr, httpclient from azurelinuxagent.common.utils import restutil -from tests.ga.test_update import ResponseMock +from tests.protocol.mocks import MockHttpResponse from tests.tools import AgentTestCase, data_dir, MagicMock, Mock, patch def get_mock_compute_response(): - return ResponseMock(response='''{ + return MockHttpResponse(status=httpclient.OK, body='''{ "location": "westcentralus", "name": "unit_test", "offer": "UnitOffer", @@ -52,7 +52,7 @@ def get_mock_compute_response(): class TestImds(AgentTestCase): - @patch("azurelinuxagent.ga.update.restutil.http_get") + @patch("azurelinuxagent.common.protocol.imds.restutil.http_get") def test_get(self, mock_http_get): mock_http_get.return_value = get_mock_compute_response() @@ -67,23 +67,23 @@ def test_get(self, mock_http_get): self.assertTrue('Metadata' in kw_args['headers']) self.assertEqual(True, kw_args['headers']['Metadata']) - @patch("azurelinuxagent.ga.update.restutil.http_get") + @patch("azurelinuxagent.common.protocol.imds.restutil.http_get") def test_get_bad_request(self, mock_http_get): - mock_http_get.return_value = ResponseMock(status=restutil.httpclient.BAD_REQUEST) + mock_http_get.return_value = MockHttpResponse(status=restutil.httpclient.BAD_REQUEST) test_subject = imds.ImdsClient(restutil.KNOWN_WIRESERVER_IP) self.assertRaises(HttpError, test_subject.get_compute) - @patch("azurelinuxagent.ga.update.restutil.http_get") + @patch("azurelinuxagent.common.protocol.imds.restutil.http_get") def test_get_internal_service_error(self, mock_http_get): - mock_http_get.return_value = ResponseMock(status=restutil.httpclient.INTERNAL_SERVER_ERROR) + mock_http_get.return_value = MockHttpResponse(status=restutil.httpclient.INTERNAL_SERVER_ERROR) test_subject = imds.ImdsClient(restutil.KNOWN_WIRESERVER_IP) self.assertRaises(HttpError, test_subject.get_compute) - @patch("azurelinuxagent.ga.update.restutil.http_get") + @patch("azurelinuxagent.common.protocol.imds.restutil.http_get") def test_get_empty_response(self, mock_http_get): - mock_http_get.return_value = ResponseMock(response=''.encode('utf-8')) + mock_http_get.return_value = MockHttpResponse(status=httpclient.OK, body=''.encode('utf-8')) test_subject = imds.ImdsClient(restutil.KNOWN_WIRESERVER_IP) self.assertRaises(ValueError, test_subject.get_compute) @@ -361,9 +361,9 @@ def _imds_response(f): def _assert_validation(self, http_status_code, http_response, expected_valid, expected_response): test_subject = imds.ImdsClient(restutil.KNOWN_WIRESERVER_IP) with patch("azurelinuxagent.common.utils.restutil.http_get") as mock_http_get: - mock_http_get.return_value = ResponseMock(status=http_status_code, + mock_http_get.return_value = MockHttpResponse(status=http_status_code, reason='reason', - response=http_response) + body=http_response) validate_response = test_subject.validate() self.assertEqual(1, mock_http_get.call_count) diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index b9fe23e41..2a36fc291 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -30,6 +30,7 @@ from azurelinuxagent.common.exception import ResourceGoneError, ProtocolError, \ ExtensionDownloadError, HttpError from azurelinuxagent.common.protocol.extensions_goal_state_from_extensions_config import ExtensionsGoalStateFromExtensionsConfig +from azurelinuxagent.common.protocol.goal_state import GoalStateProperties from azurelinuxagent.common.protocol.hostplugin import 
HostPluginProtocol from azurelinuxagent.common.protocol.wire import WireProtocol, WireClient, \ StatusBlob, VMStatus @@ -44,7 +45,7 @@ from tests.protocol.HttpRequestPredicates import HttpRequestPredicates from tests.protocol.mockwiredata import DATA_FILE_NO_EXT, DATA_FILE from tests.protocol.mockwiredata import WireProtocolData -from tests.tools import patch, AgentTestCase +from tests.tools import patch, AgentTestCase, load_bin_data data_with_bom = b'\xef\xbb\xbfhehe' testurl = 'http://foo' @@ -271,23 +272,23 @@ def http_get_handler(url, *_, **kwargs): protocol.set_http_handlers(http_get_handler=http_get_handler) mock_response = MockHttpResponse(200, body=None) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() extensions_on_hold = protocol.get_goal_state().extensions_goal_state.on_hold self.assertFalse(extensions_on_hold, "Extensions should not be on hold when the in-vm artifacts profile response body is None") mock_response = MockHttpResponse(200, ' '.encode('utf-8')) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() extensions_on_hold = protocol.get_goal_state().extensions_goal_state.on_hold self.assertFalse(extensions_on_hold, "Extensions should not be on hold when the in-vm artifacts profile response is an empty string") mock_response = MockHttpResponse(200, '{ }'.encode('utf-8')) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() extensions_on_hold = protocol.get_goal_state().extensions_goal_state.on_hold self.assertFalse(extensions_on_hold, "Extensions should not be on hold when the in-vm artifacts profile response is an empty json object") with patch("azurelinuxagent.common.protocol.extensions_goal_state_from_extensions_config.add_event") as add_event: mock_response = MockHttpResponse(200, 'invalid json'.encode('utf-8')) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() extensions_on_hold = protocol.get_goal_state().extensions_goal_state.on_hold self.assertFalse(extensions_on_hold, "Extensions should not be on hold when the in-vm artifacts profile response is not valid json") @@ -497,13 +498,30 @@ def test_get_ext_conf_with_extensions_should_retrieve_ext_handlers_and_vmagent_m self.assertFalse(extensions_goal_state.on_hold, "Extensions On Hold is expected to be False") - def test_download_ext_handler_pkg_should_not_invoke_host_channel_when_direct_channel_succeeds(self): + def test_download_zip_package_should_expand_and_delete_the_package(self): extension_url = 'https://fake_host/fake_extension.zip' target_file = os.path.join(self.tmp_dir, 'fake_extension.zip') + target_directory = os.path.join(self.tmp_dir, "fake_extension") + + def http_get_handler(url, *_, **__): + if url == extension_url or self.is_host_plugin_extension_artifact_request(url): + return MockHttpResponse(200, body=load_bin_data("ga/fake_extension.zip")) + return None + + with mock_wire_protocol(mockwiredata.DATA_FILE, http_get_handler=http_get_handler) as protocol: + protocol.client.download_zip_package("extension package", [extension_url], target_file, target_directory, use_verify_header=False) + + self.assertTrue(os.path.exists(target_directory), "The extension package was not downloaded") + self.assertFalse(os.path.exists(target_file), "The extension package was not deleted") + + def test_download_zip_package_should_not_invoke_host_channel_when_direct_channel_succeeds(self): + extension_url = 'https://fake_host/fake_extension.zip' + target_file = 
os.path.join(self.tmp_dir, 'fake_extension.zip') + target_directory = os.path.join(self.tmp_dir, "fake_extension") def http_get_handler(url, *_, **__): if url == extension_url: - return MockHttpResponse(200) + return MockHttpResponse(200, body=load_bin_data("ga/fake_extension.zip")) if self.is_host_plugin_extension_artifact_request(url): self.fail('The host channel should not have been used') return None @@ -511,40 +529,42 @@ def http_get_handler(url, *_, **__): with mock_wire_protocol(mockwiredata.DATA_FILE, http_get_handler=http_get_handler) as protocol: HostPluginProtocol.is_default_channel = False - protocol.client.download_extension([extension_url], target_file, use_verify_header=False) + protocol.client.download_zip_package("extension package", [extension_url], target_file, target_directory, use_verify_header=False) urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 1, "Unexpected number of HTTP requests: [{0}]".format(urls)) self.assertEqual(urls[0], extension_url, "The extension should have been downloaded over the direct channel") - self.assertTrue(os.path.exists(target_file), "The extension package was not downloaded") + self.assertTrue(os.path.exists(target_directory), "The extension package was not downloaded") self.assertFalse(HostPluginProtocol.is_default_channel, "The host channel should not have been set as the default") - def test_download_ext_handler_pkg_should_use_host_channel_when_direct_channel_fails_and_set_host_as_default(self): + def test_download_zip_package_should_use_host_channel_when_direct_channel_fails_and_set_host_as_default(self): extension_url = 'https://fake_host/fake_extension.zip' target_file = os.path.join(self.tmp_dir, 'fake_extension.zip') + target_directory = os.path.join(self.tmp_dir, "fake_extension") def http_get_handler(url, *_, **kwargs): if url == extension_url: return HttpError("Exception to fake an error on the direct channel") if self.is_host_plugin_extension_request(url, kwargs, extension_url): - return MockHttpResponse(200) + return MockHttpResponse(200, body=load_bin_data("ga/fake_extension.zip")) return None with mock_wire_protocol(mockwiredata.DATA_FILE, http_get_handler=http_get_handler) as protocol: HostPluginProtocol.is_default_channel = False - protocol.client.download_extension([extension_url], target_file, use_verify_header=False) + protocol.client.download_zip_package("extension package", [extension_url], target_file, target_directory, use_verify_header=False) urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 2, "Unexpected number of HTTP requests: [{0}]".format(urls)) self.assertEqual(urls[0], extension_url, "The first attempt should have been over the direct channel") self.assertTrue(self.is_host_plugin_extension_artifact_request(urls[1]), "The retry attempt should have been over the host channel") - self.assertTrue(os.path.exists(target_file), 'The extension package was not downloaded') + self.assertTrue(os.path.exists(target_directory), 'The extension package was not downloaded') self.assertTrue(HostPluginProtocol.is_default_channel, "The host channel should have been set as the default") - def test_download_ext_handler_pkg_should_retry_the_host_channel_after_refreshing_host_plugin(self): + def test_download_zip_package_should_retry_the_host_channel_after_refreshing_host_plugin(self): extension_url = 'https://fake_host/fake_extension.zip' target_file = os.path.join(self.tmp_dir, 'fake_extension.zip') + target_directory = os.path.join(self.tmp_dir, "fake_extension") def http_get_handler(url, *_, 
**kwargs): if url == extension_url: @@ -554,7 +574,7 @@ def http_get_handler(url, *_, **kwargs): if http_get_handler.goal_state_requests == 0: http_get_handler.goal_state_requests += 1 return ResourceGoneError("Exception to fake a stale goal") - return MockHttpResponse(200) + return MockHttpResponse(200, body=load_bin_data("ga/fake_extension.zip")) if self.is_goal_state_request(url): protocol.track_url(url) # track requests for the goal state return None @@ -569,7 +589,7 @@ def http_get_handler(url, *_, **kwargs): protocol.set_http_handlers(http_get_handler=http_get_handler) - protocol.client.download_extension([extension_url], target_file, use_verify_header=False) + protocol.client.download_zip_package("extension package", [extension_url], target_file, target_directory, use_verify_header=False) urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 4, "Unexpected number of HTTP requests: [{0}]".format(urls)) @@ -577,14 +597,15 @@ def http_get_handler(url, *_, **kwargs): self.assertTrue(self.is_host_plugin_extension_artifact_request(urls[1]), "The second attempt should have been over the host channel") self.assertTrue(self.is_goal_state_request(urls[2]), "The host channel should have been refreshed the goal state") self.assertTrue(self.is_host_plugin_extension_artifact_request(urls[3]), "The third attempt should have been over the host channel") - self.assertTrue(os.path.exists(target_file), 'The extension package was not downloaded') + self.assertTrue(os.path.exists(target_directory), 'The extension package was not downloaded') self.assertTrue(HostPluginProtocol.is_default_channel, "The host channel should have been set as the default") finally: HostPluginProtocol.is_default_channel = False - def test_download_ext_handler_pkg_should_not_change_default_channel_when_all_channels_fail(self): + def test_download_zip_package_should_not_change_default_channel_when_all_channels_fail(self): extension_url = 'https://fake_host/fake_extension.zip' target_file = os.path.join(self.tmp_dir, "fake_extension.zip") + target_directory = os.path.join(self.tmp_dir, "fake_extension") def http_get_handler(url, *_, **kwargs): if url == extension_url or self.is_host_plugin_extension_request(url, kwargs, extension_url): @@ -602,7 +623,7 @@ def http_get_handler(url, *_, **kwargs): protocol.set_http_handlers(http_get_handler=http_get_handler) with self.assertRaises(ExtensionDownloadError): - protocol.client.download_extension([extension_url], target_file, use_verify_header=False) + protocol.client.download_zip_package("extension package", [extension_url], target_file, target_directory, use_verify_header=False) urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 2, "Unexpected number of HTTP requests: [{0}]".format(urls)) @@ -611,6 +632,25 @@ def http_get_handler(url, *_, **kwargs): self.assertFalse(os.path.exists(target_file), "The extension package was downloaded and it shouldn't have") self.assertFalse(HostPluginProtocol.is_default_channel, "The host channel should not have been set as the default") + def test_invalid_zip_should_raise_an_error(self): + extension_url = 'https://fake_host/fake_extension.zip' + target_file = os.path.join(self.tmp_dir, "fake_extension.zip") + target_directory = os.path.join(self.tmp_dir, "fake_extension") + + def http_get_handler(url, *_, **kwargs): + if url == extension_url or self.is_host_plugin_extension_request(url, kwargs, extension_url): + return MockHttpResponse(status=200, body=b"NOT A ZIP") + return None + + with mock_wire_protocol(mockwiredata.DATA_FILE) 
as protocol: + protocol.set_http_handlers(http_get_handler=http_get_handler) + + with self.assertRaises(ExtensionDownloadError): + protocol.client.download_zip_package("extension package", [extension_url], target_file, target_directory, use_verify_header=False) + + self.assertFalse(os.path.exists(target_file), "The extension package should have been deleted") + self.assertFalse(os.path.exists(target_directory), "The extension directory should not have been created") + def test_fetch_manifest_should_not_invoke_host_channel_when_direct_channel_succeeds(self): manifest_url = 'https://fake_host/fake_manifest.xml' manifest_xml = '' @@ -741,7 +781,7 @@ def http_get_handler(url, *_, **__): protocol.set_http_handlers(http_get_handler=http_get_handler) HostPluginProtocol.is_default_channel = False - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 1, "Unexpected HTTP requests: [{0}]".format(urls)) @@ -760,7 +800,7 @@ def http_get_handler(url, *_, **kwargs): HostPluginProtocol.is_default_channel = False try: - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 2, "Invalid number of requests: [{0}]".format(urls)) @@ -793,7 +833,7 @@ def http_get_handler(url, *_, **kwargs): protocol.set_http_handlers(http_get_handler=http_get_handler) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 4, "Invalid number of requests: [{0}]".format(urls)) @@ -825,7 +865,7 @@ def http_get_handler(url, *_, **kwargs): protocol.set_http_handlers(http_get_handler=http_get_handler) - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 4, "Invalid number of requests: [{0}]".format(urls)) @@ -952,7 +992,7 @@ def test_download_using_appropriate_channel_should_change_default_channel_when_s class UpdateGoalStateTestCase(HttpRequestPredicates, AgentTestCase): """ - Tests for WireClient.update_goal_state() + Tests for WireClient.update_goal_state() and WireClient.reset_goal_state() """ def test_it_should_update_the_goal_state_and_the_host_plugin_when_the_incarnation_changes(self): @@ -997,7 +1037,7 @@ def test_it_should_update_the_goal_state_and_the_host_plugin_when_the_incarnatio ''' if forced: - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() else: protocol.client.update_goal_state() @@ -1051,7 +1091,7 @@ def test_forced_update_should_update_the_goal_state_and_the_host_plugin_when_the protocol.mock_wire_data.set_role_config_name(new_role_config_name) protocol.mock_wire_data.shared_config = new_shared_conf - protocol.client.update_goal_state(force_update=True) + protocol.client.reset_goal_state() self.assertEqual(protocol.client.get_goal_state().incarnation, incarnation) self.assertEqual(protocol.client.get_shared_conf().xml_text, new_shared_conf) @@ -1059,6 +1099,28 @@ def test_forced_update_should_update_the_goal_state_and_the_host_plugin_when_the self.assertEqual(protocol.client.get_host_plugin().container_id, new_container_id) self.assertEqual(protocol.client.get_host_plugin().role_config_name, new_role_config_name) + def test_reset_should_init_provided_goal_state_properties(self): + with mock_wire_protocol(mockwiredata.DATA_FILE) as protocol: + 
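The call that follows passes goal_state_properties as a bitmask of the goal-state components to initialize; masking Certificates out is what makes the subsequent get_certs() raise ProtocolError. A minimal sketch of that flag pattern, assuming an IntFlag-style enum with hypothetical member values (the real GoalStateProperties definition is not part of this diff):

from enum import IntFlag

class GoalStateProperties(IntFlag):
    # Hypothetical members and values; only the bitmask behavior is illustrated.
    RoleConfig = 1
    SharedConfig = 2
    ExtensionsGoalState = 4
    Certificates = 8
    All = RoleConfig | SharedConfig | ExtensionsGoalState | Certificates

# "Everything except Certificates", as requested by the test below:
requested = GoalStateProperties.All & ~GoalStateProperties.Certificates
assert GoalStateProperties.Certificates not in requested
assert GoalStateProperties.SharedConfig in requested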
protocol.client.reset_goal_state(goal_state_properties=GoalStateProperties.All & ~GoalStateProperties.Certificates) + + with self.assertRaises(ProtocolError) as context: + _ = protocol.client.get_certs() + + expected_message = "Certificates is not in goal state properties" + self.assertIn(expected_message, str(context.exception)) + + def test_reset_should_init_the_goal_state(self): + with mock_wire_protocol(mockwiredata.DATA_FILE) as protocol: + new_container_id = str(uuid.uuid4()) + new_role_config_name = str(uuid.uuid4()) + protocol.mock_wire_data.set_container_id(new_container_id) + protocol.mock_wire_data.set_role_config_name(new_role_config_name) + + protocol.client.reset_goal_state() + + self.assertEqual(protocol.client.get_goal_state().container_id, new_container_id) + self.assertEqual(protocol.client.get_goal_state().role_config_name, new_role_config_name) + class UpdateHostPluginFromGoalStateTestCase(AgentTestCase): """ diff --git a/tests/test_agent.py b/tests/test_agent.py index 1b14c9d16..f0f773f05 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -31,6 +31,7 @@ DVD.MountPoint = /mnt/cdrom/secure Debug.AgentCpuQuota = 50 Debug.AgentCpuThrottledTimeThreshold = 120 +Debug.AgentMemoryQuota = 31457280 Debug.AutoUpdateHotfixFrequency = 14400 Debug.AutoUpdateNormalFrequency = 86400 Debug.CgroupCheckPeriod = 300 @@ -39,6 +40,7 @@ Debug.CgroupLogMetrics = False Debug.CgroupMonitorExpiryTime = 2022-03-31 Debug.CgroupMonitorExtensionName = Microsoft.Azure.Monitor.AzureMonitorLinuxAgent +Debug.EnableAgentMemoryUsageCheck = False Debug.EnableFastTrack = True Debug.EnableGAVersioning = False Debug.EtpCollectionPeriod = 300 @@ -216,6 +218,7 @@ def test_rejects_invalid_log_collector_mode(self, mock_exit, mock_stderr): # py @patch("azurelinuxagent.agent.LogCollector") def test_calls_collect_logs_with_proper_mode(self, mock_log_collector, *args): # pylint: disable=unused-argument agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + mock_log_collector.run = Mock() agent.collect_logs(is_full_mode=True) full_mode = mock_log_collector.call_args_list[0][0][0] @@ -229,15 +232,15 @@ def test_calls_collect_logs_with_proper_mode(self, mock_log_collector, *args): def test_calls_collect_logs_on_valid_cgroups(self, mock_log_collector): try: CollectLogsHandler.enable_cgroups_validation() + mock_log_collector.run = Mock() - @staticmethod def mock_cgroup_paths(*args, **kwargs): if args and args[0] == "self": relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) return (cgroupconfigurator.LOGCOLLECTOR_SLICE, relative_path) return SystemdCgroupsApi.get_process_cgroup_relative_paths(*args, **kwargs) - with patch.object(SystemdCgroupsApi, "get_process_cgroup_relative_paths", mock_cgroup_paths): + with patch("azurelinuxagent.agent.SystemdCgroupsApi.get_process_cgroup_paths", side_effect=mock_cgroup_paths): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) agent.collect_logs(is_full_mode=True) @@ -245,17 +248,18 @@ def mock_cgroup_paths(*args, **kwargs): finally: CollectLogsHandler.disable_cgroups_validation() - def test_doesnt_call_collect_logs_on_invalid_cgroups(self): + @patch("azurelinuxagent.agent.LogCollector") + def test_doesnt_call_collect_logs_on_invalid_cgroups(self, mock_log_collector): try: CollectLogsHandler.enable_cgroups_validation() + mock_log_collector.run = Mock() - @staticmethod def mock_cgroup_paths(*args, **kwargs): if args and args[0] == "self": return ("NOT_THE_CORRECT_PATH", 
"NOT_THE_CORRECT_PATH") return SystemdCgroupsApi.get_process_cgroup_relative_paths(*args, **kwargs) - with patch.object(SystemdCgroupsApi, "get_process_cgroup_relative_paths", mock_cgroup_paths): + with patch("azurelinuxagent.agent.SystemdCgroupsApi.get_process_cgroup_paths", side_effect=mock_cgroup_paths): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) exit_error = RuntimeError("Exiting") diff --git a/tests/tools.py b/tests/tools.py index b22a85637..85d460d37 100644 --- a/tests/tools.py +++ b/tests/tools.py @@ -121,6 +121,14 @@ def is_python_version_26(): return sys.version_info[0] == 2 and sys.version_info[1] == 6 +def is_python_version_34(): + return sys.version_info[0] == 3 and sys.version_info[1] == 4 + + +def is_python_version_26_or_34(): + return is_python_version_26() or is_python_version_34() + + class AgentTestCase(unittest.TestCase): @classmethod def setUpClass(cls): diff --git a/tests_e2e/orchestrator/docker/Dockerfile b/tests_e2e/orchestrator/docker/Dockerfile new file mode 100644 index 000000000..a748ff0b8 --- /dev/null +++ b/tests_e2e/orchestrator/docker/Dockerfile @@ -0,0 +1,85 @@ +# +# * Sample command to build the image: +# +# docker build -t waagenttests . +# +# * Sample command to execute a container interactively: +# +# docker run --rm -it -v /home/nam/src/WALinuxAgent:/home/waagent/WALinuxAgent waagenttests bash --login +# +FROM ubuntu:latest +LABEL description="Test environment for WALinuxAgent" + +SHELL ["/bin/bash", "-c"] + +# +# Install the required packages as root +# +USER root + +RUN \ + apt-get update && \ + \ + # \ + # Install basic dependencies \ + # \ + apt-get install -y git python3.10 python3.10-dev wget bzip2 && \ + ln /usr/bin/python3.10 /usr/bin/python3 && \ + \ + # \ + # Install LISA dependencies \ + # \ + apt-get install -y git gcc libgirepository1.0-dev libcairo2-dev qemu-utils libvirt-dev \ + python3-pip python3-venv && \ + \ + # \ + # Install test dependencies \ + # \ + apt-get install -y zip && \ + \ + # \ + # Create user waagent, which is used to execute the tests \ + # \ + groupadd waagent && \ + useradd --shell /bin/bash --create-home -g waagent waagent && \ + : + +# +# Do the Poetry and LISA setup as waagent +# +USER waagent + +RUN \ + export PATH="$HOME/.local/bin:$PATH" && \ + \ + # \ + # Install LISA \ + # \ + cd $HOME && \ + git clone https://github.com/microsoft/lisa.git && \ + cd lisa && \ + \ + python3 -m pip install --upgrade pip && \ + python3 -m pip install --editable .[azure,libvirt] --config-settings editable_mode=compat && \ + \ + # \ + # Install additional test dependencies \ + # \ + python3 -m pip install distro msrestazure && \ + python3 -m pip install azure-mgmt-compute --upgrade && \ + \ + # \ + # Download Pypy to a known location, from which it will be installed to the test VMs. 
\ + # \ + mkdir $HOME/bin && \ + wget https://downloads.python.org/pypy/pypy3.7-v7.3.5-linux64.tar.bz2 -O /tmp/pypy3.7-x64.tar.bz2 && \ + wget https://downloads.python.org/pypy/pypy3.7-v7.3.5-aarch64.tar.bz2 -O /tmp/pypy3.7-arm64.tar.bz2 && \ + \ + # \ + # The setup for the tests depends on a few paths; add those to the profile \ + # \ + echo 'export PYTHONPATH="$HOME/WALinuxAgent"' >> $HOME/.bash_profile && \ + echo 'export PATH="$HOME/.local/bin:$PATH"' >> $HOME/.bash_profile && \ + echo 'cd $HOME' >> $HOME/.bash_profile && \ + : + diff --git a/tests_e2e/orchestrator/lib/agent_junit.py b/tests_e2e/orchestrator/lib/agent_junit.py new file mode 100644 index 000000000..a8ff8eb6c --- /dev/null +++ b/tests_e2e/orchestrator/lib/agent_junit.py @@ -0,0 +1,66 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Type + +# +# Disable those warnings, since 'lisa' is an external, non-standard, dependency +# E0401: Unable to import 'dataclasses_json' (import-error) +# E0401: Unable to import 'lisa.notifiers.junit' (import-error) +# E0401: Unable to import 'lisa' (import-error) +# E0401: Unable to import 'lisa.messages' (import-error) +from dataclasses import dataclass # pylint: disable=E0401 +from dataclasses_json import dataclass_json # pylint: disable=E0401 +from lisa.notifiers.junit import JUnit # pylint: disable=E0401 +from lisa import schema # pylint: disable=E0401 +from lisa.messages import ( # pylint: disable=E0401 + MessageBase, + TestResultMessage, +) + + +@dataclass_json() +@dataclass +class AgentJUnitSchema(schema.Notifier): + path: str = "agent.junit.xml" + + +class AgentJUnit(JUnit): + @classmethod + def type_name(cls) -> str: + return "agent.junit" + + @classmethod + def type_schema(cls) -> Type[schema.TypedSchema]: + return AgentJUnitSchema + + def _received_message(self, message: MessageBase) -> None: + # The Agent sends its own TestResultMessage and marks them as "AgentTestResultMessage"; for the + # test results sent by LISA itself, we change the suite name to "_Runbook_" in order to separate them + # from actual test results. + if isinstance(message, TestResultMessage) and message.type != "AgentTestResultMessage": + if "Unexpected error in AgentTestSuite" in message.message: + # Ignore these errors, they are already reported as AgentTestResultMessages + return + message.suite_full_name = "_Runbook_" + message.suite_name = message.suite_full_name + image = message.information.get('image') + if image is not None: + # NOTE: message.information['environment'] is similar to "[generated_2]" and can be correlated + # with the main LISA log to find the specific VM for the message. 
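+            # For example (hypothetical values): an image "ubuntu-22.04" running in LISA environment "generated_2" would be reported as "ubuntu-22.04 [generated_2]" under the "_Runbook_" suite.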
+ message.full_name = f"{image} [{message.information['environment']}]" + message.name = message.full_name + super()._received_message(message) diff --git a/tests_e2e/orchestrator/lib/agent_test_loader.py b/tests_e2e/orchestrator/lib/agent_test_loader.py new file mode 100644 index 000000000..a0f0bfaaf --- /dev/null +++ b/tests_e2e/orchestrator/lib/agent_test_loader.py @@ -0,0 +1,257 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import importlib.util +# E0401: Unable to import 'yaml' (import-error) +import yaml # pylint: disable=E0401 + +from pathlib import Path +from typing import Any, Dict, List, Type + +import tests_e2e +from tests_e2e.tests.lib.agent_test import AgentTest + + +class TestSuiteInfo(object): + """ + Description of a test suite + """ + # The name of the test suite + name: str + # The tests that comprise the suite + tests: List[Type[AgentTest]] + # Images or image sets (as defined in images.yml) on which the suite must run. + images: List[str] + # The location (region) on which the suite must run; if empty, the suite can run on any location + location: str + # Whether this suite must run on its own test VM + owns_vm: bool + + def __str__(self): + return self.name + + +class VmImageInfo(object): + # The URN of the image (publisher, offer, version separated by spaces) + urn: str + # Indicates that the image is available only on those locations. If empty, the image should be available in all locations + locations: List[str] + # Indicates that the image is available only for those VM sizes. If empty, the image should be available for all VM sizes + vm_sizes: List[str] + + def __str__(self): + return self.urn + + +class AgentTestLoader(object): + """ + Loads a given set of test suites from the YAML configuration files. + """ + def __init__(self, test_suites: str): + """ + Loads the specified 'test_suites', which are given as a string of comma-separated suite names or a YAML description + of a single test_suite. + + When given as a comma-separated list, each item must correspond to the name of the YAML files describing s suite (those + files are located under the .../WALinuxAgent/tests_e2e/test_suites directory). For example, if test_suites == "agent_bvt, fast_track" + then this method will load files agent_bvt.yml and fast_track.yml. 
+ + When given as a YAML string, the value must correspond to the description of a single test suite, for example + + name: "AgentBvt" + tests: + - "bvts/extension_operations.py" + - "bvts/run_command.py" + - "bvts/vm_access.py" + """ + self.__test_suites: List[TestSuiteInfo] = self._load_test_suites(test_suites) + self.__images: Dict[str, List[VmImageInfo]] = self._load_images() + self._validate() + + _SOURCE_CODE_ROOT: Path = Path(tests_e2e.__path__[0]) + + @property + def test_suites(self) -> List[TestSuiteInfo]: + return self.__test_suites + + @property + def images(self) -> Dict[str, List[VmImageInfo]]: + """ + A dictionary where, for each item, the key is the name of an image or image set and the value is a list of VmImageInfos for + the corresponding images. + """ + return self.__images + + def _validate(self): + """ + Performs some basic validations on the data loaded from the YAML description files + """ + for suite in self.test_suites: + # Validate that the images the suite must run on are in images.yml + for image in suite.images: + if image not in self.images: + raise Exception(f"Invalid image reference in test suite {suite.name}: Can't find {image} in images.yml") + + # If the suite specifies a location, validate that the images it uses are available in that location + if suite.location != '': + for suite_image in suite.images: + for image in self.images[suite_image]: + if len(image.locations) > 0: + if suite.location not in image.locations: + raise Exception(f"Test suite {suite.name} must be executed in {suite.location}, but <{image.urn}> is not available in that location") + + @staticmethod + def _load_test_suites(test_suites: str) -> List[TestSuiteInfo]: + # + # Attempt to parse 'test_suites' as the YML description of a single suite + # + parsed = yaml.safe_load(test_suites) + + # + # A comma-separated list (e.g. "foo", "foo, bar", etc.) is valid YAML, but it is parsed as a string. An actual test suite would + # be parsed as a dictionary. If it is a dict, take it as the YML description of a single test suite + # + if isinstance(parsed, dict): + return [AgentTestLoader._load_test_suite(parsed)] + + # + # If test_suites is not YML, then it should be a comma-separated list of description files + # + description_files: List[Path] = [AgentTestLoader._SOURCE_CODE_ROOT/"test_suites"/f"{t.strip()}.yml" for t in test_suites.split(',')] + return [AgentTestLoader._load_test_suite(f) for f in description_files] + + @staticmethod + def _load_test_suite(description_file: Path) -> TestSuiteInfo: + """ + Loads the description of a TestSuite from its YAML file. + + A test suite has 5 properties: name, tests, images, location, and owns-vm. For example: + + name: "AgentBvt" + tests: + - "bvts/extension_operations.py" + - "bvts/run_command.py" + - "bvts/vm_access.py" + images: "endorsed" + location: "eastus2euap" + owns-vm: true + + * name - A string used to identify the test suite + * tests - A list of the tests in the suite. Each test is specified by the path for its source code relative to + WALinuxAgent/tests_e2e/tests. + * images - A string, or a list of strings, specifying the images on which the test suite must be executed. Each value + can be the name of a single image (e.g. "ubuntu_2004"), or the name of an image set (e.g. "endorsed"). The + names for images and image sets are defined in WALinuxAgent/tests_e2e/test_suites/images.yml. + * location - [Optional; string] If given, the test suite must be executed in that location.
If not specified, + or set to an empty string, the test suite will be executed in the default location. This is useful + for test suites that exercise a feature that is enabled only in certain regions. + * owns-vm - [Optional; boolean] By default all suites in a test run are executed on the same test VMs; if this + value is set to True, new test VMs will be created and will be used exclusively for this test suite. + This is useful for suites that modify the test VMs in such a way that the setup may cause problems + in other test suites (for example, some tests that target the HGAP block internet access in order to + force the agent to use the HGAP). + + """ + test_suite: Dict[str, Any] = AgentTestLoader._load_file(description_file) + + if any([test_suite.get(p) is None for p in ["name", "tests", "images"]]): + raise Exception(f"Invalid test suite: {description_file}. 'name', 'tests', and 'images' are required properties") + + test_suite_info = TestSuiteInfo() + + test_suite_info.name = test_suite["name"] + + test_suite_info.tests = [] + source_files = [AgentTestLoader._SOURCE_CODE_ROOT/"tests"/t for t in test_suite["tests"]] + for f in source_files: + test_suite_info.tests.extend(AgentTestLoader._load_test_classes(f)) + + images = test_suite["images"] + if isinstance(images, str): + test_suite_info.images = [images] + else: + test_suite_info.images = images + + test_suite_info.location = test_suite.get("location") + if test_suite_info.location is None: + test_suite_info.location = "" + + test_suite_info.owns_vm = "owns-vm" in test_suite and test_suite["owns-vm"] + + return test_suite_info + + @staticmethod + def _load_test_classes(source_file: Path) -> List[Type[AgentTest]]: + """ + Takes a 'source_file', which must be a Python module, and returns a list of all the classes derived from AgentTest. + """ + spec = importlib.util.spec_from_file_location(f"tests_e2e.tests.{source_file.name}", str(source_file)) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + # return all the classes in the module that are subclasses of AgentTest but are not AgentTest itself. + return [v for v in module.__dict__.values() if isinstance(v, type) and issubclass(v, AgentTest) and v != AgentTest] + + @staticmethod + def _load_images() -> Dict[str, List[VmImageInfo]]: + """ + Loads images.yml into a dictionary where, for each item, the key is an image or image set and the value is a list of VmImageInfos + for the corresponding images. + + See the comments in images.yml for a description of the structure of each item.
+ """ + image_descriptions = AgentTestLoader._load_file(AgentTestLoader._SOURCE_CODE_ROOT/"test_suites"/"images.yml") + if "images" not in image_descriptions: + raise Exception("images.yml is missing the 'images' item") + + images = {} + + # first load the images as 1-item lists + for name, description in image_descriptions["images"].items(): + i = VmImageInfo() + if isinstance(description, str): + i.urn = description + i.locations = [] + i.vm_sizes = [] + else: + if "urn" not in description: + raise Exception(f"Image {name} is missing the 'urn' property: {description}") + i.urn = description["urn"] + i.locations = description["locations"] if "locations" in description else [] + i.vm_sizes = description["vm_sizes"] if "vm_sizes" in description else [] + images[name] = [i] + + # now load the image-sets, mapping them to the images that we just computed + for image_set_name, image_list in image_descriptions["image-sets"].items(): + # the same name cannot denote an image and an image-set + if image_set_name in images: + raise Exception(f"Invalid image-set in images.yml: {image_set_name}. The name is used by an existing image") + images_in_set = [] + for i in image_list: + if i not in images: + raise Exception(f"Can't find image {i} (referenced by image-set {image_set_name}) in images.yml") + images_in_set.extend(images[i]) + images[image_set_name] = images_in_set + + return images + + @staticmethod + def _load_file(file: Path) -> Dict[str, Any]: + """Helper to load a YML file""" + try: + with file.open() as f: + return yaml.safe_load(f) + except Exception as e: + raise Exception(f"Can't load {file}: {e}") diff --git a/tests_e2e/orchestrator/lib/agent_test_suite.py b/tests_e2e/orchestrator/lib/agent_test_suite.py new file mode 100644 index 000000000..0c95daf60 --- /dev/null +++ b/tests_e2e/orchestrator/lib/agent_test_suite.py @@ -0,0 +1,645 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import contextlib +import datetime +import json +import logging +import traceback +import uuid + +from pathlib import Path +from threading import current_thread, RLock +from typing import Any, Dict, List + +# Disable those warnings, since 'lisa' is an external, non-standard, dependency +# E0401: Unable to import 'lisa' (import-error) +# etc +from lisa import ( # pylint: disable=E0401 + Environment, + Logger, + Node, + notifier, + simple_requirement, + TestCaseMetadata, + TestSuite as LisaTestSuite, + TestSuiteMetadata, +) +from lisa.environment import EnvironmentStatus # pylint: disable=E0401 +from lisa.messages import TestStatus, TestResultMessage # pylint: disable=E0401 +from lisa.sut_orchestrator import AZURE # pylint: disable=E0401 +from lisa.sut_orchestrator.azure.common import get_node_context, AzureNodeSchema # pylint: disable=E0401 + +import makepkg +from azurelinuxagent.common.version import AGENT_VERSION +from tests_e2e.orchestrator.lib.agent_test_loader import TestSuiteInfo +from tests_e2e.tests.lib.agent_log import AgentLog +from tests_e2e.tests.lib.agent_test import TestSkipped +from tests_e2e.tests.lib.agent_test_context import AgentTestContext +from tests_e2e.tests.lib.identifiers import VmIdentifier +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.logging import set_current_thread_log +from tests_e2e.tests.lib.agent_log import AgentLogRecord +from tests_e2e.tests.lib.shell import run_command +from tests_e2e.tests.lib.ssh_client import SshClient + + +def _initialize_lisa_logger(): + """ + Customizes the LISA logger. + + The default behavior of this logger is too verbose, which makes reading the logs difficult. We set up a more succinct + formatter and decrease the log level to INFO (the default is VERBOSE). In the future we may consider making this + customization settable at runtime in case we need to debug LISA issues. + """ + logger: Logger = logging.getLogger("lisa") + + logger.setLevel(logging.INFO) + + formatter = logging.Formatter('%(asctime)s.%(msecs)03d [%(levelname)s] [%(threadName)s] %(message)s', datefmt="%Y-%m-%dT%H:%M:%SZ") + for handler in logger.handlers: + handler.setFormatter(formatter) + + +# +# We want to customize the LISA logger as early as possible, so we do it when this module is first imported. That will +# happen early in the LISA workflow, when it loads the test suites to execute. +# +_initialize_lisa_logger() + + +# +# Helper to change the current thread name temporarily +# +@contextlib.contextmanager +def _set_thread_name(name: str): + initial_name = current_thread().name + current_thread().name = name + try: + yield + finally: + current_thread().name = initial_name + + +# +# Possible values for the collect_logs parameter +# +class CollectLogs(object): + Always = 'always' # Always collect logs + Failed = 'failed' # Collect logs only on test failures + No = 'no' # Never collect logs + + +@TestSuiteMetadata(area="waagent", category="", description="") +class AgentTestSuite(LisaTestSuite): + """ + Manages the setup of test VMs and execution of Agent test suites. This class acts as the interface with the LISA framework, which + will invoke the main() method when a runbook is executed. + """ + + class _Context(AgentTestContext): + def __init__(self, vm: VmIdentifier, paths: AgentTestContext.Paths, connection: AgentTestContext.Connection): + super().__init__(vm=vm, paths=paths, connection=connection) + # These are initialized by AgentTestSuite._initialize().
+ self.log_path: Path = None + self.lisa_log: Logger = None + self.node: Node = None + self.runbook_name: str = None + self.environment_name: str = None + self.is_vhd: bool = None + self.test_suites: List[TestSuiteInfo] = None + self.collect_logs: str = None + self.skip_setup: bool = None + self.ssh_client: SshClient = None + + def __init__(self, metadata: TestSuiteMetadata) -> None: + super().__init__(metadata) + # The context is initialized by _initialize() via the call to main() + self.__context: AgentTestSuite._Context = None + + def _initialize(self, node: Node, variables: Dict[str, Any], lisa_working_path: str, lisa_log_path: str, lisa_log: Logger): + connection_info = node.connection_info + node_context = get_node_context(node) + runbook = node.capability.get_extended_runbook(AzureNodeSchema, AZURE) + + self.__context = self._Context( + vm=VmIdentifier( + location=runbook.location, + subscription=node.features._platform.subscription_id, + resource_group=node_context.resource_group_name, + name=node_context.vm_name), + paths=AgentTestContext.Paths( + working_directory=self._get_working_directory(lisa_working_path), + remote_working_directory=Path('/home')/connection_info['username']), + connection=AgentTestContext.Connection( + ip_address=connection_info['address'], + username=connection_info['username'], + private_key_file=connection_info['private_key_file'], + ssh_port=connection_info['port'])) + + self.__context.log_path = self._get_log_path(variables, lisa_log_path) + self.__context.lisa_log = lisa_log + self.__context.node = node + self.__context.is_vhd = self._get_optional_parameter(variables, "c_vhd") != "" + self.__context.environment_name = f"{node.os.name}-vhd" if self.__context.is_vhd else self._get_required_parameter(variables, "c_env_name") + self.__context.test_suites = self._get_required_parameter(variables, "c_test_suites") + self.__context.collect_logs = self._get_required_parameter(variables, "collect_logs") + self.__context.skip_setup = self._get_required_parameter(variables, "skip_setup") + self.__context.ssh_client = SshClient(ip_address=self.__context.vm_ip_address, username=self.__context.username, private_key_file=self.__context.private_key_file) + + @staticmethod + def _get_required_parameter(variables: Dict[str, Any], name: str) -> Any: + value = variables.get(name) + if value is None: + raise Exception(f"The runbook is missing required parameter '{name}'") + return value + + @staticmethod + def _get_optional_parameter(variables: Dict[str, Any], name: str, default_value: Any = "") -> Any: + value = variables.get(name) + if value is None: + return default_value + return value + + @staticmethod + def _get_log_path(variables: Dict[str, Any], lisa_log_path: str) -> Path: + # NOTE: If "log_path" is not given as argument to the runbook, use a path derived from LISA's log for the test suite. + # That path is derived from LISA's "--log_path" command line argument and has a value similar to + # "<--log_path>/20230217/20230217-040022-342/tests/20230217-040119-288-agent_test_suite"; use the directory + # 2 levels up.
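+        # For example: Path("<--log_path>/20230217/20230217-040022-342/tests/20230217-040119-288-agent_test_suite").parent.parent is "<--log_path>/20230217/20230217-040022-342".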
+ log_path = variables.get("log_path") + if log_path is not None and len(log_path) > 0: + return Path(log_path) + return Path(lisa_log_path).parent.parent + + @staticmethod + def _get_working_directory(lisa_working_path: str) -> Path: + # LISA's "working_path" has a value similar to + # "<--working_path>/20230322/20230322-194430-287/tests/20230322-194451-333-agent_test_suite" + # where "<--working_path>" is the value given to the --working_path command line argument. Create the working directory for + # the AgentTestSuite as + # "<--working_path>/20230322/20230322-194430-287/waagent" + # This directory will be unique for each execution of the runbook ("20230322-194430" is the timestamp and "287" is a + # unique ID per execution) + return Path(lisa_working_path).parent.parent / "waagent" + + @property + def context(self): + if self.__context is None: + raise Exception("The context for the AgentTestSuite has not been initialized") + return self.__context + + # + # Test suites within the same runbook may be executed concurrently, and setup needs to be done only once. + # We use this lock to allow only 1 thread to do the setup. Setup completion is marked using the 'completed' + # file: the thread doing the setup creates the file and threads that find that the file already exists + # simply skip setup. + # + _setup_lock = RLock() + + def _setup(self) -> None: + """ + Prepares the test suite for execution (currently, it just builds the agent package). + """ + self._setup_lock.acquire() + + try: + log.info("") + log.info("**************************************** [Build] ****************************************") + log.info("") + completed: Path = self.context.working_directory/"completed" + + if completed.exists(): + log.info("Found %s. Build has already been done, skipping.", completed) + return + + self.context.lisa_log.info("Building test agent") + log.info("Creating working directory: %s", self.context.working_directory) + self.context.working_directory.mkdir(parents=True) + + self._build_agent_package() + + log.info("Completed setup, creating %s", completed) + completed.touch() + + finally: + self._setup_lock.release() + + def _build_agent_package(self) -> None: + """ + Builds the agent package and verifies that it was created at the expected path. + """ + log.info("Building agent package to %s", self.context.working_directory) + + makepkg.run(agent_family="Test", output_directory=str(self.context.working_directory), log=log) + + package_path: Path = self._get_agent_package_path() + if not package_path.exists(): + raise Exception(f"Can't find the agent package at {package_path}") + + log.info("Built agent package as %s", package_path) + + def _get_agent_package_path(self) -> Path: + """ + Returns the path to the agent package. + """ + return self.context.working_directory/"eggs"/f"WALinuxAgent-{AGENT_VERSION}.zip" + + def _clean_up(self) -> None: + """ + Cleans up any leftovers from the test suite run. Currently just an empty placeholder for future use.
+ """ + + def _setup_node(self) -> None: + """ + Prepares the remote node for executing the test suite (installs tools and the test agent, etc) + """ + self.context.lisa_log.info("Setting up test node") + log.info("") + log.info("************************************** [Node Setup] **************************************") + log.info("") + log.info("Test Node: %s", self.context.vm.name) + log.info("IP Address: %s", self.context.vm_ip_address) + log.info("Resource Group: %s", self.context.vm.resource_group) + log.info("") + + # + # Ensure that the correct version (x84 vs ARM64) Pypy has been downloaded; it is pre-downloaded to /tmp on the container image + # used for Azure Pipelines runs, but for developer runs it may need to be downloaded. + # + if self.context.ssh_client.get_architecture() == "aarch64": + pypy_path = Path("/tmp/pypy3.7-arm64.tar.bz2") + pypy_download = "https://downloads.python.org/pypy/pypy3.7-v7.3.5-aarch64.tar.bz2" + else: + pypy_path = Path("/tmp/pypy3.7-x64.tar.bz2") + pypy_download = "https://downloads.python.org/pypy/pypy3.7-v7.3.5-linux64.tar.bz2" + if pypy_path.exists(): + log.info("Found Pypy at %s", pypy_path) + else: + log.info("Downloading %s to %s", pypy_download, pypy_path) + run_command(["wget", pypy_download, "-O", pypy_path]) + + # + # Create a tarball with the files we need to copy to the test node. The tarball includes two directories: + # + # * bin - Executables file (Bash and Python scripts) + # * lib - Library files (Python modules) + # + # After extracting the tarball on the test node, 'bin' will be added to PATH and PYTHONPATH will be set to 'lib'. + # + # Note that executables are placed directly under 'bin', while the path for Python modules is preserved under 'lib. + # + tarball_path: Path = Path("/tmp/waagent.tar") + log.info("Creating %s with the files need on the test node", tarball_path) + log.info("Adding orchestrator/scripts") + run_command(['tar', 'cvf', str(tarball_path), '--transform=s,.*/,bin/,', '-C', str(self.context.test_source_directory/"orchestrator"/"scripts"), '.']) + # log.info("Adding tests/scripts") + # run_command(['tar', 'rvf', str(tarball_path), '--transform=s,.*/,bin/,', '-C', str(self.context.test_source_directory/"tests"/"scripts"), '.']) + log.info("Adding tests/lib") + run_command(['tar', 'rvf', str(tarball_path), '--transform=s,^,lib/,', '-C', str(self.context.test_source_directory.parent), '--exclude=__pycache__', 'tests_e2e/tests/lib']) + log.info("Contents of %s:\n\n%s", tarball_path, run_command(['tar', 'tvf', str(tarball_path)])) + + # + # Cleanup the test node (useful for developer runs) + # + log.info('Preparing the test node for setup') + # Note that removing lib requires sudo, since a Python cache may have been created by tests using sudo + self.context.ssh_client.run_command("rm -rvf ~/{bin,lib,tmp}", use_sudo=True) + + # + # Copy the tarball, Pypy and the test Agent to the test node + # + target_path = Path("~")/"tmp" + self.context.ssh_client.run_command(f"mkdir {target_path}") + log.info("Copying %s to %s:%s", tarball_path, self.context.node.name, target_path) + self.context.ssh_client.copy_to_node(tarball_path, target_path) + log.info("Copying %s to %s:%s", pypy_path, self.context.node.name, target_path) + self.context.ssh_client.copy_to_node(pypy_path, target_path) + agent_package_path: Path = self._get_agent_package_path() + log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, target_path) + self.context.ssh_client.copy_to_node(agent_package_path, target_path) + + # + # Extract 
the tarball and execute the install scripts + # + log.info('Installing tools on the test node') + command = f"tar xf {target_path/tarball_path.name} && ~/bin/install-tools" + log.info("%s\n%s", command, self.context.ssh_client.run_command(command)) + + if self.context.is_vhd: + log.info("Using a VHD; will not install the Test Agent.") + else: + log.info("Installing the Test Agent on the test node") + command = f"install-agent --package ~/tmp/{agent_package_path.name} --version {AGENT_VERSION}" + log.info("%s\n%s", command, self.context.ssh_client.run_command(command, use_sudo=True)) + + log.info("Completed test node setup") + + def _collect_node_logs(self) -> None: + """ + Collects the test logs from the remote machine and copies them to the local machine + """ + try: + # Collect the logs on the test machine into a compressed tarball + self.context.lisa_log.info("Collecting logs on test node") + log.info("Collecting logs on test node") + stdout = self.context.ssh_client.run_command("collect-logs", use_sudo=True) + log.info(stdout) + + # Copy the tarball to the local logs directory + remote_path = "/tmp/waagent-logs.tgz" + local_path = self.context.log_path/'{0}.tgz'.format(self.context.environment_name) + log.info("Copying %s:%s to %s", self.context.node.name, remote_path, local_path) + self.context.ssh_client.copy_from_node(remote_path, local_path) + + except: # pylint: disable=bare-except + log.exception("Failed to collect logs from the test machine") + + # NOTES: + # + # * environment_status=EnvironmentStatus.Deployed skips most of LISA's initialization of the test node, which is not needed + # for agent tests. + # + # * We need to take the LISA Logger using a parameter named 'log'; this parameter hides tests_e2e.tests.lib.logging.log. + # Be aware then, that within this method 'log' refers to the LISA log, and elsewhere it refers to tests_e2e.tests.lib.logging.log. + # + # W0621: Redefining name 'log' from outer scope (line 53) (redefined-outer-name) + @TestCaseMetadata(description="", priority=0, requirement=simple_requirement(environment_status=EnvironmentStatus.Deployed)) + def main(self, node: Node, environment: Environment, variables: Dict[str, Any], working_path: str, log_path: str, log: Logger): # pylint: disable=redefined-outer-name + """ + Entry point from LISA + """ + self._initialize(node, variables, working_path, log_path, log) + self._execute(environment, variables) + + def _execute(self, environment: Environment, variables: Dict[str, Any]): + """ + Executes each of the AgentTests included in the "c_test_suites" variable (which is generated by the AgentTestSuitesCombinator). + """ + # Set the thread name to the name of the environment. The thread name is added to each item in LISA's log. 
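+        # For example (hypothetical environment name): logs for environment "0001-com-ubuntu-server-focal-20_04-lts-westus2" go to env-0001-com-ubuntu-server-focal-20_04-lts-westus2.log, plus one <suite>-<environment>.log per test suite (see _execute_test_suite below).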
+ with _set_thread_name(self.context.environment_name): + log_path: Path = self.context.log_path/f"env-{self.context.environment_name}.log" + with set_current_thread_log(log_path): + start_time: datetime.datetime = datetime.datetime.now() + success = True + + try: + # Log the environment's name and the variables received from the runbook (note that we need to expand the names of the test suites) + log.info("LISA Environment (for correlation with the LISA log): %s", environment.name) + log.info("Runbook variables:") + for name, value in variables.items(): + log.info(" %s: %s", name, value if name != 'c_test_suites' else [t.name for t in value]) + + test_suite_success = True + + try: + if not self.context.skip_setup: + self._setup() + + if not self.context.skip_setup: + self._setup_node() + + # pylint seems to think self.context.test_suites is not iterable. Suppressing warning, since its type is List[TestSuiteInfo] + # E1133: Non-iterable value self.context.test_suites is used in an iterating context (not-an-iterable) + for suite in self.context.test_suites: # pylint: disable=E1133 + log.info("Executing test suite %s", suite.name) + self.context.lisa_log.info("Executing Test Suite %s", suite.name) + test_suite_success = self._execute_test_suite(suite) and test_suite_success + + test_suite_success = self._check_agent_log() and test_suite_success + + finally: + collect = self.context.collect_logs + if collect == CollectLogs.Always or collect == CollectLogs.Failed and not test_suite_success: + self._collect_node_logs() + + except Exception as e: # pylint: disable=bare-except + # Report the error and raise an exception to let LISA know that the test errored out. + success = False + log.exception("UNEXPECTED ERROR.") + self._report_test_result( + self.context.environment_name, + "Unexpected Error", + TestStatus.FAILED, + start_time, + message="UNEXPECTED ERROR.", + add_exception_stack_trace=True) + + raise Exception(f"[{self.context.environment_name}] Unexpected error in AgentTestSuite: {e}") + + finally: + self._clean_up() + if not success: + self._mark_log_as_failed() + + def _execute_test_suite(self, suite: TestSuiteInfo) -> bool: + """ + Executes the given test suite and returns True if all the tests in the suite succeeded.
+ """ + suite_name = suite.name + suite_full_name = f"{suite_name}-{self.context.environment_name}" + suite_start_time: datetime.datetime = datetime.datetime.now() + + success: bool = True # True if all the tests succeed + + with _set_thread_name(suite_full_name): # The thread name is added to the LISA log + log_path: Path = self.context.log_path/f"{suite_full_name}.log" + with set_current_thread_log(log_path): + try: + log.info("") + log.info("**************************************** %s ****************************************", suite_name) + log.info("") + + summary: List[str] = [] + + for test in suite.tests: + test_name = test.__name__ + test_full_name = f"{suite_name}-{test_name}" + test_start_time: datetime.datetime = datetime.datetime.now() + + log.info("******** Executing %s", test_name) + self.context.lisa_log.info("Executing test %s", test_full_name) + + try: + + test(self.context).run() + + summary.append(f"[Passed] {test_name}") + log.info("******** [Passed] %s", test_name) + self.context.lisa_log.info("[Passed] %s", test_full_name) + self._report_test_result( + suite_full_name, + test_name, + TestStatus.PASSED, + test_start_time) + except TestSkipped as e: + summary.append(f"[Skipped] {test_name}") + log.info("******** [Skipped] %s: %s", test_name, e) + self.context.lisa_log.info("******** [Skipped] %s", test_full_name) + self._report_test_result( + suite_full_name, + test_name, + TestStatus.SKIPPED, + test_start_time, + message=str(e)) + except AssertionError as e: + success = False + summary.append(f"[Failed] {test_name}") + log.error("******** [Failed] %s: %s", test_name, e) + self.context.lisa_log.error("******** [Failed] %s", test_full_name) + self._report_test_result( + suite_full_name, + test_name, + TestStatus.FAILED, + test_start_time, + message=str(e)) + except: # pylint: disable=bare-except + success = False + summary.append(f"[Error] {test_name}") + log.exception("UNHANDLED EXCEPTION IN %s", test_name) + self.context.lisa_log.exception("UNHANDLED EXCEPTION IN %s", test_full_name) + self._report_test_result( + suite_full_name, + test_name, + TestStatus.FAILED, + test_start_time, + message="Unhandled exception.", + add_exception_stack_trace=True) + + log.info("") + + log.info("********* [Test Results]") + log.info("") + for r in summary: + log.info("\t%s", r) + log.info("") + + except: # pylint: disable=bare-except + success = False + self._report_test_result( + suite_full_name, + suite_name, + TestStatus.FAILED, + suite_start_time, + message=f"Unhandled exception while executing test suite {suite_name}.", + add_exception_stack_trace=True) + finally: + if not success: + self._mark_log_as_failed() + + return success + + def _check_agent_log(self) -> bool: + """ + Checks the agent log for errors; returns true on success (no errors int the log) + """ + start_time: datetime.datetime = datetime.datetime.now() + + try: + self.context.lisa_log.info("Checking agent log on the test node") + log.info("Checking agent log on the test node") + + output = self.context.ssh_client.run_command("check-agent-log.py -j") + errors = json.loads(output, object_hook=AgentLogRecord.from_dictionary) + + # Individual tests may have rules to ignore known errors; filter those out + ignore_error_rules = [] + # pylint seems to think self.context.test_suites is not iterable. 
Suppressing warning, since its type is List[TestSuiteInfo] + # E1133: Non-iterable value self.context.test_suites is used in an iterating context (not-an-iterable) + for suite in self.context.test_suites: # pylint: disable=E1133 + for test in suite.tests: + ignore_error_rules.extend(test(self.context).get_ignore_error_rules()) + + if len(ignore_error_rules) > 0: + new = [] + for e in errors: + if not AgentLog.matches_ignore_rule(e, ignore_error_rules): + new.append(e) + errors = new + + if len(errors) == 0: + # If no errors, we are done; don't create a log or test result. + log.info("There are no errors in the agent log") + return True + + message = f"Detected {len(errors)} error(s) in the agent log" + self.context.lisa_log.error(message) + log.error("%s:\n\n%s\n", message, '\n'.join(['\t\t' + e.text.replace('\n', '\n\t\t') for e in errors])) + self._mark_log_as_failed() + + self._report_test_result( + self.context.environment_name, + "CheckAgentLog", + TestStatus.FAILED, + start_time, + message=message + ' - First few errors:\n' + '\n'.join([e.text for e in errors[0:3]])) + except: # pylint: disable=bare-except + log.exception("Error checking agent log") + self._report_test_result( + self.context.environment_name, + "CheckAgentLog", + TestStatus.FAILED, + start_time, + "Error checking agent log", + add_exception_stack_trace=True) + + return False + + @staticmethod + def _mark_log_as_failed(): + """ + Adds a message to indicate the log contains errors. + """ + log.info("MARKER-LOG-WITH-ERRORS") + + @staticmethod + def _report_test_result( + suite_name: str, + test_name: str, + status: TestStatus, + start_time: datetime.datetime, + message: str = "", + add_exception_stack_trace: bool = False + ) -> None: + """ + Reports a test result to the junit notifier + """ + # The junit notifier requires an initial RUNNING message in order to register the test in its internal cache. + msg: TestResultMessage = TestResultMessage() + msg.type = "AgentTestResultMessage" + msg.id_ = str(uuid.uuid4()) + msg.status = TestStatus.RUNNING + msg.suite_full_name = suite_name + msg.suite_name = msg.suite_full_name + msg.full_name = test_name + msg.name = msg.full_name + msg.elapsed = 0 + + notifier.notify(msg) + + # Now send the actual result. The notifier pipeline makes a deep copy of the message so it is OK to re-use the + # same object and just update a few fields. If using a different object, be sure that the "id_" is the same. + msg.status = status + msg.message = message + if add_exception_stack_trace: + msg.stacktrace = traceback.format_exc() + msg.elapsed = (datetime.datetime.now() - start_time).total_seconds() + + notifier.notify(msg) + + diff --git a/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py b/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py new file mode 100644 index 000000000..28fca0fad --- /dev/null +++ b/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py @@ -0,0 +1,249 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license.
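A note on the _report_test_result helper above: the JUnit notifier only registers a result it has first seen in the RUNNING state, so every result is posted twice with the same id_. A minimal sketch of that handshake; Result and notify here are hypothetical stand-ins for LISA's TestResultMessage and notifier.notify:

import uuid
from dataclasses import dataclass, field

@dataclass
class Result:  # stand-in for LISA's TestResultMessage
    id_: str = field(default_factory=lambda: str(uuid.uuid4()))
    status: str = "RUNNING"
    message: str = ""

registry = {}

def notify(msg: Result) -> None:
    # Like the junit notifier: the first (RUNNING) message registers the test;
    # later messages with the same id_ record its outcome.
    registry.setdefault(msg.id_, []).append((msg.status, msg.message))

result = Result()
notify(result)                                # first notification: register as RUNNING
result.status, result.message = "PASSED", ""
notify(result)                                # second notification: the actual outcome
assert [s for s, _ in registry[result.id_]] == ["RUNNING", "PASSED"]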
+import logging +import re +import urllib.parse + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Type + +# E0401: Unable to import 'dataclasses_json' (import-error) +from dataclasses_json import dataclass_json # pylint: disable=E0401 + +# Disable those warnings, since 'lisa' is an external, non-standard, dependency +# E0401: Unable to import 'lisa' (import-error) +# etc +from lisa import schema # pylint: disable=E0401 +from lisa.combinator import Combinator # pylint: disable=E0401 +from lisa.util import field_metadata # pylint: disable=E0401 + +from tests_e2e.orchestrator.lib.agent_test_loader import AgentTestLoader, VmImageInfo + + +@dataclass_json() +@dataclass +class AgentTestSuitesCombinatorSchema(schema.Combinator): + test_suites: str = field( + default_factory=str, metadata=field_metadata(required=True) + ) + cloud: str = field( + default_factory=str, metadata=field_metadata(required=True) + ) + location: str = field( + default_factory=str, metadata=field_metadata(required=True) + ) + image: str = field( + default_factory=str, metadata=field_metadata(required=False) + ) + vm_size: str = field( + default_factory=str, metadata=field_metadata(required=False) + ) + vm_name: str = field( + default_factory=str, metadata=field_metadata(required=False) + ) + + +class AgentTestSuitesCombinator(Combinator): + """ + The "agent_test_suites" combinator returns a list of variables that specify the environments (i.e. test VMs) that the agent + test suites must be executed on: + + * c_env_name: Unique name for the environment, e.g. "0001-com-ubuntu-server-focal-20_04-lts-westus2" + * c_marketplace_image: e.g. "Canonical UbuntuServer 18.04-LTS latest", + * c_location: e.g. "westus2", + * c_vm_size: e.g. "Standard_D2pls_v5" + * c_vhd: e.g. "https://rhel.blob.core.windows.net/images/RHEL_8_Standard-8.3.202006170423.vhd?se=..." + * c_test_suites: e.g. [AgentBvt, FastTrack] + + (c_marketplace_image, c_location, c_vm_size) and c_vhd are mutually exclusive and define the environment (i.e. the test VM) + in which the test will be executed. c_test_suites defines the test suites that should be executed in that + environment. + + The 'vm_name' runbook parameter can be used to execute the test suites on an existing VM. In that case, the combinator + generates a single item with these variables: + + * c_env_name: Name for the environment, same as vm_name + * c_vm_name: Name of the test VM + * c_location: Location of the test VM e.g. "westus2", + * c_test_suites: e.g.
[AgentBvt, FastTrack] + """ + def __init__(self, runbook: AgentTestSuitesCombinatorSchema) -> None: + super().__init__(runbook) + if self.runbook.cloud not in self._DEFAULT_LOCATIONS: + raise Exception(f"Invalid cloud: {self.runbook.cloud}") + + if self.runbook.vm_name != '' and (self.runbook.image != '' or self.runbook.vm_size != ''): + raise Exception("Invalid runbook parameters: When 'vm_name' is specified, 'image' and 'vm_size' should not be specified.") + + if self.runbook.vm_name != '': + self._environments = self.create_environment_for_existing_vm() + else: + self._environments = self.create_environment_list() + self._index = 0 + + + @classmethod + def type_name(cls) -> str: + return "agent_test_suites" + + @classmethod + def type_schema(cls) -> Type[schema.TypedSchema]: + return AgentTestSuitesCombinatorSchema + + def _next(self) -> Optional[Dict[str, Any]]: + result: Optional[Dict[str, Any]] = None + if self._index < len(self._environments): + result = self._environments[self._index] + self._index += 1 + return result + + _DEFAULT_LOCATIONS = { + "china": "china north 2", + "government": "usgovarizona", + "public": "westus2" + } + + def create_environment_for_existing_vm(self) -> List[Dict[str, Any]]: + loader = AgentTestLoader(self.runbook.test_suites) + + environment: List[Dict[str, Any]] = [ + { + "c_env_name": self.runbook.vm_name, + "c_vm_name": self.runbook.vm_name, + "c_location": self.runbook.location, + "c_test_suites": loader.test_suites, + } + ] + + log: logging.Logger = logging.getLogger("lisa") + log.info("******** Environment for existing VMs *****") + log.info( + "{ c_env_name: '%s', c_vm_name: '%s', c_location: '%s', c_test_suites: '%s' }", + environment[0]['c_env_name'], environment[0]['c_vm_name'], environment[0]['c_location'], [s.name for s in environment[0]['c_test_suites']]) + log.info("***************************") + + return environment + + def create_environment_list(self) -> List[Dict[str, Any]]: + loader = AgentTestLoader(self.runbook.test_suites) + + # + # If the runbook provides any of 'image', 'location', or 'vm_size', those values + # override any configuration values on the test suite. + # + # Check 'images' first and add them to 'runbook_images', if any + # + if self.runbook.image == "": + runbook_images = [] + else: + runbook_images = loader.images.get(self.runbook.image) + if runbook_images is None: + if not self._is_urn(self.runbook.image) and not self._is_vhd(self.runbook.image): + raise Exception(f"The 'image' parameter must be an image or image set name, a urn, or a vhd: {self.runbook.image}") + i = VmImageInfo() + i.urn = self.runbook.image # Note that this could be a URN or the URI for a VHD + i.locations = [] + i.vm_sizes = [] + runbook_images = [i] + + # + # Now walk through all the test_suites and create a list of the environments (test VMs) that need to be created. + # + environment_list: List[Dict[str, Any]] = [] + shared_environments: Dict[str, Dict[str, Any]] = {} + + for suite_info in loader.test_suites: + if len(runbook_images) > 0: + images_info = runbook_images + else: + # The test suite may be referencing multiple image sets, and sets can intersect, so we need to ensure + # we eliminate any duplicates. 
+                unique_images: Dict[str, VmImageInfo] = {}
+                for image in suite_info.images:
+                    for i in loader.images[image]:
+                        unique_images[i.urn] = i
+                images_info = unique_images.values()
+
+            for image in images_info:
+                # The URN can actually point to a VHD if the runbook provided a VHD in the 'image' parameter
+                if self._is_vhd(image.urn):
+                    marketplace_image = ""
+                    vhd = image.urn
+                    name = "vhd"
+                else:
+                    marketplace_image = image.urn
+                    vhd = ""
+                    match = AgentTestSuitesCombinator._URN.match(image.urn)
+                    if match is None:
+                        raise Exception(f"Invalid URN: {image.urn}")
+                    name = f"{match.group('offer')}-{match.group('sku')}"
+
+                # If the runbook specified a location, use it. Then try the suite location, if any. Otherwise, check if the image specifies
+                # a list of locations and use any of them. If no location is specified so far, use the default.
+                if self.runbook.location != "":
+                    location = self.runbook.location
+                elif suite_info.location != '':
+                    location = suite_info.location
+                elif len(image.locations) > 0:
+                    location = image.locations[0]
+                else:
+                    location = AgentTestSuitesCombinator._DEFAULT_LOCATIONS[self.runbook.cloud]
+
+                # If the runbook specified a VM size, use it. Else if the image specifies a list of VM sizes, use any of them. Otherwise,
+                # set the size to empty and let LISA choose it.
+                if self.runbook.vm_size != '':
+                    vm_size = self.runbook.vm_size
+                elif len(image.vm_sizes) > 0:
+                    vm_size = image.vm_sizes[0]
+                else:
+                    vm_size = ""
+
+                if suite_info.owns_vm:
+                    # create an environment for exclusive use by this suite
+                    environment_list.append({
+                        "c_marketplace_image": marketplace_image,
+                        "c_location": location,
+                        "c_vm_size": vm_size,
+                        "c_vhd": vhd,
+                        "c_test_suites": [suite_info],
+                        "c_env_name": f"{name}-{suite_info.name}"
+                    })
+                else:
+                    # add this suite to the shared environments
+                    key: str = f"{name}-{location}"
+                    if key in shared_environments:
+                        shared_environments[key]["c_test_suites"].append(suite_info)
+                    else:
+                        shared_environments[key] = {
+                            "c_marketplace_image": marketplace_image,
+                            "c_location": location,
+                            "c_vm_size": vm_size,
+                            "c_vhd": vhd,
+                            "c_test_suites": [suite_info],
+                            "c_env_name": key
+                        }
+
+        environment_list.extend(shared_environments.values())
+
+        log: logging.Logger = logging.getLogger("lisa")
+        log.info("******** Environments *****")
+        for e in environment_list:
+            log.info(
+                "{ c_marketplace_image: '%s', c_location: '%s', c_vm_size: '%s', c_vhd: '%s', c_test_suites: '%s', c_env_name: '%s' }",
+                e['c_marketplace_image'], e['c_location'], e['c_vm_size'], e['c_vhd'], [s.name for s in e['c_test_suites']], e['c_env_name'])
+        log.info("***************************")
+
+        return environment_list
+
+    _URN = re.compile(r"(?P<publisher>[^\s:]+)[\s:](?P<offer>[^\s:]+)[\s:](?P<sku>[^\s:]+)[\s:](?P<version>[^\s:]+)")
+
+    @staticmethod
+    def _is_urn(urn: str) -> bool:
+        # URNs can be given as '<publisher> <offer> <sku> <version>' or '<publisher>:<offer>:<sku>:<version>'
+        return AgentTestSuitesCombinator._URN.match(urn) is not None
+
+    @staticmethod
+    def _is_vhd(vhd: str) -> bool:
+        # VHDs are given as URIs to storage; do some basic validation, not intending to be exhaustive.
+        parsed = urllib.parse.urlparse(vhd)
+        return parsed.scheme == 'https' and parsed.netloc != "" and parsed.path != ""
diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml
new file mode 100644
index 000000000..8075725eb
--- /dev/null
+++ b/tests_e2e/orchestrator/runbook.yml
@@ -0,0 +1,142 @@
+name: WALinuxAgent
+
+testcase:
+  - criteria:
+      area: waagent
+
+extension:
+  - "./lib"
+
+variable:
+  #
+  # These variables define parameters handled by LISA.
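+  # They can be overridden from the command line; an illustrative invocation (values are placeholders)
+  # following the same pattern used by tests_e2e/pipeline/scripts/execute_tests.sh is:
+  #
+  #     lisa --runbook tests_e2e/orchestrator/runbook.yml -v subscription_id:<subscription> -v test_suites:"agent_bvt"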
+ # + - name: subscription_id + value: "" + - name: user + value: "waagent" + - name: identity_file + value: "" + is_secret: true + - name: admin_password + value: "" + is_secret: true + - name: keep_environment + value: "no" + # + # These variables define parameters for the AgentTestSuite; see the test wiki for details. + # + # NOTE: c_test_suites, generated by the AgentTestSuitesCombinator, is also a parameter + # for the AgentTestSuite + # + # Root directory for log files (optional) + - name: log_path + value: "" + is_case_visible: true + + # Whether to collect logs from the test VM + - name: collect_logs + value: "failed" + is_case_visible: true + + # Whether to skip setup of the test VM + - name: skip_setup + value: false + is_case_visible: true + + # + # These variables are parameters for the AgentTestSuitesCombinator + # + # The test suites to execute + - name: test_suites + value: "agent_bvt" + - name: cloud + value: "public" + - name: image + value: "" + - name: location + value: "" + - name: vm_size + value: "" + + # + # The values for these variables are generated by the AgentTestSuitesCombinator combinator. They are + # prefixed with "c_" to distinguish them from the rest of the variables, whose value can be set from + # the command line. + # + # c_marketplace_image, c_vm_size, c_location, and c_vhd are handled by LISA and define + # the set of test VMs that need to be created, while c_test_suites and c_env_name are parameters + # for the AgentTestSuite; the former defines the test suites that must be executed on each + # of those test VMs and the latter is the name of the environment, which is used for logging + # purposes (NOTE: the AgentTestSuite also uses c_vhd). + # + - name: c_env_name + value: "" + is_case_visible: true + - name: c_marketplace_image + value: "" + - name: c_vm_size + value: "" + - name: c_location + value: "" + - name: c_vhd + value: "" + is_case_visible: true + - name: c_test_suites + value: [] + is_case_visible: true + + # + # Set these variables to use an SSH proxy when executing the runbook + # + - name: proxy + value: False + - name: proxy_host + value: "" + - name: proxy_user + value: "foo" + - name: proxy_identity_file + value: "" + is_secret: true + +platform: + - type: azure + admin_username: $(user) + admin_private_key_file: $(identity_file) + admin_password: $(admin_password) + keep_environment: $(keep_environment) + azure: + deploy: True + subscription_id: $(subscription_id) + wait_delete: false + requirement: + core_count: + min: 2 + azure: + marketplace: $(c_marketplace_image) + vhd: $(c_vhd) + location: $(c_location) + vm_size: $(c_vm_size) + +combinator: + type: agent_test_suites + test_suites: $(test_suites) + cloud: $(cloud) + image: $(image) + location: $(location) + vm_size: $(vm_size) + +concurrency: 16 + +notifier: + - type: agent.junit + +dev: + enabled: $(proxy) + mock_tcp_ping: $(proxy) + jump_boxes: + - private_key_file: $(proxy_identity_file) + address: $(proxy_host) + username: $(proxy_user) + password: "dummy" + diff --git a/tests_e2e/orchestrator/sample_runbooks/existing_vm.yml b/tests_e2e/orchestrator/sample_runbooks/existing_vm.yml new file mode 100644 index 000000000..2a5109f41 --- /dev/null +++ b/tests_e2e/orchestrator/sample_runbooks/existing_vm.yml @@ -0,0 +1,143 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Executes the test suites on an existing VM
+#
+name: ExistingVM
+
+testcase:
+  - criteria:
+      area: waagent
+
+extension:
+  - "../lib"
+
+variable:
+  #
+  # These variables identify the existing VM and the user for SSH connections
+  #
+  - name: cloud
+    value: "public"
+  - name: subscription_id
+    value: ""
+  - name: resource_group_name
+    value: ""
+  - name: vm_name
+    value: ""
+  - name: location
    value: ""
+
+  - name: user
+    value: ""
+  - name: identity_file
+    value: ""
+    is_secret: true
+
+  #
+  # The test suites to execute
+  #
+  - name: test_suites
+    value: "agent_bvt"
+
+  #
+  # These variables define parameters for the AgentTestSuite; see the test wiki for details.
+  #
+  # NOTE: c_test_suites, generated by the AgentTestSuitesCombinator, is also a parameter
+  # for the AgentTestSuite
+  #
+  # Root directory for log files (optional)
+  - name: log_path
+    value: ""
+    is_case_visible: true
+
+  # Whether to collect logs from the test VM
+  - name: collect_logs
+    value: "failed"
+    is_case_visible: true
+
+  # Whether to skip setup of the test VM
+  - name: skip_setup
+    value: false
+    is_case_visible: true
+
+  #
+  # The values for these variables are generated by the AgentTestSuitesCombinator combinator. They are
+  # prefixed with "c_" to distinguish them from the rest of the variables, whose value can be set from
+  # the command line.
+  #
+  # c_vm_name and c_location are handled by LISA and identify the existing VM, while
+  # c_env_name and c_test_suites are parameters for the AgentTestSuite; the latter defines
+  # the test suites that must be executed on that VM.
+  #
+  - name: c_env_name
+    value: ""
+    is_case_visible: true
+  - name: c_vm_name
+    value: ""
+  - name: c_location
+    value: ""
+  - name: c_test_suites
+    value: []
+    is_case_visible: true
+
+  #
+  # Set these variables to use an SSH proxy when executing the runbook
+  #
+  - name: proxy
+    value: False
+  - name: proxy_host
+    value: ""
+  - name: proxy_user
+    value: "foo"
+  - name: proxy_identity_file
+    value: ""
+    is_secret: true
+
+platform:
+  - type: azure
+    admin_username: $(user)
+    admin_private_key_file: $(identity_file)
+    azure:
+      resource_group_name: $(resource_group_name)
+      deploy: false
+      subscription_id: $(subscription_id)
+    requirement:
+      azure:
+        name: $(c_vm_name)
+        location: $(c_location)
+
+combinator:
+  type: agent_test_suites
+  test_suites: $(test_suites)
+  cloud: $(cloud)
+  location: $(location)
+  vm_name: $(vm_name)
+
+notifier:
+  - type: env_stats
+  - type: agent.junit
+
+dev:
+  enabled: $(proxy)
+  mock_tcp_ping: $(proxy)
+  jump_boxes:
+    - private_key_file: $(proxy_identity_file)
+      address: $(proxy_host)
+      username: $(proxy_user)
+      password: "dummy"
diff --git a/tests_e2e/orchestrator/sample_runbooks/local_machine/hello_world.py b/tests_e2e/orchestrator/sample_runbooks/local_machine/hello_world.py
new file mode 100644
index 000000000..bf1a44a5c
--- /dev/null
+++ b/tests_e2e/orchestrator/sample_runbooks/local_machine/hello_world.py
@@ -0,0 +1,32 @@
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# E0401: Unable to import 'lisa' (import-error) +from lisa import ( # pylint: disable=E0401 + Logger, + Node, + TestCaseMetadata, + TestSuite, + TestSuiteMetadata, +) + + +@TestSuiteMetadata(area="sample", category="", description="") +class HelloWorld(TestSuite): + @TestCaseMetadata(description="") + def main(self, node: Node, log: Logger) -> None: + log.info(f"Hello world from {node.os.name}!") diff --git a/tests_e2e/orchestrator/sample_runbooks/local_machine/local.yml b/tests_e2e/orchestrator/sample_runbooks/local_machine/local.yml new file mode 100644 index 000000000..c397159f8 --- /dev/null +++ b/tests_e2e/orchestrator/sample_runbooks/local_machine/local.yml @@ -0,0 +1,32 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Executes the test suites on the local machine +# + +extension: + - "." +environment: + environments: + - nodes: + - type: local +notifier: + - type: console +testcase: + - criteria: + area: sample diff --git a/tests_e2e/orchestrator/scripts/check-agent-log.py b/tests_e2e/orchestrator/scripts/check-agent-log.py new file mode 100755 index 000000000..8807f8046 --- /dev/null +++ b/tests_e2e/orchestrator/scripts/check-agent-log.py @@ -0,0 +1,49 @@ +#!/usr/bin/env pypy3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
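+#
+# Illustrative usage (the path argument is optional and defaults to /var/log/waagent.log):
+#
+#     check-agent-log.py /var/log/waagent.log --json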
+#
+
+import argparse
+import json
+import sys
+
+from pathlib import Path
+from tests_e2e.tests.lib.agent_log import AgentLog
+
+try:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', nargs='?', help='Path of the log file', default='/var/log/waagent.log')
+    parser.add_argument('-j', '--json', action='store_true', help='Produce a JSON report')
+    parser.set_defaults(json=False)
+    args = parser.parse_args()
+
+    error_list = AgentLog(Path(args.path)).get_errors()
+
+    if args.json:
+        print(json.dumps(error_list, default=lambda o: o.__dict__))
+    else:
+        if len(error_list) == 0:
+            print("No errors were found.")
+        else:
+            for e in error_list:
+                print(e.text)
+
+except Exception as e:
+    print(f"{e}", file=sys.stderr)
+    sys.exit(1)
+
+sys.exit(0)
diff --git a/tests_e2e/orchestrator/scripts/collect-logs b/tests_e2e/orchestrator/scripts/collect-logs
new file mode 100755
index 000000000..eadf0483a
--- /dev/null
+++ b/tests_e2e/orchestrator/scripts/collect-logs
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+#
+# Collects the logs needed to debug agent issues into a compressed tarball.
+#
+
+# Note that we do "set -euxo pipefail" only after executing "tar" and capturing its exit code. That command
+# exits with code 1 on warnings and we do not want to consider those as failures.
+
+logs_file_name="/tmp/waagent-logs.tgz"
+
+echo "Collecting logs to $logs_file_name ..."
+
+tar --exclude='journal/*' --exclude='omsbundle' --exclude='omsagent' --exclude='mdsd' --exclude='scx*' \
+    --exclude='*.so' --exclude='*__LinuxDiagnostic__*' --exclude='*.zip' --exclude='*.deb' --exclude='*.rpm' \
+    --warning=no-file-changed \
+    -czf "$logs_file_name" \
+    /var/log \
+    /var/lib/waagent/ \
+    /etc/waagent.conf
+
+# Capture tar's exit code now, before 'set -e' takes effect; warnings (exit code 1) are ignored below
+exit_code=$?
+
+set -euxo pipefail
+
+if [ "$exit_code" == "1" ]; then
+    echo "WARNING: tar exit code is 1"
+elif [ "$exit_code" != "0" ]; then
+    exit $exit_code
+fi
+
+chmod a+r "$logs_file_name"
+
+ls -l "$logs_file_name"
diff --git a/tests_e2e/orchestrator/scripts/get-agent-bin-path b/tests_e2e/orchestrator/scripts/get-agent-bin-path
new file mode 100755
index 000000000..e2e44f453
--- /dev/null
+++ b/tests_e2e/orchestrator/scripts/get-agent-bin-path
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Returns the path for the 'waagent' command.
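+#
+# (lookup order: 'waagent' on $PATH, then the command line of the running daemon via /proc, then a
+# list of well-known locations)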
+# +set -euo pipefail + +# On most distros, 'waagent' is in PATH +if which waagent 2> /dev/null; then + exit 0 +fi + +# if the agent is running, get the path from 'cmdline' in the /proc file system +if test -e /run/waagent.pid; then + cmdline="/proc/$(cat /run/waagent.pid)/cmdline" + if test -e "$cmdline"; then + # cmdline is a sequence of null-terminated strings; break into lines and look for waagent + if tr '\0' '\n' < "$cmdline" | grep waagent; then + exit 0 + fi + fi +fi + +# try some well-known locations +declare -a known_locations=( + "/usr/sbin/waagent" + "/usr/share/oem/bin/waagent" +) + +for path in "${known_locations[@]}" +do + if [[ -e $path ]]; then + echo "$path" + exit 0 + fi +done + +echo "Can't find the path for the 'waagent' command" >&2 +exit 1 diff --git a/tests_e2e/orchestrator/scripts/get-agent-modules-path b/tests_e2e/orchestrator/scripts/get-agent-modules-path new file mode 100755 index 000000000..5493b96d9 --- /dev/null +++ b/tests_e2e/orchestrator/scripts/get-agent-modules-path @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Returns the PYTHONPATH on which the azurelinuxagent and associated modules are located. +# +# To do this, the script walks the site packages for the Python used to execute the agent, +# looking for the directory that contains "azurelinuxagent". +# +set -euo pipefail + +$(get-agent-python) -c ' +import site +import os + +for dir in site.getsitepackages(): + if os.path.isdir(dir + "/azurelinuxagent"): + print(dir) + exit(0) +exit(1) +' diff --git a/tests_e2e/orchestrator/scripts/get-agent-python b/tests_e2e/orchestrator/scripts/get-agent-python new file mode 100755 index 000000000..049732d45 --- /dev/null +++ b/tests_e2e/orchestrator/scripts/get-agent-python @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Returns the path of the Python executable used to start the Agent. 
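+#
+# (probes, in order: the 'exe' link of the running daemon via /proc, any python/python3 on $PATH that
+# can import 'azurelinuxagent', then a list of well-known locations)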
+#
+set -euo pipefail
+
+# if the agent is running, get the python command from 'exe' in the /proc file system
+if test -e /run/waagent.pid; then
+    exe="/proc/$(cat /run/waagent.pid)/exe"
+    if test -e "$exe"; then
+        # exe is a symbolic link; return its target
+        readlink -f "$exe"
+        exit 0
+    fi
+fi
+
+# try all the instances of 'python' and 'python3' in $PATH
+for path in $(echo "$PATH" | tr ':' '\n'); do
+    if [[ -e $path ]]; then
+        for python in $(find "$path" -maxdepth 1 -name python3 -or -name python); do
+            if $python -c 'import azurelinuxagent' 2> /dev/null; then
+                echo "$python"
+                exit 0
+            fi
+        done
+    fi
+done
+
+# try some well-known locations
+declare -a known_locations=(
+    "/usr/share/oem/python/bin/python"
+)
+for python in "${known_locations[@]}"
+do
+    if $python -c 'import azurelinuxagent' 2> /dev/null; then
+        echo "$python"
+        exit 0
+    fi
+done
+
+exit 1
diff --git a/tests_e2e/orchestrator/scripts/install-agent b/tests_e2e/orchestrator/scripts/install-agent
new file mode 100755
index 000000000..14663d0b8
--- /dev/null
+++ b/tests_e2e/orchestrator/scripts/install-agent
@@ -0,0 +1,137 @@
+#!/usr/bin/env bash
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+set -euo pipefail
+
+usage() (
+    echo "Usage: install-agent -p|--package <package> -v|--version <version>"
+    exit 1
+)
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -p|--package)
+            shift
+            if [ "$#" -lt 1 ]; then
+                usage
+            fi
+            package=$1
+            shift
+            ;;
+        -v|--version)
+            shift
+            if [ "$#" -lt 1 ]; then
+                usage
+            fi
+            version=$1
+            shift
+            ;;
+        *)
+            usage
+    esac
+done
+if [ "$#" -ne 0 ] || [ -z ${package+x} ] || [ -z ${version+x} ]; then
+    usage
+fi
+
+#
+# Find the command to manage services
+#
+if command -v systemctl &> /dev/null; then
+    service-status() { systemctl --no-pager -l status $1; }
+    service-stop() { systemctl stop $1; }
+    service-start() { systemctl start $1; }
+else
+    service-status() { service $1 status; }
+    service-stop() { service $1 stop; }
+    service-start() { service $1 start; }
+fi
+
+#
+# Find the service name (walinuxagent in Ubuntu and waagent elsewhere)
+#
+if service-status walinuxagent > /dev/null 2>&1;then
+    service_name="walinuxagent"
+else
+    service_name="waagent"
+fi
+echo "Service name: $service_name"
+
+#
+# Output the initial version of the agent
+#
+python=$(get-agent-python)
+waagent=$(get-agent-bin-path)
+echo "Agent's path: $waagent"
+$python "$waagent" --version
+printf "\n"
+
+#
+# Install the package
+#
+echo "Installing $package as version $version..."
+unzip.py "$package" "/var/lib/waagent/WALinuxAgent-$version"
+
+# Ensure that AutoUpdate is enabled. Some distros, e.g. Flatcar, don't have a waagent.conf
+# but AutoUpdate defaults to True so there is no need to do anything in that case.
+if [[ -e /etc/waagent.conf ]]; then
+    sed -i 's/AutoUpdate.Enabled=n/AutoUpdate.Enabled=y/g' /etc/waagent.conf
+fi
+
+#
+# Restart the service
+#
+echo "Restarting service..."
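+# (reviewer note: the service is stopped before rotating the log so that the new waagent.log, created
+# on start, contains only output from the agent installed above)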
+service-stop $service_name
+
+# Rename the previous log to ensure the new log starts with the agent we just installed
+mv /var/log/waagent.log /var/log/waagent."$(date --iso-8601=seconds)".log
+
+service-start $service_name
+
+#
+# Verify that the new agent is running and output its status.
+# Note that the extension handler may take some time to start, so give it up to 1 minute.
+#
+echo "Verifying agent installation..."
+
+check-version() {
+    for i in {0..5}
+    do
+        if $python "$waagent" --version | grep -E "Goal state agent:\s+$version" > /dev/null; then
+            return 0
+        fi
+        sleep 10
+    done
+
+    return 1
+}
+
+if check-version "$version"; then
+    printf "\nThe agent was installed successfully\n"
+    exit_code=0
+else
+    printf "\nFailed to install agent.\n"
+    exit_code=1
+fi
+
+$python "$waagent" --version
+printf "\n"
+service-status $service_name
+
+exit $exit_code
diff --git a/tests_e2e/orchestrator/scripts/install-tools b/tests_e2e/orchestrator/scripts/install-tools
new file mode 100755
index 000000000..2e2dd53fb
--- /dev/null
+++ b/tests_e2e/orchestrator/scripts/install-tools
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Installs the tools in ~/bin/scripts/* to ~/bin, as well as Pypy.
+#
+# It also makes Pypy the default python for the current user.
+#
+
+set -euo pipefail
+
+PATH="$HOME/bin:$PATH"
+
+python=$(get-agent-python)
+echo "Python executable: $python"
+echo "Python version: $($python --version)"
+
+#
+# Install Pypy as ~/bin/pypy3
+#
+# Note that bzip2/lbzip2 (used by tar to uncompress *.bz2 files) are not available by default in some distros;
+# use Python to uncompress the Pypy tarball.
+#
+echo "Installing Pypy 3.7"
+$python ~/bin/uncompress.py ~/tmp/pypy3.7-*.tar.bz2 ~/tmp/pypy3.7.tar
+tar xf ~/tmp/pypy3.7.tar -C ~/bin
+echo "Pypy was installed in $(ls -d ~/bin/pypy*)"
+ln -s ~/bin/pypy*/bin/pypy3.7 ~/bin/pypy3
+echo "Creating symbolic link to Pypy: ~/bin/pypy3"
+
+#
+# The 'distro' and 'platform' modules in Pypy have small differences from the ones in the system's Python.
+# This can create problems in tests that use the get_distro() method in the Agent's 'version.py' module.
+# To work around this, we copy the system's 'distro' module to Pypy.
+#
+# In the case of 'platform', the 'linux_distribution' method was removed in Python 3.8, so we check the
+# system's module and, if the method does not exist, we also remove it from Pypy. Ubuntu 16 and 18 are
+# special cases in that the 'platform' module in Pypy identifies the distro as 'debian'; in this case we
+# copy the system's 'platform' module to Pypy.
+#
+distro_path=$($python -c '
+try:
+    import distro
+except:
+    exit(0)
+print(distro.__file__.replace("__init__.py", "distro.py"))
+exit(0)
+')
+if [[ "$distro_path" != "" ]]; then
+    echo "Copying the system's distro module to Pypy"
+    cp -v "$distro_path" ~/bin/pypy*/site-packages
+else
+    echo "The distro module is not installed on the system; skipping."
+fi
+
+has_linux_distribution=$($python -c 'import platform; print(hasattr(platform, "linux_distribution"))')
+if [[ "$has_linux_distribution" == "False" ]]; then
+    echo "Python does not have platform.linux_distribution; removing it from Pypy"
+    sed -i 's/def linux_distribution(/def __linux_distribution__(/' ~/bin/pypy*/lib-python/3/platform.py
+else
+    echo "Python has platform.linux_distribution"
+    uname=$(uname -v)
+    if [[ "$uname" == *~18*-Ubuntu* || "$uname" == *~16*-Ubuntu* ]]; then
+        echo "Copying the system's platform module to Pypy"
+        pypy_platform=$(pypy3 -c 'import platform; print(platform.__file__)')
+        python_platform=$($python -c 'import platform; print(platform.__file__)')
+        cp -v "$python_platform" "$pypy_platform"
+    fi
+fi
+
+#
+# Now install the test Agent as a module package in Pypy.
+#
+echo "Installing Agent modules to Pypy"
+unzip.py ~/tmp/WALinuxAgent-*.zip ~/tmp/WALinuxAgent
+unzip.py ~/tmp/WALinuxAgent/bin/WALinuxAgent-*.egg ~/tmp/WALinuxAgent/bin/WALinuxAgent.egg
+mv ~/tmp/WALinuxAgent/bin/WALinuxAgent.egg/azurelinuxagent ~/bin/pypy*/site-packages
+
+#
+# Log the results of get_distro() in Pypy and Python.
+#
+pypy_get_distro=$(pypy3 -c 'from azurelinuxagent.common.version import get_distro; print(get_distro())')
+python_get_distro=$($python -c 'from azurelinuxagent.common.version import get_distro; print(get_distro())')
+echo "Pypy get_distro(): $pypy_get_distro"
+echo "Python get_distro(): $python_get_distro"
+
+#
+# Create ~/bin/set-agent-env to set PATH and PYTHONPATH.
+#
+# We prepend $HOME/bin to PATH and set PYTHONPATH to $HOME/lib (bin contains the scripts used by tests, while
+# lib contains the Python libraries used by tests).
+#
+echo "Creating ~/bin/set-agent-env to set PATH and PYTHONPATH"
+
+echo "
+if [[ \$PATH != *\"$HOME/bin\"* ]]; then
+    PATH=\"$HOME/bin:\$PATH\"
+fi
+
+export PYTHONPATH=\"$HOME/lib\"
+" > ~/bin/set-agent-env
+
+chmod u+x ~/bin/set-agent-env
+
+#
+# Add ~/bin/set-agent-env to .bash_profile to simplify interactive debugging sessions
+#
+# Note that in some distros .bash_profile is a symbolic link to a read-only file. Make a copy in that case.
+#
+echo "Adding ~/bin/set-agent-env to ~/.bash_profile"
+if test -e ~/.bash_profile && ls -l ~/.bash_profile | grep '\->'; then
+    cp ~/.bash_profile ~/.bash_profile-bk
+    rm ~/.bash_profile
+    mv ~/.bash_profile-bk ~/.bash_profile
+fi
+if ! test -e ~/.bash_profile || ! grep '~/bin/set-agent-env' ~/.bash_profile > /dev/null; then
+    echo 'source ~/bin/set-agent-env
+' >> ~/.bash_profile
+fi
diff --git a/tests_e2e/orchestrator/scripts/uncompress.py b/tests_e2e/orchestrator/scripts/uncompress.py
new file mode 100755
index 000000000..755397cf3
--- /dev/null
+++ b/tests_e2e/orchestrator/scripts/uncompress.py
@@ -0,0 +1,33 @@
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
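+#
+# Illustrative invocation (paths are examples, matching how install-tools calls this script):
+#
+#     python3 uncompress.py ~/tmp/pypy3.7-*.tar.bz2 ~/tmp/pypy3.7.tar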
+#
+
+#
+# Un-compresses a bz2 file
+import argparse
+import bz2
+import shutil
+
+parser = argparse.ArgumentParser()
+parser.add_argument('source', help='File to uncompress')
+parser.add_argument('target', help='Output file')
+
+args = parser.parse_args()
+
+with bz2.BZ2File(args.source, 'rb') as f_in:
+    with open(args.target, 'wb') as f_out:
+        shutil.copyfileobj(f_in, f_out)
diff --git a/tests_e2e/orchestrator/scripts/unzip.py b/tests_e2e/orchestrator/scripts/unzip.py
new file mode 100755
index 000000000..b909d6ae7
--- /dev/null
+++ b/tests_e2e/orchestrator/scripts/unzip.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env pypy3
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import sys
+import zipfile
+
+try:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('source', help='ZIP package to expand')
+    parser.add_argument('target', help='Destination directory')
+
+    args = parser.parse_args()
+
+    zipfile.ZipFile(args.source).extractall(args.target)
+
+except Exception as e:
+    print(f"{e}", file=sys.stderr)
+    sys.exit(1)
+
+sys.exit(0)
diff --git a/tests_e2e/pipeline/pipeline-cleanup.yml b/tests_e2e/pipeline/pipeline-cleanup.yml
new file mode 100644
index 000000000..ba880a4f4
--- /dev/null
+++ b/tests_e2e/pipeline/pipeline-cleanup.yml
@@ -0,0 +1,58 @@
+#
+# Pipeline for cleaning up any remaining Resource Groups generated by the Azure.WALinuxAgent pipeline.
+#
+# Deletes any resource groups that are more than a day old and contain the string "lisa-WALinuxAgent-"
+#
+schedules:
+  - cron: "0 */12 * * *"  # Run twice a day (every 12 hours)
+    displayName: cleanup build
+    branches:
+      include:
+        - develop
+    always: true
+
+trigger:
+  - develop
+
+pr: none
+
+pool:
+  vmImage: ubuntu-latest
+
+variables:
+  - name: azureConnection
+    value: 'azuremanagement'
+  - name: rgPrefix
+    value: 'lisa-WALinuxAgent-'
+
+steps:
+
+  - task: AzureKeyVault@2
+    displayName: "Fetch secrets from KV"
+    inputs:
+      azureSubscription: '$(azureConnection)'
+      KeyVaultName: 'dcrV2SPs'
+      SecretsFilter: '*'
+      RunAsPreJob: true
+
+  - task: AzureCLI@2
+    inputs:
+      azureSubscription: '$(azureConnection)'
+      scriptType: 'bash'
+      scriptLocation: 'inlineScript'
+      inlineScript: |
+        set -euxo pipefail
+        date=`date --utc +%Y-%m-%d'T'%H:%M:%S.%N'Z' -d "1 day ago"`
+
+        # Using the Azure REST GET resourceGroups API call as we can add the createdTime to the results.
+        # This feature is not available via the az-cli commands directly so we have to use the Azure REST APIs
+
+        az rest --method GET \
+          --url "https://management.azure.com/subscriptions/$(SUBSCRIPTION-ID)/resourcegroups" \
+          --url-parameters api-version=2021-04-01 \$expand=createdTime \
+          --output json \
+          --query value \
+        | jq --arg date "$date" '.[] | select (.createdTime < $date).name' \
+        | grep "$(rgPrefix)" \
+        | xargs -l -t -r az group delete --no-wait -y -n \
+        || echo "No resource groups found to delete"
diff --git a/tests_e2e/pipeline/pipeline.yml b/tests_e2e/pipeline/pipeline.yml
new file mode 100644
index 000000000..1de541634
--- /dev/null
+++ b/tests_e2e/pipeline/pipeline.yml
@@ -0,0 +1,119 @@
+# variables:
+  #
+  # NOTE: When creating the pipeline, "connection_info" must be added as a variable pointing to the
+  # corresponding key vault; see wiki for details.
+  #
+
+parameters:
+  # See the test wiki for a description of the parameters
+  - name: test_suites
+    displayName: Test Suites
+    type: string
+    default: agent_bvt
+
+  # NOTES:
+  #     * 'image', 'location' and 'vm_size' override any values in the test suites/images definition
+  #       files. Those parameters are useful for one-off tests, like testing a VHD or checking if
+  #       an image is supported in a particular location.
+  #     * Azure Pipelines does not allow an empty string for a parameter value, so we use "-" instead.
+  #
+  - name: image
+    displayName: Image (image/image set name, URN, or VHD)
+    type: string
+    default: "-"
+
+  - name: location
+    displayName: Location (region)
+    type: string
+    default: "-"
+
+  - name: vm_size
+    displayName: VM size
+    type: string
+    default: "-"
+
+  - name: collect_logs
+    displayName: Collect logs from test VMs
+    type: string
+    default: failed
+    values:
+      - always
+      - failed
+      - no
+
+  - name: keep_environment
+    displayName: Keep the test VMs (do not delete them)
+    type: string
+    default: no
+    values:
+      - always
+      - failed
+      - no
+
+trigger:
+  - develop
+
+pr: none
+
+pool:
+  vmImage: ubuntu-latest
+
+jobs:
+  - job: "ExecuteTests"
+
+    steps:
+      - task: UsePythonVersion@0
+        displayName: "Set Python Version"
+        inputs:
+          versionSpec: '3.10'
+          addToPath: true
+          architecture: 'x64'
+
+      # Extract the Azure cloud from the "connection_info" variable and store it in the "cloud" variable.
+      # The cloud name is used as a suffix of the value for "connection_info" and comes after the last '-'.
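+      # For example (illustrative): a "connection_info" value ending in "-public" yields cloud = "public".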
+      - bash: echo "##vso[task.setvariable variable=cloud]$(echo $CONNECTION_INFO | sed 's/^.*-//')"
+        displayName: "Set Cloud type"
+
+      - task: DownloadSecureFile@1
+        name: downloadSshKey
+        displayName: "Download SSH key"
+        inputs:
+          secureFile: 'id_rsa'
+
+      - task: AzureKeyVault@2
+        displayName: "Fetch connection info"
+        inputs:
+          azureSubscription: 'azuremanagement'
+          KeyVaultName: '$(connection_info)'
+          SecretsFilter: '*'
+
+      - bash: $(Build.SourcesDirectory)/tests_e2e/pipeline/scripts/execute_tests.sh
+        displayName: "Execute tests"
+        continueOnError: true
+        env:
+          SUBSCRIPTION_ID: $(SUBSCRIPTION-ID)
+          AZURE_CLIENT_ID: $(AZURE-CLIENT-ID)
+          AZURE_CLIENT_SECRET: $(AZURE-CLIENT-SECRET)
+          AZURE_TENANT_ID: $(AZURE-TENANT-ID)
+          CR_USER: $(CR-USER)
+          CR_SECRET: $(CR-SECRET)
+          CLOUD: ${{ variables.cloud }}
+          COLLECT_LOGS: ${{ parameters.collect_logs }}
+          IMAGE: ${{ parameters.image }}
+          KEEP_ENVIRONMENT: ${{ parameters.keep_environment }}
+          LOCATION: ${{ parameters.location }}
+          TEST_SUITES: ${{ parameters.test_suites }}
+          VM_SIZE: ${{ parameters.vm_size }}
+
+      - publish: $(Build.ArtifactStagingDirectory)
+        artifact: 'artifacts'
+        displayName: 'Publish test artifacts'
+
+      - task: PublishTestResults@2
+        displayName: 'Publish test results'
+        inputs:
+          testResultsFormat: 'JUnit'
+          testResultsFiles: 'runbook_logs/agent.junit.xml'
+          searchFolder: $(Build.ArtifactStagingDirectory)
+          failTaskOnFailedTests: true
+
diff --git a/tests_e2e/pipeline/scripts/execute_tests.sh b/tests_e2e/pipeline/scripts/execute_tests.sh
new file mode 100755
index 000000000..15c9f0b5f
--- /dev/null
+++ b/tests_e2e/pipeline/scripts/execute_tests.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+
+set -euxo pipefail
+
+#
+# UID of 'waagent' in the Docker container
+#
+WAAGENT_UID=1000
+
+#
+# Set the correct mode and owner for the private SSH key and generate the public key.
+#
+cd "$HOME"
+mkdir ssh
+cp "$DOWNLOADSSHKEY_SECUREFILEPATH" ssh
+chmod 700 ssh/id_rsa
+ssh-keygen -y -f ssh/id_rsa > ssh/id_rsa.pub
+sudo find ssh -exec chown "$WAAGENT_UID" {} \;
+
+#
+# Allow write access to the sources directory. This is needed because building the agent package (within the Docker
+# container) writes the egg info to that directory.
+#
+chmod a+w "$BUILD_SOURCESDIRECTORY"
+
+#
+# Create the directory where the Docker container will create the test logs and give ownership to 'waagent'
+#
+LOGS_DIRECTORY="$HOME/logs"
+mkdir "$LOGS_DIRECTORY"
+sudo chown "$WAAGENT_UID" "$LOGS_DIRECTORY"
+
+#
+# Pull the container image used to execute the tests
+#
+az acr login --name waagenttests --username "$CR_USER" --password "$CR_SECRET"
+
+docker pull waagenttests.azurecr.io/waagenttests:latest
+
+# Azure Pipelines does not allow an empty string as the value for a pipeline parameter; instead we use "-" to indicate
+# an empty value. Change "-" to "" for the variables that capture the parameter values.
+if [[ $IMAGE == "-" ]]; then
+    IMAGE=""
+fi
+if [[ $LOCATION == "-" ]]; then
+    LOCATION=""
+fi
+if [[ $VM_SIZE == "-" ]]; then
+    VM_SIZE=""
+fi
+
+# A test failure will cause the test automation to exit with an error code; we don't want this script to stop on that,
+# so we force the command to succeed and capture its exit code, which we return at the end of the script.
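+# (i.e. /tmp/exit.sh will end up containing either "exit 0" or "exit <code>"; it is executed as the last
+# step of this script)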
+echo "exit 0" > /tmp/exit.sh + +docker run --rm \ + --volume "$BUILD_SOURCESDIRECTORY:/home/waagent/WALinuxAgent" \ + --volume "$HOME"/ssh:/home/waagent/.ssh \ + --volume "$LOGS_DIRECTORY":/home/waagent/logs \ + --env AZURE_CLIENT_ID \ + --env AZURE_CLIENT_SECRET \ + --env AZURE_TENANT_ID \ + waagenttests.azurecr.io/waagenttests \ + bash --login -c \ + "lisa \ + --runbook \$HOME/WALinuxAgent/tests_e2e/orchestrator/runbook.yml \ + --log_path \$HOME/logs/lisa \ + --working_path \$HOME/tmp \ + -v cloud:$CLOUD \ + -v subscription_id:$SUBSCRIPTION_ID \ + -v identity_file:\$HOME/.ssh/id_rsa \ + -v test_suites:\"$TEST_SUITES\" \ + -v log_path:\$HOME/logs \ + -v collect_logs:\"$COLLECT_LOGS\" \ + -v keep_environment:\"$KEEP_ENVIRONMENT\" \ + -v image:\"$IMAGE\" \ + -v location:\"$LOCATION\" \ + -v vm_size:\"$VM_SIZE\"" \ +|| echo "exit $?" > /tmp/exit.sh + +# +# Re-take ownership of the logs directory +# +sudo find "$LOGS_DIRECTORY" -exec chown "$USER" {} \; + +# +# Move the relevant logs to the staging directory +# +# Move the logs for failed tests to a temporary location +mkdir "$BUILD_ARTIFACTSTAGINGDIRECTORY"/tmp +for log in $(grep -l MARKER-LOG-WITH-ERRORS "$LOGS_DIRECTORY"/*.log); do + mv "$log" "$BUILD_ARTIFACTSTAGINGDIRECTORY"/tmp +done +# Move the environment logs to "environment_logs" +if ls "$LOGS_DIRECTORY"/env-*.log > /dev/null 2>&1; then + mkdir "$BUILD_ARTIFACTSTAGINGDIRECTORY"/environment_logs + mv "$LOGS_DIRECTORY"/env-*.log "$BUILD_ARTIFACTSTAGINGDIRECTORY"/environment_logs +fi +# Move the rest of the logs to "test_logs" +if ls "$LOGS_DIRECTORY"/*.log > /dev/null 2>&1; then + mkdir "$BUILD_ARTIFACTSTAGINGDIRECTORY"/test_logs + mv "$LOGS_DIRECTORY"/*.log "$BUILD_ARTIFACTSTAGINGDIRECTORY"/test_logs +fi +# Move the logs for failed tests to the main directory +if ls "$BUILD_ARTIFACTSTAGINGDIRECTORY"/tmp/*.log > /dev/null 2>&1; then + mv "$BUILD_ARTIFACTSTAGINGDIRECTORY"/tmp/*.log "$BUILD_ARTIFACTSTAGINGDIRECTORY" +fi +rmdir "$BUILD_ARTIFACTSTAGINGDIRECTORY"/tmp +# Move the logs collected from the test VMs to vm_logs +if ls "$LOGS_DIRECTORY"/*.tgz > /dev/null 2>&1; then + mkdir "$BUILD_ARTIFACTSTAGINGDIRECTORY"/vm_logs + mv "$LOGS_DIRECTORY"/*.tgz "$BUILD_ARTIFACTSTAGINGDIRECTORY"/vm_logs +fi +# Files created by LISA are under .../lisa//" +mkdir "$BUILD_ARTIFACTSTAGINGDIRECTORY"/runbook_logs +mv "$LOGS_DIRECTORY"/lisa/*/*/lisa-*.log "$BUILD_ARTIFACTSTAGINGDIRECTORY"/runbook_logs +mv "$LOGS_DIRECTORY"/lisa/*/*/agent.junit.xml "$BUILD_ARTIFACTSTAGINGDIRECTORY"/runbook_logs + +cat /tmp/exit.sh +bash /tmp/exit.sh diff --git a/tests_e2e/test_suites/agent_bvt.yml b/tests_e2e/test_suites/agent_bvt.yml new file mode 100644 index 000000000..1f0f91405 --- /dev/null +++ b/tests_e2e/test_suites/agent_bvt.yml @@ -0,0 +1,8 @@ +name: "AgentBvt" +tests: + - "bvts/extension_operations.py" + - "bvts/run_command.py" + - "bvts/vm_access.py" +images: + - "endorsed" + - "endorsed-arm64" diff --git a/tests_e2e/test_suites/fail.yml b/tests_e2e/test_suites/fail.yml new file mode 100644 index 000000000..6cd3b01af --- /dev/null +++ b/tests_e2e/test_suites/fail.yml @@ -0,0 +1,5 @@ +name: "Fail" +tests: + - "fail_test.py" + - "error_test.py" +images: "ubuntu_1804" diff --git a/tests_e2e/test_suites/images.yml b/tests_e2e/test_suites/images.yml new file mode 100644 index 000000000..253f8a138 --- /dev/null +++ b/tests_e2e/test_suites/images.yml @@ -0,0 +1,94 @@ +# +# Image sets are used to group images +# +image-sets: + # Endorsed distros that are tested on the daily runs + endorsed: +# +# TODO: Add CentOS 6.10 
+#
+#   - "centos_610"
+#   - "debian_8"
+#
+    - "alma_9"
+    - "centos_79"
+    - "debian_9"
+    - "debian_10"
+    - "debian_11"
+    - "flatcar"
+    - "suse_12"
+    - "mariner_1"
+    - "mariner_2"
+    - "suse_15"
+    - "rhel_79"
+    - "rhel_82"
+    - "rhel_90"
+    - "rocky_9"
+    - "ubuntu_1604"
+    - "ubuntu_1804"
+    - "ubuntu_2004"
+    - "ubuntu_2204"
+
+  # Endorsed distros (ARM64) that are tested on the daily runs
+  endorsed-arm64:
+    - "debian_11_arm64"
+    - "flatcar_arm64"
+    - "mariner_2_arm64"
+    - "rhel_90_arm64"
+    - "ubuntu_2204_arm64"
+
+#
+# An image can be specified by a string giving its URN, as in
+#
+#    ubuntu_2004: "Canonical 0001-com-ubuntu-server-focal 20_04-lts latest"
+#
+# or by an object with 3 properties: urn, locations and vm_sizes, as in
+#
+#    mariner_2_arm64:
+#        urn: "microsoftcblmariner cbl-mariner cbl-mariner-2-arm64 latest"
+#        locations:
+#            - "eastus"
+#        vm_sizes:
+#            - "Standard_D2pls_v5"
+#
+# 'urn' is required, while 'locations' and 'vm_sizes' are optional. The latter
+# two properties can be used to specify that the image is available only in
+# some locations, or that it can be used only on some VM sizes.
+#
+# URNs follow the format '<publisher> <offer> <sku> <version>' or
+# '<publisher>:<offer>:<sku>:<version>'
+#
+images:
+  alma_9: "almalinux almalinux 9-gen2 latest"
+  centos_610: "OpenLogic CentOS 6.10 latest"
+  centos_79: "OpenLogic CentOS 7_9 latest"
+  debian_8: "credativ Debian 8 latest"
+  debian_9: "credativ Debian 9 latest"
+  debian_10: "Debian debian-10 10 latest"
+  debian_11: "Debian debian-11 11 latest"
+  debian_11_arm64: "Debian debian-11 11-backports-arm64 latest"
+  flatcar: "kinvolk flatcar-container-linux-free stable latest"
+  flatcar_arm64:
+    urn: "kinvolk flatcar-container-linux-corevm stable latest"
+    vm_sizes:
+      - "Standard_D2pls_v5"
+  mariner_1: "microsoftcblmariner cbl-mariner cbl-mariner-1 latest"
+  mariner_2: "microsoftcblmariner cbl-mariner cbl-mariner-2 latest"
+  mariner_2_arm64:
+    urn: "microsoftcblmariner cbl-mariner cbl-mariner-2-arm64 latest"
+    locations:
+      - "eastus"
+    vm_sizes:
+      - "Standard_D2pls_v5"
+  rocky_9: "erockyenterprisesoftwarefoundationinc1653071250513 rockylinux-9 rockylinux-9 latest"
+  suse_12: "SUSE sles-12-sp5-basic gen1 latest"
+  suse_15: "SUSE sles-15-sp2-basic gen2 latest"
+  rhel_79: "RedHat RHEL 7_9 latest"
+  rhel_82: "RedHat RHEL 8.2 latest"
+  rhel_90: "RedHat RHEL 9_0 latest"
+  rhel_90_arm64: "RedHat rhel-arm64 9_0-arm64 latest"
+  ubuntu_1604: "Canonical UbuntuServer 16.04-LTS latest"
+  ubuntu_1804: "Canonical UbuntuServer 18.04-LTS latest"
+  ubuntu_2004: "Canonical 0001-com-ubuntu-server-focal 20_04-lts latest"
+  ubuntu_2204: "Canonical 0001-com-ubuntu-server-jammy 22_04-lts latest"
+  ubuntu_2204_arm64: "Canonical 0001-com-ubuntu-server-jammy 22_04-lts-arm64 latest"
diff --git a/tests_e2e/test_suites/pass.yml b/tests_e2e/test_suites/pass.yml
new file mode 100644
index 000000000..40b0e60b4
--- /dev/null
+++ b/tests_e2e/test_suites/pass.yml
@@ -0,0 +1,4 @@
+name: "Pass"
+tests:
+  - "pass_test.py"
+images: "ubuntu_2004"
diff --git a/dcr/__init__.py b/tests_e2e/tests/__init__.py
similarity index 100%
rename from dcr/__init__.py
rename to tests_e2e/tests/__init__.py
diff --git a/dcr/scenario_utils/__init__.py b/tests_e2e/tests/bvts/__init__.py
similarity index 100%
rename from dcr/scenario_utils/__init__.py
rename to tests_e2e/tests/bvts/__init__.py
diff --git a/tests_e2e/tests/bvts/extension_operations.py b/tests_e2e/tests/bvts/extension_operations.py
new file mode 100755
index 000000000..e8a45ee44
--- /dev/null
+++ b/tests_e2e/tests/bvts/extension_operations.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# BVT for extension operations (Install/Enable/Update/Uninstall).
+#
+# The test executes an older version of an extension, then updates it to a newer version, and lastly
+# it removes it. The actual extension is irrelevant, but the test uses CustomScript for simplicity,
+# since its invocation is trivial and the entire extension workflow can be tested end-to-end by
+# checking the message in the status produced by the extension.
+#
+import uuid
+
+from assertpy import assert_that
+
+from azure.core.exceptions import ResourceNotFoundError
+
+from tests_e2e.tests.lib.agent_test import AgentTest
+from tests_e2e.tests.lib.identifiers import VmExtensionIds, VmExtensionIdentifier
+from tests_e2e.tests.lib.logging import log
+from tests_e2e.tests.lib.ssh_client import SshClient
+from tests_e2e.tests.lib.vm_extension import VmExtension
+
+
+class ExtensionOperationsBvt(AgentTest):
+    def run(self):
+        ssh_client: SshClient = SshClient(
+            ip_address=self._context.vm_ip_address,
+            username=self._context.username,
+            private_key_file=self._context.private_key_file)
+
+        is_arm64: bool = ssh_client.get_architecture() == "aarch64"
+
+        custom_script_2_0 = VmExtension(
+            self._context.vm,
+            VmExtensionIds.CustomScript,
+            resource_name="CustomScript")
+
+        if is_arm64:
+            log.info("Will skip the update scenario, since currently there is only 1 version of CSE on ARM64")
+        else:
+            log.info("Installing %s", custom_script_2_0)
+            message = f"Hello {uuid.uuid4()}!"
+            custom_script_2_0.enable(
+                settings={
+                    'commandToExecute': f"echo \'{message}\'"
+                },
+                auto_upgrade_minor_version=False
+            )
+            custom_script_2_0.assert_instance_view(expected_version="2.0", expected_message=message)
+
+        custom_script_2_1 = VmExtension(
+            self._context.vm,
+            VmExtensionIdentifier(VmExtensionIds.CustomScript.publisher, VmExtensionIds.CustomScript.type, "2.1"),
+            resource_name="CustomScript")
+
+        if is_arm64:
+            log.info("Installing %s", custom_script_2_1)
+        else:
+            log.info("Updating %s to %s", custom_script_2_0, custom_script_2_1)
+
+        message = f"Hello {uuid.uuid4()}!"
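+        # (a fresh UUID in the message ties the instance view status to this 'enable' call rather than to a
+        # previous execution of the extension)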
+ custom_script_2_1.enable( + settings={ + 'commandToExecute': f"echo \'{message}\'" + } + ) + custom_script_2_1.assert_instance_view(expected_version="2.1", expected_message=message) + + custom_script_2_1.delete() + + assert_that(custom_script_2_1.get_instance_view).\ + described_as("Fetching the instance view should fail after removing the extension").\ + raises(ResourceNotFoundError) + + +if __name__ == "__main__": + ExtensionOperationsBvt.run_from_command_line() diff --git a/tests_e2e/tests/bvts/run_command.py b/tests_e2e/tests/bvts/run_command.py new file mode 100755 index 000000000..188c12d3f --- /dev/null +++ b/tests_e2e/tests/bvts/run_command.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# BVT for RunCommand. +# +# Note that there are two incarnations of RunCommand (which are actually two different extensions): +# Microsoft.CPlat.Core.RunCommandHandlerLinux and Microsoft.CPlat.Core.RunCommandLinux. This test +# exercises both using the same strategy: execute the extension to create a file on the test VM, +# then fetch the contents of the file over SSH and compare against the known value. +# +import base64 +import uuid + +from assertpy import assert_that, soft_assertions +from typing import Callable, Dict + +from tests_e2e.tests.lib.agent_test import AgentTest +from tests_e2e.tests.lib.identifiers import VmExtensionIds +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.ssh_client import SshClient +from tests_e2e.tests.lib.vm_extension import VmExtension + + +class RunCommandBvt(AgentTest): + class TestCase: + def __init__(self, extension: VmExtension, get_settings: Callable[[str], Dict[str, str]]): + self.extension = extension + self.get_settings = get_settings + + def run(self): + ssh_client = SshClient( + ip_address=self._context.vm_ip_address, + username=self._context.username, + private_key_file=self._context.private_key_file) + + test_cases = [ + RunCommandBvt.TestCase( + VmExtension(self._context.vm, VmExtensionIds.RunCommand, resource_name="RunCommand"), + lambda s: { + "script": base64.standard_b64encode(bytearray(s, 'utf-8')).decode('utf-8') + }) + ] + + if ssh_client.get_architecture() == "aarch64": + log.info("Skipping test case for %s, since it has not been published on ARM64", VmExtensionIds.RunCommandHandler) + else: + test_cases.append( + RunCommandBvt.TestCase( + VmExtension(self._context.vm, VmExtensionIds.RunCommandHandler, resource_name="RunCommandHandler"), + lambda s: { + "source": { + "script": s + } + })) + + with soft_assertions(): + for t in test_cases: + log.info("Test case: %s", t.extension) + + unique = str(uuid.uuid4()) + test_file = f"/tmp/waagent-test.{unique}" + script = f"echo '{unique}' > {test_file}" + log.info("Script to execute: %s", script) + + t.extension.enable(settings=t.get_settings(script)) + t.extension.assert_instance_view() + + log.info("Verifying contents of the file created by the 
extension") + contents = ssh_client.run_command(f"cat {test_file}").rstrip() # remove the \n + assert_that(contents).\ + described_as("Contents of the file created by the extension").\ + is_equal_to(unique) + log.info("The contents match") + + +if __name__ == "__main__": + RunCommandBvt.run_from_command_line() diff --git a/tests_e2e/tests/bvts/vm_access.py b/tests_e2e/tests/bvts/vm_access.py new file mode 100755 index 000000000..1af0f99e1 --- /dev/null +++ b/tests_e2e/tests/bvts/vm_access.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# BVT for the VmAccess extension +# +# The test executes VmAccess to add a user and then verifies that an SSH connection to the VM can +# be established with that user's identity. +# +import uuid + +from assertpy import assert_that +from pathlib import Path + +from tests_e2e.tests.lib.agent_test import AgentTest, TestSkipped +from tests_e2e.tests.lib.identifiers import VmExtensionIds +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.ssh_client import SshClient + +from tests_e2e.tests.lib.vm_extension import VmExtension + + +class VmAccessBvt(AgentTest): + def run(self): + ssh: SshClient = SshClient(ip_address=self._context.vm_ip_address, username=self._context.username, private_key_file=self._context.private_key_file) + if "-flatcar" in ssh.run_command("uname -a"): + raise TestSkipped("Currently VMAccess is not supported on Flatcar") + + # Try to use a unique username for each test run (note that we truncate to 32 chars to + # comply with the rules for usernames) + log.info("Generating a new username and SSH key") + username: str = f"test-{uuid.uuid4()}"[0:32] + log.info("Username: %s", username) + + # Create an SSH key for the user and fetch the public key + private_key_file: Path = self._context.working_directory/f"{username}_rsa" + public_key_file: Path = self._context.working_directory/f"{username}_rsa.pub" + log.info("Generating SSH key as %s", private_key_file) + ssh = SshClient(ip_address=self._context.vm_ip_address, username=username, private_key_file=private_key_file) + ssh.generate_ssh_key(private_key_file) + with public_key_file.open() as f: + public_key = f.read() + + # Invoke the extension + vm_access = VmExtension(self._context.vm, VmExtensionIds.VmAccess, resource_name="VmAccess") + vm_access.enable( + protected_settings={ + 'username': username, + 'ssh_key': public_key, + 'reset_ssh': 'false' + } + ) + vm_access.assert_instance_view() + + # Verify the user was added correctly by starting an SSH session to the VM + log.info("Verifying SSH connection to the test VM") + stdout = ssh.run_command("echo -n $USER") + assert_that(stdout).described_as("Output from SSH command").is_equal_to(username) + log.info("SSH command output ($USER): %s", stdout) + + +if __name__ == "__main__": + VmAccessBvt.run_from_command_line() diff --git a/tests_e2e/tests/error_test.py b/tests_e2e/tests/error_test.py new file mode 
100755 index 000000000..cf369f7d3 --- /dev/null +++ b/tests_e2e/tests/error_test.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from tests_e2e.tests.lib.agent_test import AgentTest + + +class ErrorTest(AgentTest): + """ + A trivial test that errors out + """ + def run(self): + raise Exception("* ERROR *") + + +if __name__ == "__main__": + ErrorTest.run_from_command_line() diff --git a/tests_e2e/tests/fail_test.py b/tests_e2e/tests/fail_test.py new file mode 100755 index 000000000..e96b5bcf7 --- /dev/null +++ b/tests_e2e/tests/fail_test.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from assertpy import fail +from tests_e2e.tests.lib.agent_test import AgentTest + + +class FailTest(AgentTest): + """ + A trivial test that fails + """ + def run(self): + fail("* FAILED *") + + +if __name__ == "__main__": + FailTest.run_from_command_line() diff --git a/dcr/scenario_utils/extensions/__init__.py b/tests_e2e/tests/lib/__init__.py similarity index 100% rename from dcr/scenario_utils/extensions/__init__.py rename to tests_e2e/tests/lib/__init__.py diff --git a/tests_e2e/tests/lib/agent_log.py b/tests_e2e/tests/lib/agent_log.py new file mode 100644 index 000000000..657b72928 --- /dev/null +++ b/tests_e2e/tests/lib/agent_log.py @@ -0,0 +1,446 @@ +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
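+#
+# Typical usage (illustrative; check-agent-log.py uses this module the same way):
+#
+#     errors = AgentLog(Path('/var/log/waagent.log')).get_errors()
+#     for record in errors:
+#         print(record.text)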
+# + +import os +import re + +from datetime import datetime +from pathlib import Path +from typing import Any, AnyStr, Dict, Iterable, List, Match + +from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION + + +class AgentLogRecord: + """ + Represents an entry in the Agent's log (note that entries can span multiple lines in the log) + + Sample message: + 2023-03-13T15:44:04.906673Z INFO ExtHandler ExtHandler Azure Linux Agent (Goal State Agent version 9.9.9.9) + """ + text: str # Full text of the record + when: str # Timestamp (as text) + level: str # Level (INFO, ERROR, etc) + thread: str # Thread name (e.g. 'Daemon', 'ExtHandler') + prefix: str # Prefix (e.g. 'Daemon', 'ExtHandler', ) + message: str # Message + + @staticmethod + def from_match(match: Match[AnyStr]): + """Builds a record from a regex match""" + record = AgentLogRecord() + record.text = match.string + record.when = match.group("when") + record.level = match.group("level") + record.thread = match.group("thread") + record.prefix = match.group("prefix") + record.message = match.group("message") + return record + + @staticmethod + def from_dictionary(dictionary: Dict[str, str]): + """Deserializes from a dict""" + record = AgentLogRecord() + record.text = dictionary["text"] + record.when = dictionary["when"] + record.level = dictionary["level"] + record.thread = dictionary["thread"] + record.prefix = dictionary["prefix"] + record.message = dictionary["message"] + return record + + @property + def timestamp(self) -> datetime: + return datetime.strptime(self.when, u'%Y-%m-%dT%H:%M:%S.%fZ') + + +class AgentLog(object): + """ + Provides facilities to parse and/or extract errors from the agent's log. + """ + def __init__(self, path: Path = Path('/var/log/waagent.log')): + self._path: Path = path + self._counter_table: Dict[str, int] = {} + + def get_errors(self) -> List[AgentLogRecord]: + """ + Returns any ERRORs or WARNINGs in the agent log. + + The function filters out known/uninteresting errors, which are kept in the 'ignore_list' variable. + """ + # + # Items in this list are known errors and they are ignored. + # + # * 'message' - A regular expression matched using re.search; be sure to escape any regex metacharacters. A positive match indicates + # that the error should be ignored + # * 'if' - A lambda that takes as parameter an AgentLogRecord representing an error and returns true if the error should be ignored + # + ignore_rules = [ + # + # NOTE: This list was taken from the older agent tests and needs to be cleaned up. Feel free to un-comment rules as new tests are added. 
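+            #
+            # For example, a (hypothetical) rule that would ignore a specific transient warning, but only on Ubuntu:
+            #
+            #     {
+            #         'message': r"Some transient warning text",
+            #         'if': lambda r: r.level == "WARNING" and DISTRO_NAME == 'ubuntu'
+            #     },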
+ # + # # This warning is expected on CentOS/RedHat 7.4, 7.8 and Redhat 7.6 + # { + # 'message': r"Move rules file 70-persistent-net.rules to /var/lib/waagent/70-persistent-net.rules", + # 'if': lambda r: + # re.match(r"(((centos|redhat)7\.[48])|(redhat7\.6)|(redhat8\.2))\D*", DISTRO_NAME, flags=re.IGNORECASE) is not None + # and r.level == "WARNING" + # and r.prefix == "ExtHandler" and r.thread in ("", "EnvHandler") + # }, + # # This warning is expected on SUSE 12 + # { + # 'message': r"WARNING EnvHandler ExtHandler Move rules file 75-persistent-net-generator.rules to /var/lib/waagent/75-persistent-net-generator.rules", + # 'if': lambda _: re.match(r"((sles15\.2)|suse12)\D*", DISTRO_NAME, flags=re.IGNORECASE) is not None + # }, + # # The following message is expected to log an error if systemd is not enabled on it + # { + # 'message': r"Did not detect Systemd, unable to set wa(|linux)agent-network-setup.service", + # 'if': lambda _: not self._is_systemd() + # }, + # # + # # Journalctl in Debian 8.11 does not have the --utc option by default. + # # Ignoring this error for Deb 8 as its not a blocker and since Deb 8 is old and not widely used + # { + # 'message': r"journalctl: unrecognized option '--utc'", + # 'if': lambda r: re.match(r"(debian8\.11)\D*", DISTRO_NAME, flags=re.IGNORECASE) is not None and r.level == "WARNING" + # }, + # # Sometimes it takes the Daemon some time to identify primary interface and the route to Wireserver, + # # ignoring those errors if they come from the Daemon. + # { + # 'message': r"(No route exists to \d+\.\d+\.\d+\.\d+|" + # r"Could not determine primary interface, please ensure \/proc\/net\/route is correct|" + # r"Contents of \/proc\/net\/route:|Primary interface examination will retry silently)", + # 'if': lambda r: r.prefix == "Daemon" + # }, + # + # # This happens in CENTOS and RHEL when waagent attempt to format and mount the error while cloud init is already doing it + # # 2021-09-20T06:45:57.253801Z WARNING Daemon Daemon Could not mount resource disk: mount: /dev/sdb1 is already mounted or /mnt/resource busy + # # /dev/sdb1 is already mounted on /mnt/resource + # { + # 'message': r"Could not mount resource disk: mount: \/dev\/sdb1 is already mounted or \/mnt\/resource busy", + # 'if': lambda r: + # re.match(r"((centos7\.8)|(redhat7\.8)|(redhat7\.6)|(redhat8\.2))\D*", DISTRO_NAME, flags=re.IGNORECASE) + # and r.level == "WARNING" + # and r.prefix == "Daemon" + # }, + # # + # # 2021-09-20T06:45:57.246593Z ERROR Daemon Daemon Command: [mkfs.ext4 -F /dev/sdb1], return code: [1], result: [mke2fs 1.42.9 (28-Dec-2013) + # # /dev/sdb1 is mounted; will not make a filesystem here! + # { + # 'message': r"Command: \[mkfs.ext4 -F \/dev\/sdb1\], return code: \[1\]", + # 'if': lambda r: + # re.match(r"((centos7\.8)|(redhat7\.8)|(redhat7\.6)|(redhat8\.2))\D*", DISTRO_NAME, flags=re.IGNORECASE) + # and r.level == "ERROR" + # and r.prefix == "Daemon" + # }, + # # + # # 2022-01-20T06:52:21.515447Z WARNING Daemon Daemon Fetch failed: [HttpError] [HTTP Failed] GET https://dcrgajhx62.blob.core.windows.net/$system/edprpwqbj6.5c2ddb5b-d6c3-4d73-9468-54419ca87a97.vmSettings -- IOError timed out -- 6 attempts made + # # + # # The daemon does not need the artifacts profile blob, but the request is done as part of protocol initialization. This timeout can be ignored, if the issue persist the log would include additional instances. 
+ # # + # { + # 'message': r"\[HTTP Failed\] GET https://.*\.vmSettings -- IOError timed out", + # 'if': lambda r: r.level == "WARNING" and r.prefix == "Daemon" + # }, + # + # Probably the agent should log this as INFO, but for now it is a warning + # e.g. + # 2021-07-29T04:40:17.190879Z WARNING EnvHandler ExtHandler Dhcp client is not running. + # old agents logs don't have a prefix of thread and/or logger names. + { + 'message': r"Dhcp client is not running.", + 'if': lambda r: r.level == "WARNING" + }, + # Known bug fixed in the current agent, but still present in older daemons + # + { + 'message': r"\[CGroupsException\].*Error: join\(\) argument must be str, bytes, or os.PathLike object, not 'NoneType'", + 'if': lambda r: r.level == "WARNING" and r.prefix == "Daemon" + }, + # This warning is expected on when WireServer gives us the incomplete goalstate without roleinstance data + { + 'message': r"\[ProtocolError\] Fetched goal state without a RoleInstance", + }, + # + # Download warnings (manifest and zips). + # + # Examples: + # 2021-03-31T03:48:35.216494Z WARNING ExtHandler ExtHandler Fetch failed: [HttpError] [HTTP Failed] GET https://zrdfepirv2cbn04prdstr01a.blob.core.windows.net/f72653efd9e349ed9842c8b99e4c1712/Microsoft.CPlat.Core_NullSeqA_useast2euap_manifest.xml -- IOError ('The read operation timed out',) -- 1 attempts made + # 2021-03-31T06:54:29.655861Z WARNING ExtHandler ExtHandler Fetch failed: [HttpError] [HTTP Retry] GET http://168.63.129.16:32526/extensionArtifact -- Status Code 502 -- 1 attempts made + # 2021-03-31T06:43:17.806663Z WARNING ExtHandler ExtHandler Download failed, switching to host plugin + { + 'message': r"(Fetch failed: \[HttpError\] .+ GET .+ -- [0-9]+ attempts made)|(Download failed, switching to host plugin)", + 'if': lambda r: r.level == "WARNING" and r.prefix == "ExtHandler" and r.thread == "ExtHandler" + }, + # 2021-07-09T01:46:53.307959Z INFO MonitorHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Check on cgroups failed: + # [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 2367] UNKNOWN'] + { + 'message': r"The agent's cgroup includes unexpected processes: \[('\[PID:\s?\d+\]\s*UNKNOWN'(,\s*)?)+\]" + }, + # 2021-12-20T07:46:23.020197Z INFO ExtHandler ExtHandler [CGW] The agent's process is not within a memory cgroup + # Ignoring this since memory cgroup(MemoryAccounting) not enabled. 
+ { + 'message': r"The agent's process is not within a memory cgroup", + 'if': lambda r: re.match(r"(((centos|redhat)7\.[48])|(redhat7\.6)|(redhat8\.2))\D*", DISTRO_NAME, flags=re.IGNORECASE) + }, + # + # Ubuntu 22 uses cgroups v2, so we need to ignore these: + # + # 2023-03-15T20:47:56.684849Z INFO ExtHandler ExtHandler [CGW] The CPU cgroup controller is not mounted + # 2023-03-15T20:47:56.685392Z INFO ExtHandler ExtHandler [CGW] The memory cgroup controller is not mounted + # 2023-03-15T20:47:56.688576Z INFO ExtHandler ExtHandler [CGW] The agent's process is not within a CPU cgroup + # 2023-03-15T20:47:56.688981Z INFO ExtHandler ExtHandler [CGW] The agent's process is not within a memory cgroup + # + { + 'message': r"\[CGW\]\s*(The (CPU|memory) cgroup controller is not mounted)|(The agent's process is not within a (CPU|memory) cgroup)", + 'if': lambda r: DISTRO_NAME == 'ubuntu' and DISTRO_VERSION >= '22.00' + }, + # + # 2022-02-09T04:50:37.384810Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: 2bed9b62-188e-4668-b1a8-87c35cfa4927 eTag: 7031887032544600793]: [Internal error in HostGAPlugin] [HTTP Failed] [502: Bad Gateway] b'{ "errorCode": "VMArtifactsProfileBlobContentNotFound", "message": "VM artifacts profile blob has no content in it.", "details": ""}' + # + # Fetching the goal state may catch the HostGAPlugin in the process of computing the vmSettings. This can be ignored, if the issue persist the log would include other errors as well. + # + { + 'message': r"\[ProtocolError\] GET vmSettings.*VMArtifactsProfileBlobContentNotFound", + 'if': lambda r: r.level == "ERROR" + }, + # + # 2022-11-01T02:45:55.513692Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: 616873cc-be87-41b6-83b7-ef3a76370628 eTag: 3693655388249891516]: [Internal error in HostGAPlugin] [HTTP Failed] [502: Bad Gateway] { "errorCode": "InternalError", "message": "The server encountered an internal error. Please retry the request.", "details": ""} + # + # Fetching the goal state may catch the HostGAPlugin in the process of computing the vmSettings. This can be ignored, if the issue persist the log would include other errors as well. + # + { + 'message': r"\[ProtocolError\] GET vmSettings.*Please retry the request", + 'if': lambda r: r.level == "ERROR" + }, + # + # 2022-08-16T01:50:10.759502Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: e162f7c3-8d0c-4a9b-a987-8f9ec0699dae eTag: 9757461589808963322]: Timeout + # + # Fetching the goal state may hit timeouts in the HostGAPlugin's vmSettings. This can be ignored, if the issue persist the log would include other errors as well. + # + { + 'message': r"\[ProtocolError\] GET vmSettings.*Timeout", + 'if': lambda r: r.level == "ERROR" + }, + # + # 2021-12-29T06:50:49.904601Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] [HTTP Failed] [410: Gone] The page you requested was removed. + # 2022-03-21T02:44:03.770017Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] Resource is gone + # 2022-02-16T04:46:50.477315Z WARNING Daemon Daemon Fetching the goal state failed: [ResourceGoneError] [HTTP Failed] [410: Gone] b'\n\n ResourceNotAvailable\n The resource requested is no longer available. Please refresh your cache.\n
    <Details></Details>\n</Error>
' + # + # ResourceGone can happen if we are fetching one of the URIs in the goal state and a new goal state arrives + { + 'message': r"(?s)(Fetching the goal state failed|Error fetching goal state|Error fetching the goal state).*(\[ResourceGoneError\]|\[410: Gone\]|Resource is gone)", + 'if': lambda r: r.level in ("WARNING", "ERROR") + }, + # + # 2022-12-02T05:45:51.771876Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] [Wireserver Exception] [HttpError] [HTTP Failed] GET http://168.63.129.16/machine/ -- IOError [Errno 104] Connection reset by peer -- 6 attempts made + # + { + 'message': r"\[HttpError\] \[HTTP Failed\] GET http://168.63.129.16/machine/ -- IOError \[Errno 104\] Connection reset by peer", + 'if': lambda r: r.level in ("WARNING", "ERROR") + }, + # + # 2022-03-08T03:03:23.036161Z WARNING ExtHandler ExtHandler Fetch failed from [http://168.63.129.16:32526/extensionArtifact]: [HTTP Failed] [400: Bad Request] b'' + # 2022-03-08T03:03:23.042008Z WARNING ExtHandler ExtHandler Fetch failed: [ProtocolError] Fetch failed from [http://168.63.129.16:32526/extensionArtifact]: [HTTP Failed] [400: Bad Request] b'' + # + # Warning downloading extension manifest. If the issue persists, this would cause errors elsewhere so safe to ignore + { + 'message': r"\[http://168.63.129.16:32526/extensionArtifact\]: \[HTTP Failed\] \[400: Bad Request\]", + 'if': lambda r: r.level == "WARNING" + }, + # + # 2022-03-29T05:52:10.089958Z WARNING ExtHandler ExtHandler An error occurred while retrieving the goal state: [ProtocolError] GET vmSettings [correlation ID: da106cf5-83a0-44ec-9484-d0e9223847ab eTag: 9856274988128027586]: Timeout + # + # Ignore warnings about timeouts in vmSettings; if the condition persists, an error will occur elsewhere. + # + { + 'message': r"GET vmSettings \[[^]]+\]: Timeout", + 'if': lambda r: r.level == "WARNING" + }, + # + # 2022-09-30T02:48:33.134649Z WARNING MonitorHandler ExtHandler Error in SendHostPluginHeartbeat: [HttpError] [HTTP Failed] GET http://168.63.129.16:32526/health -- IOError timed out -- 1 attempts made --- [NOTE: Will not log the same error for the next hour] + # + # Ignore timeouts in the HGAP's health API... those are tracked in the HGAP dashboard so no need to worry about them on test runs + # + { + 'message': r"SendHostPluginHeartbeat:.*GET http://168.63.129.16:32526/health.*timed out", + 'if': lambda r: r.level == "WARNING" + }, + # + # 2022-09-30T03:09:25.013398Z WARNING MonitorHandler ExtHandler Error in SendHostPluginHeartbeat: [ResourceGoneError] [HTTP Failed] [410: Gone] + # + # ResourceGone should not happen very often, since the monitor thread already refreshes the goal state before sending the HostGAPlugin heartbeat. Errors can still happen, though, since the goal state + # can change in-between the time at which the monitor thread refreshes and the time at which it sends the heartbeat. Ignore these warnings unless there are 2 or more of them. 
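+            # (The "2 or more" threshold is implemented by the _increment_counter() helper used in the rule
+            # below: each match bumps a named counter, so only the first instance is ignored.)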
+ # + { + 'message': r"SendHostPluginHeartbeat:.*ResourceGoneError.*410", + 'if': lambda r: r.level == "WARNING" and self._increment_counter("SendHostPluginHeartbeat-ResourceGoneError-410") < 2 # ignore unless there are 2 or more instances + }, + # 2023-01-18T02:58:25.589492Z ERROR SendTelemetryHandler ExtHandler Event: name=WALinuxAgent, op=ReportEventErrors, message=DroppedEventsCount: 1 + # Reasons (first 5 errors): [ProtocolError] [Wireserver Exception] [ProtocolError] [Wireserver Failed] URI http://168.63.129.16/machine?comp=telemetrydata [HTTP Failed] Status Code 400: Traceback (most recent call last): + # + { + 'message': r"(?s)SendTelemetryHandler.*http://168.63.129.16/machine\?comp=telemetrydata.*Status Code 400", + 'if': lambda _: self._increment_counter("SendTelemetryHandler-telemetrydata-Status Code 400") < 2 # ignore unless there are 2 or more instances + }, + # + # Ignore these errors in flatcar: + # + # 1) 2023-03-16T14:30:33.091427Z ERROR Daemon Daemon Failed to mount resource disk [ResourceDiskError] unable to detect disk topology + # 2) 2023-03-16T14:30:33.091708Z ERROR Daemon Daemon Event: name=WALinuxAgent, op=ActivateResourceDisk, message=[ResourceDiskError] unable to detect disk topology, duration=0 + # 3) 2023-03-16T14:30:34.660976Z WARNING ExtHandler ExtHandler Fetch failed: [HttpError] HTTPS is unavailable and required + # 4) 2023-03-16T14:30:34.800112Z ERROR ExtHandler ExtHandler Unable to setup the persistent firewall rules: [Errno 30] Read-only file system: '/lib/systemd/system/waagent-network-setup.service' + # + # 1, 2) under investigation + # 3) There seems to be a configuration issue in flatcar that prevents python from using HTTPS when trying to reach storage. This does not produce any actual errors, since the agent fallbacks to the HGAP. + # 4) Remove this when bug 17523033 is fixed. + # + { + 'message': r"(Failed to mount resource disk)|(unable to detect disk topology)", + 'if': lambda r: r.prefix == 'Daemon' and DISTRO_NAME == 'flatcar' + }, + { + 'message': r"(HTTPS is unavailable and required)|(Unable to setup the persistent firewall rules.*Read-only file system)", + 'if': lambda r: DISTRO_NAME == 'flatcar' + }, + # + # AzureSecurityLinuxAgent fails to install on a few distros (e.g. Debian 11) + # + # 2023-03-16T14:29:48.798415Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent, op=Install, message=[ExtensionOperationError] Non-zero exit code: 56, /var/lib/waagent/Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent-2.21.115/handler.sh install + # + { + 'message': r"Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent.*op=Install.*Non-zero exit code: 56,", + }, + + ] + + def is_error(r: AgentLogRecord) -> bool: + return r.level in ('ERROR', 'WARNING') or any(err in r.text for err in ['Exception', 'Traceback', '[CGW]']) + + errors = [] + primary_interface_error = None + provisioning_complete = False + + for record in self.read(): + if is_error(record) and not self.matches_ignore_rule(record, ignore_rules): + # Handle "/proc/net/route contains no routes" and "/proc/net/route is missing headers" as a special case + # since it can take time for the primary interface to come up, and we don't want to report transient + # errors as actual errors. 
The last of these errors in the log will be reported + if "/proc/net/route contains no routes" in record.text or "/proc/net/route is missing headers" in record.text and record.prefix == "Daemon": + primary_interface_error = record + provisioning_complete = False + else: + errors.append(record) + + if "Provisioning complete" in record.text and record.prefix == "Daemon": + provisioning_complete = True + + # Keep the "no routes found" as a genuine error message if it was never corrected + if primary_interface_error is not None and not provisioning_complete: + errors.append(primary_interface_error) + + return errors + + @staticmethod + def _is_systemd(): + # Taken from azurelinuxagent/common/osutil/systemd.py; repeated here because it is available only on agents >= 2.3 + return os.path.exists("/run/systemd/system/") + + def _increment_counter(self, counter_name) -> int: + """ + Keeps a table of counters indexed by the given 'counter_name'. Each call to the function + increments the value of that counter and returns the new value. + """ + count = self._counter_table.get(counter_name) + count = 1 if count is None else count + 1 + self._counter_table[counter_name] = count + return count + + @staticmethod + def matches_ignore_rule(record: AgentLogRecord, ignore_rules: List[Dict[str, Any]]) -> bool: + """ + Returns True if the given 'record' matches any of the 'ignore_rules' + """ + return any(re.search(rule['message'], record.message) is not None and ('if' not in rule or rule['if'](record)) for rule in ignore_rules) + + # The format of the log has changed over time and the current log may include records from different sources. Most records are single-line, but some of them + # can span across multiple lines. We will assume records always start with a line similar to the examples below; any other lines will be assumed to be part + # of the record that is being currently parsed. + # + # Newer Agent: 2019-11-27T22:22:48.123985Z VERBOSE ExtHandler ExtHandler Report vm agent status + # 2021-03-30T19:45:33.793213Z INFO ExtHandler [Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent-2.14.64] Target handler state: enabled [incarnation 3] + # + # 2.2.46: the date time was changed to ISO-8601 format but the thread name was not added. + # 2021-05-28T01:17:40.683072Z INFO ExtHandler Wire server endpoint:168.63.129.16 + # 2021-05-28T01:17:40.683823Z WARNING ExtHandler Move rules file 70-persistent-net.rules to /var/lib/waagent/70-persistent-net.rules + # 2021-05-28T01:17:40.767600Z INFO ExtHandler Successfully added Azure fabric firewall rules + # + # Older Agent: 2021/03/30 19:35:35.971742 INFO Daemon Azure Linux Agent Version:2.2.45 + # + # Extension: 2021/03/30 19:45:31 Azure Monitoring Agent for Linux started to handle. 
+ # 2021/03/30 19:45:31 [Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.7.0] cwd is /var/lib/waagent/Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.7.0 + # + _NEWER_AGENT_RECORD = re.compile(r'(?P[\d-]+T[\d:.]+Z)\s(?PVERBOSE|INFO|WARNING|ERROR)\s(?P\S+)\s(?P(Daemon)|(ExtHandler)|(\[\S+\]))\s(?P.*)') + _2_2_46_AGENT_RECORD = re.compile(r'(?P[\d-]+T[\d:.]+Z)\s(?PVERBOSE|INFO|WARNING|ERROR)\s(?P)(?PDaemon|ExtHandler|\[\S+\])\s(?P.*)') + _OLDER_AGENT_RECORD = re.compile(r'(?P[\d/]+\s[\d:.]+)\s(?PVERBOSE|INFO|WARNING|ERROR)\s(?P)(?P\S*)\s(?P.*)') + _EXTENSION_RECORD = re.compile(r'(?P[\d/]+\s[\d:.]+)\s(?P)(?P)((?P\[[^\]]+\])\s)?(?P.*)') + + def read(self) -> Iterable[AgentLogRecord]: + """ + Generator function that returns each of the entries in the agent log parsed as AgentLogRecords. + + The function can be used following this pattern: + + for record in read_agent_log(): + ... do something... + + """ + if not self._path.exists(): + raise IOError('{0} does not exist'.format(self._path)) + + def match_record(): + for regex in [self._NEWER_AGENT_RECORD, self._2_2_46_AGENT_RECORD, self._OLDER_AGENT_RECORD]: + m = regex.match(line) + if m is not None: + return m + # The extension regex also matches the old agent records, so it needs to be last + return self._EXTENSION_RECORD.match(line) + + def complete_record(): + record.text = record.text.rstrip() # the text includes \n + if extra_lines != "": + record.text = record.text + "\n" + extra_lines.rstrip() + record.message = record.message + "\n" + extra_lines.rstrip() + return record + + with self._path.open() as file_: + record = None + extra_lines = "" + + line = file_.readline() + while line != "": # while not EOF + match = match_record() + if match is not None: + if record is not None: + yield complete_record() + record = AgentLogRecord.from_match(match) + extra_lines = "" + else: + extra_lines = extra_lines + line + line = file_.readline() + + if record is not None: + yield complete_record() diff --git a/tests_e2e/tests/lib/agent_test.py b/tests_e2e/tests/lib/agent_test.py new file mode 100644 index 000000000..22f865a6f --- /dev/null +++ b/tests_e2e/tests/lib/agent_test.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys + +from abc import ABC, abstractmethod +from typing import Any, Dict, List + +from tests_e2e.tests.lib.agent_test_context import AgentTestContext +from tests_e2e.tests.lib.logging import log + + +class TestSkipped(Exception): + """ + Tests can raise this exception to indicate they should not be executed (for example, if trying to execute them on + an unsupported distro + """ + + +class AgentTest(ABC): + """ + Defines the interface for agent tests, which are simply constructed from an AgentTestContext and expose a single method, + run(), to execute the test. 
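+
+    A minimal subclass only needs a run() method; for example (hypothetical sketch; SshClient and log
+    come from tests_e2e.tests.lib):
+
+        class UnameTest(AgentTest):
+            def run(self):
+                ssh: SshClient = SshClient(
+                    ip_address=self._context.vm_ip_address,
+                    username=self._context.username,
+                    private_key_file=self._context.private_key_file)
+                log.info("uname: %s", ssh.run_command("uname -a"))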
+ """ + def __init__(self, context: AgentTestContext): + self._context = context + + @abstractmethod + def run(self): + pass + + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: + # Tests can override this method to return a list with rules to ignore errors in the agent log (see agent_log.py for sample rules). + return [] + + @classmethod + def run_from_command_line(cls): + """ + Convenience method to execute the test when it is being invoked directly from the command line (as opposed as + being invoked from a test framework or library. + """ + try: + cls(AgentTestContext.from_args()).run() + except SystemExit: # Bad arguments + pass + except: # pylint: disable=bare-except + log.exception("Test failed") + sys.exit(1) + + sys.exit(0) diff --git a/tests_e2e/tests/lib/agent_test_context.py b/tests_e2e/tests/lib/agent_test_context.py new file mode 100644 index 000000000..ca9fc64ad --- /dev/null +++ b/tests_e2e/tests/lib/agent_test_context.py @@ -0,0 +1,164 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import argparse +import os + +from pathlib import Path + +import tests_e2e +from tests_e2e.tests.lib.identifiers import VmIdentifier + + +class AgentTestContext: + """ + Execution context for agent tests. Defines the test VM, working directories and connection info for the tests. + + NOTE: The context is shared by all tests in the same runbook execution. Tests within the same test suite + are executed sequentially, but multiple test suites may be executed concurrently depending on the + concurrency level of the runbook. + """ + class Paths: + DEFAULT_TEST_SOURCE_DIRECTORY = Path(tests_e2e.__path__[0]) + + def __init__( + self, + working_directory: Path, + remote_working_directory: Path, + test_source_directory: Path = DEFAULT_TEST_SOURCE_DIRECTORY + ): + self._test_source_directory: Path = test_source_directory + self._working_directory: Path = working_directory + self._remote_working_directory: Path = remote_working_directory + + class Connection: + DEFAULT_SSH_PORT = 22 + + def __init__( + self, + ip_address: str, + username: str, + private_key_file: Path, + ssh_port: int = DEFAULT_SSH_PORT + ): + self._ip_address: str = ip_address + self._username: str = username + self._private_key_file: Path = private_key_file + self._ssh_port: int = ssh_port + + def __init__(self, vm: VmIdentifier, paths: Paths, connection: Connection): + self._vm: VmIdentifier = vm + self._paths = paths + self._connection = connection + + @property + def vm(self) -> VmIdentifier: + """ + The test VM (the VM on which the tested Agent is running) + """ + return self._vm + + @property + def vm_ip_address(self) -> str: + """ + The IP address of the test VM + """ + return self._connection._ip_address + + @property + def test_source_directory(self) -> Path: + """ + Root directory for the source code of the tests. Used to build paths to specific scripts. 
+ """ + return self._paths._test_source_directory + + @property + def working_directory(self) -> Path: + """ + Tests can create temporary files under this directory. + + """ + return self._paths._working_directory + + @property + def remote_working_directory(self) -> Path: + """ + Tests can create temporary files under this directory on the test VM. + """ + return self._paths._remote_working_directory + + @property + def username(self) -> str: + """ + The username to use for SSH connections + """ + return self._connection._username + + @property + def private_key_file(self) -> Path: + """ + The file containing the private SSH key for the username + """ + return self._connection._private_key_file + + @property + def ssh_port(self) -> int: + """ + Port for SSH connections + """ + return self._connection._ssh_port + + @staticmethod + def from_args(): + """ + Creates an AgentTestContext from the command line arguments. + """ + parser = argparse.ArgumentParser() + parser.add_argument('-g', '--group', required=True) + parser.add_argument('-l', '--location', required=True) + parser.add_argument('-s', '--subscription', required=True) + parser.add_argument('-vm', '--vm', required=True) + + parser.add_argument('-rw', '--remote-working-directory', dest="remote_working_directory", required=False, default=str(Path('/home')/os.getenv("USER"))) + parser.add_argument('-t', '--test-source-directory', dest="test_source_directory", required=False, default=str(AgentTestContext.Paths.DEFAULT_TEST_SOURCE_DIRECTORY)) + parser.add_argument('-w', '--working-directory', dest="working_directory", required=False, default=str(Path().home()/"tmp")) + + parser.add_argument('-a', '--ip-address', dest="ip_address", required=False) # Use the vm name as default + parser.add_argument('-u', '--username', required=False, default=os.getenv("USER")) + parser.add_argument('-k', '--private-key-file', dest="private_key_file", required=False, default=Path.home()/".ssh"/"id_rsa") + parser.add_argument('-p', '--ssh-port', dest="ssh_port", required=False, default=AgentTestContext.Connection.DEFAULT_SSH_PORT) + + args = parser.parse_args() + + working_directory = Path(args.working_directory) + if not working_directory.exists(): + working_directory.mkdir(exist_ok=True) + + return AgentTestContext( + vm=VmIdentifier( + location=args.location, + subscription=args.subscription, + resource_group=args.group, + name=args.vm), + paths=AgentTestContext.Paths( + working_directory=working_directory, + remote_working_directory=Path(args.remote_working_directory), + test_source_directory=Path(args.test_source_directory)), + connection=AgentTestContext.Connection( + ip_address=args.ip_address if args.ip_address is not None else args.vm, + username=args.username, + private_key_file=Path(args.private_key_file), + ssh_port=args.ssh_port)) diff --git a/tests_e2e/tests/lib/identifiers.py b/tests_e2e/tests/lib/identifiers.py new file mode 100644 index 000000000..48794140b --- /dev/null +++ b/tests_e2e/tests/lib/identifiers.py @@ -0,0 +1,63 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + + +class VmIdentifier(object): + def __init__(self, location, subscription, resource_group, name): + """ + Represents the information that identifies a VM to the ARM APIs + """ + self.location = location + self.subscription: str = subscription + self.resource_group: str = resource_group + self.name: str = name + + def __str__(self): + return f"{self.resource_group}:{self.name}" + + +class VmExtensionIdentifier(object): + def __init__(self, publisher, ext_type, version): + """ + Represents the information that identifies an extension to the ARM APIs + + publisher - e.g. Microsoft.Azure.Extensions + type - e.g. CustomScript + version - e.g. 2.1, 2.* + name - arbitrary name for the extension ARM resource + """ + self.publisher: str = publisher + self.type: str = ext_type + self.version: str = version + + def __str__(self): + return f"{self.publisher}.{self.type}" + + +class VmExtensionIds(object): + """ + A set of extensions used by the tests, listed here for convenience (easy to reference them by name). + + Only the major version is specified, and the minor version is set to 0 (set autoUpgradeMinorVersion to True in the call to enable + to use the latest version) + """ + CustomScript: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.Azure.Extensions', ext_type='CustomScript', version="2.0") + # Older run command extension, still used by the Portal as of Dec 2022 + RunCommand: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.CPlat.Core', ext_type='RunCommandLinux', version="1.0") + # New run command extension, with support for multi-config + RunCommandHandler: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.CPlat.Core', ext_type='RunCommandHandlerLinux', version="1.0") + VmAccess: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.OSTCExtensions', ext_type='VMAccessForLinux', version="1.0") diff --git a/tests_e2e/tests/lib/logging.py b/tests_e2e/tests/lib/logging.py new file mode 100644 index 000000000..ff636b63d --- /dev/null +++ b/tests_e2e/tests/lib/logging.py @@ -0,0 +1,155 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# This module defines a single object, 'log', of type AgentLogger, which the end-to-end tests and libraries use +# for logging. +# +import contextlib +from logging import FileHandler, Formatter, Handler, Logger, StreamHandler, INFO +from pathlib import Path +from threading import current_thread +from typing import Dict, Callable + + +class _AgentLoggingHandler(Handler): + """ + AgentLoggingHandler is a helper class for AgentLogger. + + This handler simply redirects logging to other handlers. It maintains a set of FileHandlers associated to specific + threads. 
When a thread emits a log record, the AgentLoggingHandler passes through the call to the FileHandlers + associated with that thread, or to a StreamHandler that outputs to stdout if there is not a FileHandler for that + thread. + + Thread can set a FileHandler for themselves using _AgentLoggingHandler.set_current_thread_log() and remove that + handler using _AgentLoggingHandler.close_current_thread_log(). + + The _AgentLoggingHandler simply passes through calls to setLevel, setFormatter, flush, and close to the handlers + it maintains. + + AgentLoggingHandler is meant to be primarily used in multithreaded scenarios and is thread-safe. + """ + def __init__(self): + super().__init__() + self.formatter: Formatter = Formatter('%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s', datefmt="%Y-%m-%dT%H:%M:%SZ") + self.default_handler = StreamHandler() + self.default_handler.setFormatter(self.formatter) + self.per_thread_handlers: Dict[int, FileHandler] = {} + + def set_thread_log(self, thread_ident: int, log_file: Path) -> None: + self.close_current_thread_log() + handler: FileHandler = FileHandler(str(log_file)) + handler.setFormatter(self.formatter) + self.per_thread_handlers[thread_ident] = handler + + def get_thread_log(self, thread_ident: int) -> Path: + handler = self.per_thread_handlers.get(thread_ident) + if handler is None: + return None + return Path(handler.baseFilename) + + def close_thread_log(self, thread_ident: int) -> None: + handler = self.per_thread_handlers.pop(thread_ident, None) + if handler is not None: + handler.close() + + def set_current_thread_log(self, log_file: Path) -> None: + self.set_thread_log(current_thread().ident, log_file) + + def get_current_thread_log(self) -> Path: + return self.get_thread_log(current_thread().ident) + + def close_current_thread_log(self) -> None: + self.close_thread_log(current_thread().ident) + + def emit(self, record) -> None: + handler = self.per_thread_handlers.get(current_thread().ident) + if handler is None: + handler = self.default_handler + handler.emit(record) + + def setLevel(self, level) -> None: + self._for_each_handler(lambda h: h.setLevel(level)) + + def setFormatter(self, fmt) -> None: + self._for_each_handler(lambda h: h.setFormatter(fmt)) + + def flush(self) -> None: + self._for_each_handler(lambda h: h.flush()) + + def close(self) -> None: + self._for_each_handler(lambda h: h.close()) + + def _for_each_handler(self, op: Callable[[Handler], None]) -> None: + op(self.default_handler) + # copy of the values into a new list in case the dictionary changes while we are iterating + for handler in list(self.per_thread_handlers.values()): + op(handler) + + +class AgentLogger(Logger): + """ + AgentLogger is a Logger customized for agent test scenarios. When tests are executed from the command line + (for example, during development) the AgentLogger can be used with its default configuration, which simply + outputs to stdout. When tests are executed from the test framework, typically there are multiple test suites + executed concurrently on different threads, and each test suite must have its own log file; in that case, + each thread can call AgentLogger.set_current_thread_log() to send all the logging from that thread to a + particular file. 
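+
+    A minimal usage sketch (the log file path is hypothetical):
+
+        log.set_current_thread_log(Path("/tmp/my_suite.log"))
+        try:
+            log.info("this record goes to /tmp/my_suite.log")
+        finally:
+            log.close_current_thread_log()
+
+    (The set_current_thread_log() context manager defined below wraps this same pattern.)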
+ """ + def __init__(self): + super().__init__(name="waagent", level=INFO) + self._handler: _AgentLoggingHandler = _AgentLoggingHandler() + self.addHandler(self._handler) + + def set_thread_log(self, thread_ident: int, log_file: Path) -> None: + self._handler.set_thread_log(thread_ident, log_file) + + def get_thread_log_file(self, thread_ident: int) -> Path: + """ + Returns the Path of the log file for the current thread, or None if a log has not been set + """ + return self._handler.get_thread_log(thread_ident) + + def close_thread_log(self, thread_ident: int) -> None: + self._handler.close_thread_log(thread_ident) + + def set_current_thread_log(self, log_file: Path) -> None: + self._handler.set_current_thread_log(log_file) + + def get_current_thread_log(self) -> Path: + return self._handler.get_current_thread_log() + + def close_current_thread_log(self) -> None: + self._handler.close_current_thread_log() + + +log: AgentLogger = AgentLogger() + + +@contextlib.contextmanager +def set_current_thread_log(log_file: Path): + """ + Context Manager to set the log file for the current thread temporarily + """ + initial_value = log.get_current_thread_log() + log.set_current_thread_log(log_file) + try: + yield + finally: + log.close_current_thread_log() + if initial_value is not None: + log.set_current_thread_log(initial_value) diff --git a/tests_e2e/tests/lib/retry.py b/tests_e2e/tests/lib/retry.py new file mode 100644 index 000000000..bbd327cda --- /dev/null +++ b/tests_e2e/tests/lib/retry.py @@ -0,0 +1,59 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import time + +from typing import Callable, Any + +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.shell import CommandError + + +def execute_with_retry(operation: Callable[[], Any]) -> Any: + """ + Some Azure errors (e.g. throttling) are retryable; this method attempts the given operation retrying a few times + (after a short delay) if the error includes the string "RetryableError" + """ + attempts = 3 + while attempts > 0: + attempts -= 1 + try: + return operation() + except Exception as e: + # TODO: Do we need to retry on msrestazure.azure_exceptions.CloudError? + if "RetryableError" not in str(e) or attempts == 0: + raise + log.warning("The operation failed with a RetryableError, retrying in 30 secs. 
Error: %s", e) + time.sleep(30) + + +def retry_ssh_run(operation: Callable[[], Any]) -> Any: + """ + This method attempts to retry ssh run command a few times if operation failed with connection time out + """ + attempts = 3 + while attempts > 0: + attempts -= 1 + try: + return operation() + except Exception as e: + # We raise CommandError on !=0 exit codes in the called method + if isinstance(e, CommandError): + # Instance of 'Exception' has no 'exit_code' member (no-member) - Disabled: e is actually an CommandError + if e.exit_code != 255 or attempts == 0: # pylint: disable=no-member + raise + log.warning("The operation failed with %s, retrying in 30 secs.", e) + time.sleep(30) diff --git a/tests_e2e/tests/lib/shell.py b/tests_e2e/tests/lib/shell.py new file mode 100644 index 000000000..a5288439a --- /dev/null +++ b/tests_e2e/tests/lib/shell.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from subprocess import Popen, PIPE +from typing import Any + + +class CommandError(Exception): + """ + Exception raised by run_command when the command returns an error + """ + def __init__(self, command: Any, exit_code: int, stdout: str, stderr: str): + super().__init__(f"'{command}' failed (exit code: {exit_code}): {stderr}") + self.command: Any = command + self.exit_code: int = exit_code + self.stdout: str = stdout + self.stderr: str = stderr + + def __str__(self): + return f"'{self.command}' failed (exit code: {self.exit_code})\nstdout:\n{self.stdout}\nstderr:\n{self.stderr}\n" + + +def run_command(command: Any, shell=False) -> str: + """ + This function is a thin wrapper around Popen/communicate in the subprocess module. It executes the given command + and returns its stdout. If the command returns a non-zero exit code, the function raises a RunCommandException. + + Similarly to Popen, the 'command' can be a string or a list of strings, and 'shell' indicates whether to execute + the command through the shell. + + NOTE: The command's stdout and stderr are read as text streams. + """ + process = Popen(command, stdout=PIPE, stderr=PIPE, shell=shell, text=True) + + stdout, stderr = process.communicate() + + if process.returncode != 0: + raise CommandError(command, process.returncode, stdout, stderr) + + return stdout + diff --git a/tests_e2e/tests/lib/ssh_client.py b/tests_e2e/tests/lib/ssh_client.py new file mode 100644 index 000000000..a6e1ab9fd --- /dev/null +++ b/tests_e2e/tests/lib/ssh_client.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import re + +from pathlib import Path + +from tests_e2e.tests.lib import shell +from tests_e2e.tests.lib.retry import retry_ssh_run + + +class SshClient(object): + def __init__(self, ip_address: str, username: str, private_key_file: Path, port: int = 22): + self._ip_address: str = ip_address + self._username: str = username + self._private_key_file: Path = private_key_file + self._port: int = port + + def run_command(self, command: str, use_sudo: bool = False) -> str: + """ + Executes the given command over SSH and returns its stdout. If the command returns a non-zero exit code, + the function raises a RunCommandException. + """ + if re.match(r"^\s*sudo\s*", command): + raise Exception("Do not include 'sudo' in the 'command' argument, use the 'use_sudo' parameter instead") + + destination = f"ssh://{self._username}@{self._ip_address}:{self._port}" + + # Note that we add ~/bin to the remote PATH, since Python (Pypy) and other test tools are installed there. + # Note, too, that when using sudo we need to carry over the value of PATH to the sudo session + sudo = "sudo env PATH=$PATH PYTHONPATH=$PYTHONPATH" if use_sudo else '' + return retry_ssh_run(lambda: shell.run_command([ + "ssh", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file, destination, + f"if [[ -e ~/bin/set-agent-env ]]; then source ~/bin/set-agent-env; fi; {sudo} {command}"])) + + @staticmethod + def generate_ssh_key(private_key_file: Path): + """ + Generates an SSH key on the given Path + """ + shell.run_command( + ["ssh-keygen", "-m", "PEM", "-t", "rsa", "-b", "4096", "-q", "-N", "", "-f", str(private_key_file)]) + + def get_architecture(self): + return self.run_command("uname -m").rstrip() + + def copy_to_node(self, local_path: Path, remote_path: Path, recursive: bool = False) -> None: + """ + File copy to a remote node + """ + self._copy(local_path, remote_path, remote_source=False, remote_target=True, recursive=recursive) + + def copy_from_node(self, remote_path: Path, local_path: Path, recursive: bool = False) -> None: + """ + File copy from a remote node + """ + self._copy(remote_path, local_path, remote_source=True, remote_target=False, recursive=recursive) + + def _copy(self, source: Path, target: Path, remote_source: bool, remote_target: bool, recursive: bool) -> None: + if remote_source: + source = f"{self._username}@{self._ip_address}:{source}" + if remote_target: + target = f"{self._username}@{self._ip_address}:{target}" + + command = ["scp", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file] + if recursive: + command.append("-r") + command.extend([str(source), str(target)]) + + shell.run_command(command) diff --git a/tests_e2e/tests/lib/virtual_machine.py b/tests_e2e/tests/lib/virtual_machine.py new file mode 100644 index 000000000..032a7e0f5 --- /dev/null +++ b/tests_e2e/tests/lib/virtual_machine.py @@ -0,0 +1,143 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# This module includes facilities to execute some operations on virtual machines and scale sets (list extensions, restart, etc). +# + +from abc import ABC, abstractmethod +from builtins import TimeoutError +from typing import Any, List + +from azure.core.polling import LROPoller +from azure.identity import DefaultAzureCredential +from azure.mgmt.compute import ComputeManagementClient +from azure.mgmt.compute.models import VirtualMachineExtension, VirtualMachineScaleSetExtension, VirtualMachineInstanceView, VirtualMachineScaleSetInstanceView +from azure.mgmt.resource import ResourceManagementClient + +from tests_e2e.tests.lib.identifiers import VmIdentifier +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.retry import execute_with_retry + + +class VirtualMachineBaseClass(ABC): + """ + Abstract base class for VirtualMachine and VmScaleSet. + + Defines the interface common to both classes and provides the implementation of some methods in that interface. + """ + def __init__(self, vm: VmIdentifier): + super().__init__() + self._identifier: VmIdentifier = vm + self._compute_client = ComputeManagementClient(credential=DefaultAzureCredential(), subscription_id=vm.subscription) + self._resource_client = ResourceManagementClient(credential=DefaultAzureCredential(), subscription_id=vm.subscription) + + @abstractmethod + def get_instance_view(self) -> Any: # Returns VirtualMachineInstanceView or VirtualMachineScaleSetInstanceView + """ + Retrieves the instance view of the virtual machine or scale set + """ + + @abstractmethod + def get_extensions(self) -> Any: # Returns List[VirtualMachineExtension] or List[VirtualMachineScaleSetExtension] + """ + Retrieves the extensions installed on the virtual machine or scale set + """ + + def restart(self, timeout=5 * 60) -> None: + """ + Restarts the virtual machine or scale set + """ + log.info("Initiating restart of %s", self._identifier) + + poller: LROPoller = execute_with_retry(self._begin_restart) + + poller.wait(timeout=timeout) + + if not poller.done(): + raise TimeoutError(f"Failed to restart {self._identifier.name} after {timeout} seconds") + + log.info("Restarted %s", self._identifier.name) + + @abstractmethod + def _begin_restart(self) -> LROPoller: + """ + Derived classes must provide the implementation for this method using their corresponding begin_restart() implementation + """ + + def __str__(self): + return f"{self._identifier}" + + +class VirtualMachine(VirtualMachineBaseClass): + def get_instance_view(self) -> VirtualMachineInstanceView: + log.info("Retrieving instance view for %s", self._identifier) + return execute_with_retry(lambda: self._compute_client.virtual_machines.get( + resource_group_name=self._identifier.resource_group, + vm_name=self._identifier.name, + expand="instanceView" + ).instance_view) + + def get_extensions(self) -> List[VirtualMachineExtension]: + log.info("Retrieving extensions for %s", self._identifier) + return execute_with_retry(lambda: self._compute_client.virtual_machine_extensions.list( + resource_group_name=self._identifier.resource_group, + 
vm_name=self._identifier.name)) + + def _begin_restart(self) -> LROPoller: + return self._compute_client.virtual_machines.begin_restart( + resource_group_name=self._identifier.resource_group, + vm_name=self._identifier.name) + + +class VmScaleSet(VirtualMachineBaseClass): + def get_instance_view(self) -> VirtualMachineScaleSetInstanceView: + log.info("Retrieving instance view for %s", self._identifier) + + # TODO: Revisit this implementation. Currently this method returns the instance view of the first VM instance available. + # For the instance view of the complete VMSS, use the compute_client.virtual_machine_scale_sets function + # https://docs.microsoft.com/en-us/python/api/azure-mgmt-compute/azure.mgmt.compute.v2019_12_01.operations.virtualmachinescalesetsoperations?view=azure-python + for vm in execute_with_retry(lambda: self._compute_client.virtual_machine_scale_set_vms.list(self._identifier.resource_group, self._identifier.name)): + try: + return execute_with_retry(lambda: self._compute_client.virtual_machine_scale_set_vms.get_instance_view( + resource_group_name=self._identifier.resource_group, + vm_scale_set_name=self._identifier.name, + instance_id=vm.instance_id)) + except Exception as e: + log.warning("Unable to retrieve instance view for scale set instance %s. Trying out other instances.\nError: %s", vm, e) + + raise Exception(f"Unable to retrieve instance view of any instances for scale set {self._identifier}") + + + @property + def vm_func(self): + return self._compute_client.virtual_machine_scale_set_vms + + @property + def extension_func(self): + return self._compute_client.virtual_machine_scale_set_extensions + + def get_extensions(self) -> List[VirtualMachineScaleSetExtension]: + log.info("Retrieving extensions for %s", self._identifier) + return execute_with_retry(lambda: self._compute_client.virtual_machine_scale_set_extensions.list( + resource_group_name=self._identifier.resource_group, + vm_scale_set_name=self._identifier.name)) + + def _begin_restart(self) -> LROPoller: + return self._compute_client.virtual_machine_scale_sets.begin_restart( + resource_group_name=self._identifier.resource_group, + vm_scale_set_name=self._identifier.name) diff --git a/tests_e2e/tests/lib/vm_extension.py b/tests_e2e/tests/lib/vm_extension.py new file mode 100644 index 000000000..eab676e75 --- /dev/null +++ b/tests_e2e/tests/lib/vm_extension.py @@ -0,0 +1,239 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# This module includes facilities to execute VM extension operations (enable, remove, etc) on single virtual machines (using +# class VmExtension) or virtual machine scale sets (using class VmssExtension). 
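+#
+# For example, a (hypothetical) sketch of enabling CustomScript on a single VM, where 'vm_id' is a VmIdentifier:
+#
+#     custom_script = VmExtension(vm_id, VmExtensionIds.CustomScript, resource_name="CustomScript")
+#     custom_script.enable(settings={'commandToExecute': "date"})
+#     custom_script.assert_instance_view()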
+# + +import uuid + +from abc import ABC, abstractmethod +from assertpy import assert_that, soft_assertions +from typing import Any, Callable, Dict, Type + +from azure.core.polling import LROPoller +from azure.mgmt.compute import ComputeManagementClient +from azure.mgmt.compute.models import VirtualMachineExtension, VirtualMachineScaleSetExtension, VirtualMachineExtensionInstanceView +from azure.identity import DefaultAzureCredential + +from tests_e2e.tests.lib.identifiers import VmIdentifier, VmExtensionIdentifier +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.retry import execute_with_retry + + +_TIMEOUT = 5 * 60 # Timeout for extension operations (in seconds) + + +class _VmExtensionBaseClass(ABC): + """ + Abstract base class for VmExtension and VmssExtension. + + Implements the operations that are common to virtual machines and scale sets. Derived classes must provide the specific types and methods for the + virtual machine or scale set. + """ + def __init__(self, vm: VmIdentifier, extension: VmExtensionIdentifier, resource_name: str): + super().__init__() + self._vm: VmIdentifier = vm + self._identifier = extension + self._resource_name = resource_name + self._compute_client: ComputeManagementClient = ComputeManagementClient(credential=DefaultAzureCredential(), subscription_id=vm.subscription) + + def enable( + self, + settings: Dict[str, Any] = None, + protected_settings: Dict[str, Any] = None, + auto_upgrade_minor_version: bool = True, + force_update: bool = False, + force_update_tag: str = None + ) -> None: + """ + Performs an enable operation on the extension. + + NOTE: 'force_update' is not a parameter of the actual ARM API. It is provided for convenience: If set to True, + the 'force_update_tag' can be left unspecified and this method will generate a random tag. + """ + if force_update_tag is not None and not force_update: + raise ValueError("If force_update_tag is provided then force_update must be set to true") + + if force_update and force_update_tag is None: + force_update_tag = str(uuid.uuid4()) + + extension_parameters = self._ExtensionType( + publisher=self._identifier.publisher, + location=self._vm.location, + type_properties_type=self._identifier.type, + type_handler_version=self._identifier.version, + auto_upgrade_minor_version=auto_upgrade_minor_version, + settings=settings, + protected_settings=protected_settings, + force_update_tag=force_update_tag) + + # Hide the protected settings from logging + if protected_settings is not None: + extension_parameters.protected_settings = "*****[REDACTED]*****" + log.info("Enabling %s", self._identifier) + log.info("%s", extension_parameters) + # Now set the actual protected settings before invoking the extension + extension_parameters.protected_settings = protected_settings + + result: VirtualMachineExtension = execute_with_retry( + lambda: self._begin_create_or_update( + self._vm.resource_group, + self._vm.name, + self._resource_name, + extension_parameters + ).result(timeout=_TIMEOUT)) + + if result.provisioning_state not in ('Succeeded', 'Updating'): + raise Exception(f"Enable {self._identifier} failed. 
+
+    def get_instance_view(self) -> VirtualMachineExtensionInstanceView:  # TODO: Check type for scale sets
+        """
+        Retrieves the instance view of the extension
+        """
+        log.info("Retrieving instance view for %s...", self._identifier)
+
+        return execute_with_retry(lambda: self._get(
+            resource_group_name=self._vm.resource_group,
+            vm_name=self._vm.name,
+            vm_extension_name=self._resource_name,
+            expand="instanceView"
+        ).instance_view)
+
+    def assert_instance_view(
+        self,
+        expected_status_code: str = "ProvisioningState/succeeded",
+        expected_version: str = None,
+        expected_message: str = None,
+        assert_function: Callable[[VirtualMachineExtensionInstanceView], None] = None
+    ) -> None:
+        """
+        Asserts that the extension's instance view matches the given expected values. If 'expected_version' and/or 'expected_message'
+        are omitted, they are not validated.
+
+        If 'assert_function' is provided, it is invoked with the instance view as its parameter. This function can be used to perform
+        additional validations.
+        """
+        instance_view = self.get_instance_view()
+
+        with soft_assertions():
+            if expected_version is not None:
+                # Compare only the major and minor versions (i.e. the first 2 items in the result of split())
+                installed_version = instance_view.type_handler_version
+                assert_that(expected_version.split(".")[0:2]).described_as("Unexpected extension version").is_equal_to(installed_version.split(".")[0:2])
+
+            assert_that(instance_view.statuses).described_as(f"Expected 1 status, got: {instance_view.statuses}").is_length(1)
+            status = instance_view.statuses[0]
+
+            if expected_message is not None:
+                assert_that(expected_message in status.message).described_as(f"{expected_message} should be in the InstanceView message ({status.message})").is_true()
+
+            assert_that(status.code).described_as("InstanceView status code").is_equal_to(expected_status_code)
+
+            if assert_function is not None:
+                assert_function(instance_view)
+
+        log.info("The instance view matches the expected values")
+
+    @abstractmethod
+    def delete(self) -> None:
+        """
+        Performs a delete operation on the extension
+        """
+
+    @property
+    @abstractmethod
+    def _ExtensionType(self) -> Type:
+        """
+        Type of the extension object for the virtual machine or scale set (i.e. VirtualMachineExtension or VirtualMachineScaleSetExtension)
+        """
+
+    @property
+    @abstractmethod
+    def _begin_create_or_update(self) -> Callable[[str, str, str, Any], LROPoller[Any]]:  # "Any" can be VirtualMachineExtension or VirtualMachineScaleSetExtension
+        """
+        The begin_create_or_update method for the virtual machine or scale set extension
+        """
+
+    @property
+    @abstractmethod
+    def _get(self) -> Callable[..., Any]:  # the returned method gets a VirtualMachineExtension or VirtualMachineScaleSetExtension
+        """
+        The get method for the virtual machine or scale set extension
+        """
+
+    def __str__(self):
+        return f"{self._identifier}"
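
[Editorial note, not part of the patch: additional checks can be layered on top of the standard validations via 'assert_function'; a sketch with made-up expected values follows. The 'level' attribute of the status object is assumed from the azure-mgmt-compute instance view models.]

    from assertpy import assert_that
    from azure.mgmt.compute.models import VirtualMachineExtensionInstanceView

    def check_status_level(instance_view: VirtualMachineExtensionInstanceView) -> None:
        # Custom validation on top of the standard status checks
        assert_that(instance_view.statuses[0].level).is_equal_to("Info")

    vm_ext.assert_instance_view(
        expected_version="1.0",    # only major.minor are compared
        expected_message="hello",  # must appear as a substring of the status message
        assert_function=check_status_level)
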
+ """ + @property + def _ExtensionType(self) -> Type: + return VirtualMachineExtension + + @property + def _begin_create_or_update(self) -> Callable[[str, str, str, VirtualMachineExtension], LROPoller[VirtualMachineExtension]]: + return self._compute_client.virtual_machine_extensions.begin_create_or_update + + @property + def _get(self) -> VirtualMachineExtension: + return self._compute_client.virtual_machine_extensions.get + + def delete(self) -> None: + log.info("Deleting %s", self._identifier) + + execute_with_retry(lambda: self._compute_client.virtual_machine_extensions.begin_delete( + self._vm.resource_group, + self._vm.name, + self._resource_name + ).wait(timeout=_TIMEOUT)) + + +class VmssExtension(_VmExtensionBaseClass): + """ + Extension operations on virtual machine scale sets. + """ + @property + def _ExtensionType(self) -> Type: + return VirtualMachineScaleSetExtension + + @property + def _begin_create_or_update(self) -> Callable[[str, str, str, VirtualMachineScaleSetExtension], LROPoller[VirtualMachineScaleSetExtension]]: + return self._compute_client.virtual_machine_scale_set_extensions.begin_create_or_update + + @property + def _get(self) -> VirtualMachineScaleSetExtension: + return self._compute_client.virtual_machine_scale_set_extensions.get + + def delete(self) -> None: # TODO: Implement this method + raise NotImplementedError() + + def delete_from_instance(self, instance_id: str) -> None: + log.info("Deleting %s from scale set instance %s", self._identifier, instance_id) + + execute_with_retry(lambda: self._compute_client.virtual_machine_scale_set_vm_extensions.begin_delete( + resource_group_name=self._vm.resource_group, + vm_scale_set_name=self._vm.name, + vm_extension_name=self._resource_name, + instance_id=instance_id + ).wait(timeout=_TIMEOUT)) + diff --git a/tests_e2e/tests/pass_test.py b/tests_e2e/tests/pass_test.py new file mode 100755 index 000000000..580db2dc0 --- /dev/null +++ b/tests_e2e/tests/pass_test.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from tests_e2e.tests.lib.agent_test import AgentTest +from tests_e2e.tests.lib.logging import log + + +class PassTest(AgentTest): + """ + A trivial test that passes. + """ + def run(self): + log.info("* PASSED *") + + +if __name__ == "__main__": + PassTest.run_from_command_line()