From b35728e81f1863227b713b1f5f136df4663b25f9 Mon Sep 17 00:00:00 2001 From: Mike Degatano Date: Thu, 14 Nov 2024 19:37:19 +0000 Subject: [PATCH] Support CGroup v2 on Supervised with manual restarts --- supervisor/addons/addon.py | 15 +++ supervisor/docker/addon.py | 20 ++- supervisor/resolution/const.py | 5 + supervisor/resolution/evaluations/cgroup.py | 9 +- .../fixups/addon_execute_restart.py | 60 +++++++++ tests/docker/test_addon.py | 121 +++++++++++++++++- .../evaluation/test_evaluate_cgroup.py | 2 +- .../fixup/test_addon_execute_restart.py | 120 +++++++++++++++++ 8 files changed, 341 insertions(+), 11 deletions(-) create mode 100644 supervisor/resolution/fixups/addon_execute_restart.py create mode 100644 tests/resolution/fixup/test_addon_execute_restart.py diff --git a/supervisor/addons/addon.py b/supervisor/addons/addon.py index 37c23a059da..53fb92d4bfd 100644 --- a/supervisor/addons/addon.py +++ b/supervisor/addons/addon.py @@ -148,6 +148,9 @@ def __init__(self, coresys: CoreSys, slug: str): self._boot_failed_issue = Issue( IssueType.BOOT_FAIL, ContextType.ADDON, reference=self.slug ) + self._device_access_missing_issue = Issue( + IssueType.DEVICE_ACCESS_MISSING, ContextType.ADDON, reference=self.slug + ) def __repr__(self) -> str: """Return internal representation.""" @@ -158,6 +161,11 @@ def boot_failed_issue(self) -> Issue: """Get issue used if start on boot failed.""" return self._boot_failed_issue + @property + def device_access_missing_issue(self) -> Issue: + """Get issue used if device access is missing and can't be automatically added.""" + return self._device_access_missing_issue + @property def state(self) -> AddonState: """Return state of the add-on.""" @@ -182,6 +190,13 @@ def state(self, new_state: AddonState) -> None: ): self.sys_resolution.dismiss_issue(self.boot_failed_issue) + # Dismiss device access missing issue if present and we stopped + if ( + new_state == AddonState.STOPPED + and self.device_access_missing_issue in self.sys_resolution.issues + ): + self.sys_resolution.dismiss_issue(self.device_access_missing_issue) + self.sys_homeassistant.websocket.send_message( { ATTR_TYPE: WSType.SUPERVISOR_EVENT, diff --git a/supervisor/docker/addon.py b/supervisor/docker/addon.py index 8d1d59f74dd..cf92fd9e04d 100644 --- a/supervisor/docker/addon.py +++ b/supervisor/docker/addon.py @@ -10,6 +10,7 @@ from pathlib import Path from typing import TYPE_CHECKING +from attr import evolve from awesomeversion import AwesomeVersion import docker from docker.types import Mount @@ -40,7 +41,7 @@ from ..hardware.data import Device from ..jobs.const import JobCondition, JobExecutionLimit from ..jobs.decorator import Job -from ..resolution.const import ContextType, IssueType, SuggestionType +from ..resolution.const import CGROUP_V2_VERSION, ContextType, IssueType, SuggestionType from ..utils.sentry import capture_exception from .const import ( ENV_TIME, @@ -787,6 +788,13 @@ async def stop(self, remove_container: bool = True) -> None: await super().stop(remove_container) + # If there is a device access issue and the container is removed, clear it + if ( + remove_container + and self.addon.device_access_missing_issue in self.sys_resolution.issues + ): + self.sys_resolution.dismiss_issue(self.addon.device_access_missing_issue) + async def _validate_trust( self, image_id: str, image: str, version: AwesomeVersion ) -> None: @@ -824,6 +832,16 @@ async def _hardware_events(self, device: Device) -> None: f"Can't process Hardware Event on {self.name}: {err!s}", _LOGGER.error ) from err + if ( + self.sys_docker.info.cgroup == CGROUP_V2_VERSION + and not self.sys_os.available + ): + self.sys_resolution.add_issue( + evolve(self.addon.device_access_missing_issue), + suggestions=[SuggestionType.EXECUTE_RESTART], + ) + return + permission = self.sys_hardware.policy.get_cgroups_rule(device) try: await self.sys_dbus.agent.cgroup.add_devices_allowed( diff --git a/supervisor/resolution/const.py b/supervisor/resolution/const.py index 13b74c34044..9a09908d1b7 100644 --- a/supervisor/resolution/const.py +++ b/supervisor/resolution/const.py @@ -15,6 +15,9 @@ DNS_CHECK_HOST = "_checkdns.home-assistant.io" DNS_ERROR_NO_DATA = 1 +CGROUP_V1_VERSION = "1" +CGROUP_V2_VERSION = "2" + class ContextType(StrEnum): """Place where somethings was happening.""" @@ -77,6 +80,7 @@ class IssueType(StrEnum): CORRUPT_FILESYSTEM = "corrupt_filesystem" DETACHED_ADDON_MISSING = "detached_addon_missing" DETACHED_ADDON_REMOVED = "detached_addon_removed" + DEVICE_ACCESS_MISSING = "device_access_missing" DISABLED_DATA_DISK = "disabled_data_disk" DNS_LOOP = "dns_loop" DNS_SERVER_FAILED = "dns_server_failed" @@ -112,6 +116,7 @@ class SuggestionType(StrEnum): EXECUTE_REMOVE = "execute_remove" EXECUTE_REPAIR = "execute_repair" EXECUTE_RESET = "execute_reset" + EXECUTE_RESTART = "execute_restart" EXECUTE_START = "execute_start" EXECUTE_STOP = "execute_stop" EXECUTE_UPDATE = "execute_update" diff --git a/supervisor/resolution/evaluations/cgroup.py b/supervisor/resolution/evaluations/cgroup.py index 186bdab37f0..9f81a6f1115 100644 --- a/supervisor/resolution/evaluations/cgroup.py +++ b/supervisor/resolution/evaluations/cgroup.py @@ -2,12 +2,9 @@ from ...const import CoreState from ...coresys import CoreSys -from ..const import UnsupportedReason +from ..const import CGROUP_V1_VERSION, CGROUP_V2_VERSION, UnsupportedReason from .base import EvaluateBase -CGROUP_V1_VERSION = "1" -CGROUP_V2_VERSION = "2" - def setup(coresys: CoreSys) -> EvaluateBase: """Initialize evaluation-setup function.""" @@ -20,9 +17,7 @@ class EvaluateCGroupVersion(EvaluateBase): @property def expected_versions(self) -> set[str]: """Return expected cgroup versions.""" - if self.coresys.os.available: - return {CGROUP_V1_VERSION, CGROUP_V2_VERSION} - return {CGROUP_V1_VERSION} + return {CGROUP_V1_VERSION, CGROUP_V2_VERSION} @property def reason(self) -> UnsupportedReason: diff --git a/supervisor/resolution/fixups/addon_execute_restart.py b/supervisor/resolution/fixups/addon_execute_restart.py new file mode 100644 index 00000000000..c09610e0f37 --- /dev/null +++ b/supervisor/resolution/fixups/addon_execute_restart.py @@ -0,0 +1,60 @@ +"""Helpers to fix addon by restarting it.""" + +import logging + +from ...coresys import CoreSys +from ...exceptions import AddonsError, ResolutionFixupError +from ..const import ContextType, IssueType, SuggestionType +from .base import FixupBase + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +def setup(coresys: CoreSys) -> FixupBase: + """Check setup function.""" + return FixupAddonExecuteRestart(coresys) + + +class FixupAddonExecuteRestart(FixupBase): + """Storage class for fixup.""" + + async def process_fixup(self, reference: str | None = None) -> None: + """Initialize the fixup class.""" + if not (addon := self.sys_addons.get(reference, local_only=True)): + _LOGGER.info("Cannot restart addon %s as it does not exist", reference) + return + + # Stop addon + try: + await addon.stop() + except AddonsError as err: + _LOGGER.error("Could not stop %s due to %s", reference, err) + raise ResolutionFixupError() from None + + # Start addon + # Removing the container has already fixed the issue and dismissed it + # So any errors on startup are just logged. We won't wait on the startup task either + try: + await addon.start() + except AddonsError as err: + _LOGGER.error("Could not restart %s due to %s", reference, err) + + @property + def suggestion(self) -> SuggestionType: + """Return a SuggestionType enum.""" + return SuggestionType.EXECUTE_RESTART + + @property + def context(self) -> ContextType: + """Return a ContextType enum.""" + return ContextType.ADDON + + @property + def issues(self) -> list[IssueType]: + """Return a IssueType enum list.""" + return [IssueType.DEVICE_ACCESS_MISSING] + + @property + def auto(self) -> bool: + """Return if a fixup can be apply as auto fix.""" + return False diff --git a/tests/docker/test_addon.py b/tests/docker/test_addon.py index edd11cdb5a3..1b710e2a73f 100644 --- a/tests/docker/test_addon.py +++ b/tests/docker/test_addon.py @@ -1,6 +1,8 @@ """Test docker addon setup.""" +import asyncio from ipaddress import IPv4Address +from pathlib import Path from typing import Any from unittest.mock import MagicMock, Mock, PropertyMock, patch @@ -12,12 +14,17 @@ from supervisor.addons.addon import Addon from supervisor.addons.model import Data from supervisor.addons.options import AddonOptions +from supervisor.const import BusEvent from supervisor.coresys import CoreSys +from supervisor.dbus.agent.cgroup import CGroup from supervisor.docker.addon import DockerAddon +from supervisor.docker.manager import DockerAPI from supervisor.exceptions import CoreDNSError, DockerNotFound +from supervisor.hardware.data import Device +from supervisor.os.manager import OSManager from supervisor.plugins.dns import PluginDns -from supervisor.resolution.const import ContextType, IssueType -from supervisor.resolution.data import Issue +from supervisor.resolution.const import ContextType, IssueType, SuggestionType +from supervisor.resolution.data import Issue, Suggestion from ..common import load_json_fixture from . import DEV_MOUNT @@ -380,3 +387,113 @@ async def test_addon_stop_delete_host_error( await docker_addon.stop() capture_exception.assert_called_once_with(err) + + +TEST_DEV_PATH = "/dev/ttyAMA0" +TEST_SYSFS_PATH = "/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0/tty/ttyACM0" +TEST_HW_DEVICE = Device( + name="ttyACM0", + path=Path("/dev/ttyAMA0"), + sysfs=Path( + "/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0/tty/ttyACM0" + ), + subsystem="tty", + parent=Path( + "/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0" + ), + links=[ + Path( + "/dev/serial/by-id/usb-Texas_Instruments_TI_CC2531_USB_CDC___0X0123456789ABCDEF-if00" + ), + Path("/dev/serial/by-path/platform-xhci-hcd.0.auto-usb-0:1.1:1.0"), + Path("/dev/serial/by-path/platform-xhci-hcd.0.auto-usbv2-0:1.1:1.0"), + ], + attributes={}, + children=[], +) + + +@pytest.mark.usefixtures("path_extern") +@pytest.mark.parametrize( + ("dev_path", "cgroup", "is_os"), + [ + (TEST_DEV_PATH, "1", True), + (TEST_SYSFS_PATH, "1", True), + (TEST_DEV_PATH, "1", False), + (TEST_SYSFS_PATH, "1", False), + (TEST_DEV_PATH, "2", True), + (TEST_SYSFS_PATH, "2", True), + ], +) +async def test_addon_new_device( + coresys: CoreSys, + install_addon_ssh: Addon, + container: MagicMock, + docker: DockerAPI, + dev_path: str, + cgroup: str, + is_os: bool, +): + """Test new device that is listed in static devices.""" + coresys.hardware.disk.get_disk_free_space = lambda x: 5000 + install_addon_ssh.data["devices"] = [dev_path] + container.id = 123 + docker.info.cgroup = cgroup + + with ( + patch.object(Addon, "write_options"), + patch.object(OSManager, "available", new=PropertyMock(return_value=is_os)), + patch.object(CGroup, "add_devices_allowed") as add_devices, + ): + await install_addon_ssh.start() + + coresys.bus.fire_event( + BusEvent.HARDWARE_NEW_DEVICE, + TEST_HW_DEVICE, + ) + await asyncio.sleep(0.01) + + add_devices.assert_called_once_with(123, "c 0:0 rwm") + + +@pytest.mark.usefixtures("path_extern") +@pytest.mark.parametrize("dev_path", [TEST_DEV_PATH, TEST_SYSFS_PATH]) +async def test_addon_new_device_no_haos( + coresys: CoreSys, + install_addon_ssh: Addon, + docker: DockerAPI, + dev_path: str, +): + """Test new device that is listed in static devices on non HAOS system with CGroup V2.""" + coresys.hardware.disk.get_disk_free_space = lambda x: 5000 + install_addon_ssh.data["devices"] = [dev_path] + docker.info.cgroup = "2" + + with ( + patch.object(Addon, "write_options"), + patch.object(OSManager, "available", new=PropertyMock(return_value=False)), + patch.object(CGroup, "add_devices_allowed") as add_devices, + ): + await install_addon_ssh.start() + + coresys.bus.fire_event( + BusEvent.HARDWARE_NEW_DEVICE, + TEST_HW_DEVICE, + ) + await asyncio.sleep(0.01) + + add_devices.assert_not_called() + + # Issue added with hardware event since access cannot be added dynamically + assert install_addon_ssh.device_access_missing_issue in coresys.resolution.issues + assert ( + Suggestion( + SuggestionType.EXECUTE_RESTART, ContextType.ADDON, reference="local_ssh" + ) + in coresys.resolution.suggestions + ) + + # Stopping and removing the container clears it as access granted on next start + await install_addon_ssh.stop() + assert coresys.resolution.issues == [] + assert coresys.resolution.suggestions == [] diff --git a/tests/resolution/evaluation/test_evaluate_cgroup.py b/tests/resolution/evaluation/test_evaluate_cgroup.py index e213ddde940..9377b28d315 100644 --- a/tests/resolution/evaluation/test_evaluate_cgroup.py +++ b/tests/resolution/evaluation/test_evaluate_cgroup.py @@ -26,7 +26,7 @@ async def test_evaluation(coresys: CoreSys): coresys.docker.info.cgroup = CGROUP_V2_VERSION await cgroup_version() - assert cgroup_version.reason in coresys.resolution.unsupported + assert cgroup_version.reason not in coresys.resolution.unsupported coresys.resolution.unsupported.clear() coresys.docker.info.cgroup = CGROUP_V1_VERSION diff --git a/tests/resolution/fixup/test_addon_execute_restart.py b/tests/resolution/fixup/test_addon_execute_restart.py new file mode 100644 index 00000000000..7dec5bd72bf --- /dev/null +++ b/tests/resolution/fixup/test_addon_execute_restart.py @@ -0,0 +1,120 @@ +"""Test fixup addon execute restart.""" + +from unittest.mock import patch + +import pytest + +from supervisor.addons.addon import Addon +from supervisor.const import AddonState +from supervisor.coresys import CoreSys +from supervisor.docker.addon import DockerAddon +from supervisor.docker.interface import DockerInterface +from supervisor.exceptions import DockerError +from supervisor.resolution.const import ContextType, IssueType, SuggestionType +from supervisor.resolution.data import Issue, Suggestion +from supervisor.resolution.fixups.addon_execute_restart import FixupAddonExecuteRestart + +from tests.const import TEST_ADDON_SLUG + +DEVICE_ACCESS_MISSING_ISSUE = Issue( + IssueType.DEVICE_ACCESS_MISSING, + ContextType.ADDON, + reference=TEST_ADDON_SLUG, +) +EXECUTE_RESTART_SUGGESTION = Suggestion( + SuggestionType.EXECUTE_RESTART, ContextType.ADDON, reference="local_ssh" +) + + +@pytest.mark.usefixtures("path_extern") +async def test_fixup(coresys: CoreSys, install_addon_ssh: Addon): + """Test fixup restarts addon.""" + install_addon_ssh.state = AddonState.STARTED + addon_execute_restart = FixupAddonExecuteRestart(coresys) + assert addon_execute_restart.auto is False + + async def mock_stop(*args, **kwargs): + install_addon_ssh.state = AddonState.STOPPED + + coresys.resolution.add_issue( + DEVICE_ACCESS_MISSING_ISSUE, + suggestions=[SuggestionType.EXECUTE_RESTART], + ) + with ( + patch.object(DockerInterface, "stop") as stop, + patch.object(DockerAddon, "run") as run, + patch.object(Addon, "_wait_for_startup"), + patch.object(Addon, "write_options"), + ): + await addon_execute_restart() + stop.assert_called_once() + run.assert_called_once() + + assert not coresys.resolution.issues + assert not coresys.resolution.suggestions + + +@pytest.mark.usefixtures("path_extern") +async def test_fixup_stop_error( + coresys: CoreSys, install_addon_ssh: Addon, caplog: pytest.LogCaptureFixture +): + """Test fixup fails on stop addon failure.""" + install_addon_ssh.state = AddonState.STARTED + addon_execute_start = FixupAddonExecuteRestart(coresys) + + coresys.resolution.add_issue( + DEVICE_ACCESS_MISSING_ISSUE, + suggestions=[SuggestionType.EXECUTE_RESTART], + ) + with ( + patch.object(DockerInterface, "stop", side_effect=DockerError), + patch.object(DockerAddon, "run") as run, + ): + await addon_execute_start() + run.assert_not_called() + + assert DEVICE_ACCESS_MISSING_ISSUE in coresys.resolution.issues + assert EXECUTE_RESTART_SUGGESTION in coresys.resolution.suggestions + assert "Could not stop local_ssh" in caplog.text + + +@pytest.mark.usefixtures("path_extern") +async def test_fixup_start_error( + coresys: CoreSys, install_addon_ssh: Addon, caplog: pytest.LogCaptureFixture +): + """Test fixup logs a start addon failure.""" + install_addon_ssh.state = AddonState.STARTED + addon_execute_start = FixupAddonExecuteRestart(coresys) + + coresys.resolution.add_issue( + DEVICE_ACCESS_MISSING_ISSUE, + suggestions=[SuggestionType.EXECUTE_RESTART], + ) + with ( + patch.object(DockerInterface, "stop") as stop, + patch.object(DockerAddon, "run", side_effect=DockerError), + patch.object(Addon, "write_options"), + ): + await addon_execute_start() + stop.assert_called_once() + + assert DEVICE_ACCESS_MISSING_ISSUE not in coresys.resolution.issues + assert EXECUTE_RESTART_SUGGESTION not in coresys.resolution.suggestions + assert "Could not restart local_ssh" in caplog.text + + +async def test_fixup_no_addon(coresys: CoreSys, caplog: pytest.LogCaptureFixture): + """Test fixup dismisses if addon is missing.""" + addon_execute_start = FixupAddonExecuteRestart(coresys) + + coresys.resolution.add_issue( + DEVICE_ACCESS_MISSING_ISSUE, + suggestions=[SuggestionType.EXECUTE_RESTART], + ) + with patch.object(DockerAddon, "stop") as stop: + await addon_execute_start() + stop.assert_not_called() + + assert not coresys.resolution.issues + assert not coresys.resolution.suggestions + assert "Cannot restart addon local_ssh as it does not exist" in caplog.text