Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: separate nrf due to nucleus exit #1658

Merged
merged 1 commit into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions scripts/loader
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ echo "Greengrass root: "${GG_ROOT}
# Directory link the loader launches from, plus the (initially empty) config file override.
LAUNCH_DIR="$GG_ROOT/alts/current"
CONFIG_FILE=""

# Log the target of the launch dir link. Both the variable and the command
# substitution are quoted so a root path containing whitespace or glob
# characters is passed to readlink as one argument and printed unmodified.
echo "Absolute launch dir: $(readlink "$LAUNCH_DIR")"

# Succeeds (exit status 0) only when "$1" is a symbolic link that resolves to a directory.
is_directory_link() {
  test -L "$1" && test -d "$1"
}
Expand Down Expand Up @@ -100,4 +102,7 @@ if [ $sigterm_received -eq 0 ] && is_directory_link "${GG_ROOT}/alts/old" && is_
flip_link "${GG_ROOT}/alts/old" "${LAUNCH_DIR}"
fi

## Touch an empty file to indicate rollback due to unexpected Nucleus exit
touch "${GG_ROOT}/work/aws.greengrass.Nucleus/restart_panic"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to rely solely on the alts/broken dir link instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relying on the alts/broken dir link is not sufficient, because any form of rollback will create these symlinks.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, I could create a new symlink pointing to the absolute broken dir instead, if letting the loader touch a file is an issue.


exit ${kernel_exit_code}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import com.aws.greengrass.util.Utils;

import java.io.IOException;
import java.nio.file.Files;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CancellationException;
Expand All @@ -32,12 +33,14 @@
import java.util.stream.Collectors;

import static com.aws.greengrass.deployment.DeploymentConfigMerger.DEPLOYMENT_ID_LOG_KEY;
import static com.aws.greengrass.deployment.DeviceConfiguration.DEFAULT_NUCLEUS_COMPONENT_NAME;
import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_RESTART;
import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ACTIVATION;
import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ROLLBACK;
import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.ROLLBACK_BOOTSTRAP;

public class KernelUpdateDeploymentTask implements DeploymentTask {
public static final String RESTART_PANIC_FILE_NAME = "restart_panic";
private final Kernel kernel;
private final Logger logger;
private final Deployment deployment;
Expand Down Expand Up @@ -149,10 +152,25 @@ private void saveDeploymentStatusDetails(Throwable failureCause) throws IOExcept

private DeploymentException getDeploymentStatusDetails() {
if (Utils.isEmpty(deployment.getStageDetails())) {
return new DeploymentException(
"Nucleus update workflow failed to restart Nucleus. See loader logs for more details",
DeploymentErrorCode.NUCLEUS_RESTART_FAILURE);
try {
if (Files.deleteIfExists(
kernel.getNucleusPaths().workPath(DEFAULT_NUCLEUS_COMPONENT_NAME)
.resolve(RESTART_PANIC_FILE_NAME).toAbsolutePath())) {
return new DeploymentException(
"Nucleus update workflow failed to restart Nucleus. See loader logs for more details",
DeploymentErrorCode.NUCLEUS_RESTART_FAILURE);
} else {
return new DeploymentException("Nucleus update workflow failed to restart Nucleus due to an "
+ "unexpected device IO error",
DeploymentErrorCode.IO_WRITE_ERROR);
}
} catch (IOException e) {
return new DeploymentException("Nucleus update workflow failed to restart Nucleus due to an "
+ "unexpected device IO error. See loader logs for more details", e,
DeploymentErrorCode.IO_WRITE_ERROR);
}
}

List<DeploymentErrorCode> errorStack = deployment.getErrorStack() == null ? Collections.emptyList()
: deployment.getErrorStack().stream().map(DeploymentErrorCode::valueOf).collect(Collectors.toList());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
import com.aws.greengrass.lifecyclemanager.KernelAlternatives;
import com.aws.greengrass.lifecyclemanager.KernelLifecycle;
import com.aws.greengrass.lifecyclemanager.exceptions.DirectoryValidationException;
import com.aws.greengrass.util.NucleusPaths;
import com.aws.greengrass.util.Pair;
import com.aws.greengrass.util.Utils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
Expand All @@ -28,6 +30,8 @@

import static com.aws.greengrass.deployment.DeploymentConfigMerger.DEPLOYMENT_ID_LOG_KEY;
import static com.aws.greengrass.deployment.DeploymentConfigMerger.MERGE_CONFIG_EVENT_KEY;
import static com.aws.greengrass.deployment.DeviceConfiguration.DEFAULT_NUCLEUS_COMPONENT_NAME;
import static com.aws.greengrass.deployment.KernelUpdateDeploymentTask.RESTART_PANIC_FILE_NAME;
import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_REBOOT;
import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_RESTART;
import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ROLLBACK;
Expand All @@ -39,6 +43,7 @@
public class KernelUpdateActivator extends DeploymentActivator {
private final BootstrapManager bootstrapManager;
private final KernelAlternatives kernelAlternatives;
private final NucleusPaths nucleusPaths;

/**
* Constructor of KernelUpdateActivator.
Expand All @@ -51,6 +56,7 @@ public KernelUpdateActivator(Kernel kernel, BootstrapManager bootstrapManager) {
super(kernel);
this.bootstrapManager = bootstrapManager;
this.kernelAlternatives = kernel.getContext().get(KernelAlternatives.class);
this.nucleusPaths = kernel.getNucleusPaths();
}

@Override
Expand Down Expand Up @@ -81,6 +87,14 @@ public void activate(Map<String, Object> newConfig, Deployment deployment,

updateConfiguration(deploymentDocument.getTimestamp(), newConfig);

// Try and delete restart panic file if it exists
try {
Files.deleteIfExists(nucleusPaths.workPath(DEFAULT_NUCLEUS_COMPONENT_NAME)
.resolve(RESTART_PANIC_FILE_NAME).toAbsolutePath());
} catch (IOException e) {
logger.atWarn().log("Unable to delete an existing restart panic file", e);
}

Path bootstrapTaskFilePath;
try {
bootstrapTaskFilePath = deploymentDirectoryManager.getBootstrapTaskFilePath();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import com.aws.greengrass.logging.api.Logger;
import com.aws.greengrass.logging.impl.LogManager;
import com.aws.greengrass.testcommons.testutilities.GGExtension;
import com.aws.greengrass.util.NucleusPaths;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
Expand All @@ -28,6 +29,9 @@
import org.mockito.junit.jupiter.MockitoExtension;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
Expand All @@ -37,7 +41,11 @@
import static com.aws.greengrass.dependency.State.FINISHED;
import static com.aws.greengrass.dependency.State.RUNNING;
import static com.aws.greengrass.dependency.State.STARTING;
import static com.aws.greengrass.deployment.DeviceConfiguration.DEFAULT_NUCLEUS_COMPONENT_NAME;
import static com.aws.greengrass.deployment.KernelUpdateDeploymentTask.RESTART_PANIC_FILE_NAME;
import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_RESTART;
import static com.aws.greengrass.deployment.errorcode.DeploymentErrorCode.IO_WRITE_ERROR;
import static com.aws.greengrass.deployment.errorcode.DeploymentErrorCode.NUCLEUS_RESTART_FAILURE;
import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ACTIVATION;
import static com.aws.greengrass.deployment.model.Deployment.DeploymentStage.KERNEL_ROLLBACK;
import static com.aws.greengrass.testcommons.testutilities.ExceptionLogProtector.ignoreExceptionOfType;
Expand Down Expand Up @@ -74,6 +82,8 @@ class KernelUpdateDeploymentTaskTest {
GreengrassService mainService;
@Mock
ComponentManager componentManager;
@Mock
NucleusPaths nucleusPaths;
ExecutorService executorService;

KernelUpdateDeploymentTask task;
Expand All @@ -88,6 +98,7 @@ void beforeEach() throws Exception {
lenient().doReturn(true).when(greengrassService).shouldAutoStart();
lenient().doReturn(Arrays.asList(greengrassService)).when(kernel).orderedDependencies();
lenient().doNothing().when(componentManager).cleanupStaleVersions();
lenient().doReturn(nucleusPaths).when(kernel).getNucleusPaths();

Topic topic = mock(Topic.class);
lenient().doReturn(1L).when(topic).getModtime();
Expand Down Expand Up @@ -159,6 +170,58 @@ void GIVEN_deployment_rollback_WHEN_service_broken_THEN_rollback_fails(Extension
assertEquals("mock activate error", result.getFailureCause().getMessage());
}

/**
 * Verifies that a rollback with no stage details and no restart panic marker in the Nucleus
 * work directory completes as FAILED_ROLLBACK_COMPLETE with IO_WRITE_ERROR.
 *
 * <p>The work path is stubbed to a freshly created, empty temporary directory instead of the
 * process working directory, so a stale marker file left in the working directory cannot
 * change the outcome of this test.
 */
@Test
void Given_deployment_rollback_WHEN_stage_details_absent_THEN_rollback_succeeds_with_io_error() throws IOException {
    doReturn(KERNEL_ROLLBACK).when(deployment).getDeploymentStage();
    doReturn(FINISHED).when(greengrassService).getState();
    doReturn(true).when(greengrassService).reachedDesiredState();
    doReturn(null).when(deployment).getStageDetails();

    Path emptyWorkDir = Files.createTempDirectory("nucleus-work");
    try {
        doReturn(emptyWorkDir).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME));

        DeploymentResult result = task.call();
        assertEquals(DeploymentResult.DeploymentStatus.FAILED_ROLLBACK_COMPLETE, result.getDeploymentStatus());
        assertThat(result.getFailureCause(), isA(DeploymentException.class));
        assertEquals("Nucleus update workflow failed to restart Nucleus due to an unexpected device IO error",
                result.getFailureCause().getMessage());
        assertEquals(IO_WRITE_ERROR, ((DeploymentException) result.getFailureCause()).getErrorCodes().get(0));
    } finally {
        // The directory is expected to still be empty, so a plain delete is enough.
        Files.deleteIfExists(emptyWorkDir);
    }
}

/**
 * Verifies that a rollback with no stage details is reported as NUCLEUS_RESTART_FAILURE when
 * the restart panic marker file is present in the Nucleus work directory.
 *
 * <p>The marker is created inside a dedicated temporary directory rather than the process
 * working directory, and cleanup runs in a {@code finally} block. A failing assertion therefore
 * cannot leave a stale marker behind that would make the next run fail on file creation.
 */
@Test
void Given_deployment_rollback_WHEN_panic_file_detected_THEN_rollback_succeeds_with_nucleus_restart_failure() throws IOException {
    doReturn(KERNEL_ROLLBACK).when(deployment).getDeploymentStage();
    doReturn(FINISHED).when(greengrassService).getState();
    doReturn(true).when(greengrassService).reachedDesiredState();
    doReturn(null).when(deployment).getStageDetails();

    Path workDir = Files.createTempDirectory("nucleus-work");
    Path panicFilePath = workDir.resolve(RESTART_PANIC_FILE_NAME);
    try {
        Files.createFile(panicFilePath);
        doReturn(workDir).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME));

        DeploymentResult result = task.call();
        assertEquals(DeploymentResult.DeploymentStatus.FAILED_ROLLBACK_COMPLETE, result.getDeploymentStatus());
        assertThat(result.getFailureCause(), isA(DeploymentException.class));
        assertEquals("Nucleus update workflow failed to restart Nucleus. See loader logs for more details",
                result.getFailureCause().getMessage());
        assertEquals(NUCLEUS_RESTART_FAILURE, ((DeploymentException) result.getFailureCause()).getErrorCodes().get(0));
    } finally {
        // Remove the marker first (in case the task under test did not), then the directory.
        Files.deleteIfExists(panicFilePath);
        Files.deleteIfExists(workDir);
    }
}

/**
 * Verifies that when resolving the Nucleus work path throws an IOException during a rollback
 * with no stage details, the task finishes as FAILED_ROLLBACK_COMPLETE, reports IO_WRITE_ERROR,
 * and keeps the original IOException as the cause of the failure.
 */
@Test
void Given_deployment_rollback_WHEN_io_exception_when_resolving_path_THEN_rollback_succeeds_with_io_error() throws IOException {
    // Work path lookup fails for the Nucleus component.
    doThrow(new IOException("mock io exception")).when(nucleusPaths).workPath(DEFAULT_NUCLEUS_COMPONENT_NAME);

    // Rollback stage with a healthy service and no stage details recorded.
    doReturn(null).when(deployment).getStageDetails();
    doReturn(KERNEL_ROLLBACK).when(deployment).getDeploymentStage();
    doReturn(true).when(greengrassService).reachedDesiredState();
    doReturn(FINISHED).when(greengrassService).getState();

    DeploymentResult rollbackResult = task.call();

    assertEquals(DeploymentResult.DeploymentStatus.FAILED_ROLLBACK_COMPLETE,
            rollbackResult.getDeploymentStatus());
    assertThat(rollbackResult.getFailureCause(), isA(DeploymentException.class));

    // Safe to cast only after the type assertion above has passed.
    DeploymentException failure = (DeploymentException) rollbackResult.getFailureCause();
    assertEquals("Nucleus update workflow failed to restart Nucleus due to an unexpected device IO error. See loader logs for more details",
            failure.getMessage());
    assertEquals(IO_WRITE_ERROR, failure.getErrorCodes().get(0));
    assertEquals("mock io exception", failure.getCause().getMessage());
}

@Test
void GIVEN_deployment_rollback_WHEN_service_healthy_THEN_rollback_succeeds() throws Exception {
doReturn(KERNEL_ROLLBACK).when(deployment).getDeploymentStage();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import com.aws.greengrass.lifecyclemanager.exceptions.DirectoryValidationException;
import com.aws.greengrass.testcommons.testutilities.GGExtension;
import com.aws.greengrass.testcommons.testutilities.TestUtils;
import com.aws.greengrass.util.NucleusPaths;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
Expand All @@ -39,6 +40,7 @@
import java.util.Map;
import java.util.concurrent.CompletableFuture;

import static com.aws.greengrass.deployment.DeviceConfiguration.DEFAULT_NUCLEUS_COMPONENT_NAME;
import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.NO_OP;
import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_REBOOT;
import static com.aws.greengrass.deployment.bootstrap.BootstrapSuccessCode.REQUEST_RESTART;
Expand Down Expand Up @@ -72,6 +74,8 @@ class KernelUpdateActivatorTest {
@Mock
KernelAlternatives kernelAlternatives;
@Mock
NucleusPaths nucleusPaths;
@Mock
CompletableFuture<DeploymentResult> totallyCompleteFuture;
@Mock
Deployment deployment;
Expand All @@ -85,9 +89,10 @@ class KernelUpdateActivatorTest {
KernelUpdateActivator kernelUpdateActivator;

@BeforeEach
void beforeEach() {
void beforeEach() throws IOException {
doReturn(deploymentDirectoryManager).when(context).get(eq(DeploymentDirectoryManager.class));
doReturn(kernelAlternatives).when(context).get(eq(KernelAlternatives.class));
doReturn(nucleusPaths).when(kernel).getNucleusPaths();
doReturn(context).when(kernel).getContext();
lenient().doReturn(config).when(kernel).getConfig();
kernelUpdateActivator = new KernelUpdateActivator(kernel, bootstrapManager);
Expand Down Expand Up @@ -120,6 +125,8 @@ void GIVEN_deployment_activate_WHEN_prepareBootstrap_fails_THEN_deployment_rollb
doReturn(bootstrapFilePath).when(deploymentDirectoryManager).getBootstrapTaskFilePath();
Path targetConfigFilePath = mock(Path.class);
doReturn(targetConfigFilePath).when(deploymentDirectoryManager).getTargetConfigFilePath();
IOException mockNucleusWorkPathIOE = new IOException("Mock Nucleus work path IOE");
doThrow(mockNucleusWorkPathIOE).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME));
IOException mockIOE = new IOException("mock error");
doThrow(mockIOE).when(kernelAlternatives).prepareBootstrap(eq("testId"));
doThrow(new IOException()).when(deploymentDirectoryManager).writeDeploymentMetadata(eq(deployment));
Expand All @@ -146,6 +153,8 @@ void GIVEN_deployment_activate_WHEN_bootstrap_task_fails_THEN_deployment_rollbac
doReturn(targetConfigFilePath).when(deploymentDirectoryManager).getTargetConfigFilePath();
ServiceUpdateException mockSUE = new ServiceUpdateException("mock error", DeploymentErrorCode.COMPONENT_BOOTSTRAP_ERROR,
DeploymentErrorType.USER_COMPONENT_ERROR);
IOException mockNucleusWorkPathIOE = new IOException("Mock Nucleus work path IOE");
doThrow(mockNucleusWorkPathIOE).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME));
doThrow(mockSUE).when(bootstrapManager).executeAllBootstrapTasksSequentially(eq(bootstrapFilePath));
doThrow(new IOException()).when(kernelAlternatives).prepareRollback();

Expand All @@ -163,10 +172,14 @@ void GIVEN_deployment_activate_WHEN_bootstrap_task_fails_THEN_deployment_rollbac
}

@Test
void GIVEN_deployment_activate_WHEN_bootstrap_finishes_THEN_request_restart() throws Exception {
void GIVEN_deployment_activate_WHEN_bootstrap_finishes_THEN_request_restart(ExtensionContext context) throws Exception {
ignoreExceptionOfType(context, IOException.class);

Path bootstrapFilePath = mock(Path.class);
doReturn(bootstrapFilePath).when(deploymentDirectoryManager).getBootstrapTaskFilePath();
Path targetConfigFilePath = mock(Path.class);
IOException mockNucleusWorkPathIOE = new IOException("Mock Nucleus work path IOE");
doThrow(mockNucleusWorkPathIOE).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME));
doReturn(targetConfigFilePath).when(deploymentDirectoryManager).getTargetConfigFilePath();
doReturn(NO_OP).when(bootstrapManager).executeAllBootstrapTasksSequentially(eq(bootstrapFilePath));
doReturn(false).when(bootstrapManager).hasNext();
Expand All @@ -179,11 +192,15 @@ void GIVEN_deployment_activate_WHEN_bootstrap_finishes_THEN_request_restart() th
}

@Test
void GIVEN_deployment_activate_WHEN_bootstrap_requires_reboot_THEN_request_reboot() throws Exception {
void GIVEN_deployment_activate_WHEN_bootstrap_requires_reboot_THEN_request_reboot(ExtensionContext context) throws Exception {
ignoreExceptionOfType(context, IOException.class);

Path bootstrapFilePath = mock(Path.class);
doReturn(bootstrapFilePath).when(deploymentDirectoryManager).getBootstrapTaskFilePath();
Path targetConfigFilePath = mock(Path.class);
doReturn(targetConfigFilePath).when(deploymentDirectoryManager).getTargetConfigFilePath();
IOException mockNucleusWorkPathIOE = new IOException("Mock Nucleus work path IOE");
doThrow(mockNucleusWorkPathIOE).when(nucleusPaths).workPath(eq(DEFAULT_NUCLEUS_COMPONENT_NAME));
doReturn(REQUEST_REBOOT).when(bootstrapManager).executeAllBootstrapTasksSequentially(eq(bootstrapFilePath));
doReturn(true).when(bootstrapManager).hasNext();

Expand Down
Loading