NONE: generate heapdump on crash #2433

Merged (1 commit, Sep 18, 2023)
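This PR charges one worker per node to generate a heap dump if it crashes with an out-of-memory error. The master's monitor loop picks a single registered worker and sends it a "heapdump_on_crash" message; that worker arms node-oom-heapdump, and when a worker exits the master renames any resulting /tmp/dump_heap_oom_<pid>.heapsnapshot file to *.ready so the existing dump-upload loop picks it up. Below is a condensed sketch of that flow, not the PR code itself; it assumes Node's cluster primary/worker split and mirrors the message name, paths, and single-worker selection from the diff that follows.

```ts
import cluster from "cluster";
import fs from "fs";
import oom from "node-oom-heapdump";

const HEAPDUMP_ON_CRASH_MSG = "heapdump_on_crash";
const heapdumpPathOnOom = (pid: string) => `/tmp/dump_heap_oom_${pid}`;

if (cluster.isPrimary) {
  // Master: charge exactly one worker, then watch for its dump when it exits.
  const worker = Object.values(cluster.workers ?? {}).find((w) => !!w);
  if (worker) {
    worker.send(HEAPDUMP_ON_CRASH_MSG);
    worker.on("exit", () => {
      // node-oom-heapdump writes `${path}.heapsnapshot`; rename it so the upload loop sees it.
      const dump = heapdumpPathOnOom(String(worker.process.pid)) + ".heapsnapshot";
      if (fs.existsSync(dump)) {
        fs.renameSync(dump, dump + ".ready");
      }
    });
  }
} else {
  // Worker: arm the OOM heapdump only when the master asks for it.
  process.on("message", (msg) => {
    if (msg === HEAPDUMP_ON_CRASH_MSG) {
      oom({ path: heapdumpPathOnOom(String(process.pid)) });
    }
  });
}
```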
1 change: 1 addition & 0 deletions package.json
@@ -97,6 +97,7 @@
"minimatch": "^5.0.1",
"moment": "^2.29.4",
"moo": "^0.5.0",
"node-oom-heapdump": "^3.0.3",
"node_extra_ca_certs_mozilla_bundle": "^1.0.5",
"optional-require": "^1.1.8",
"pg": "^8.7.3",
86 changes: 84 additions & 2 deletions src/util/workers-health-monitor.test.ts
@@ -8,6 +8,7 @@ import AWS from "aws-sdk";
import { waitUntil } from "test/utils/wait-until";
import { booleanFlag, BooleanFlags } from "config/feature-flags";
import { when } from "jest-when";
import oom from "node-oom-heapdump";
jest.mock("cluster", () => {
const workers = {
1: { send: jest.fn(), on: jest.fn(), process: { pid: 1 } },
@@ -22,6 +23,7 @@ jest.mock("utils/healthcheck-stopper");
jest.mock("services/generate-once-coredump-generator");
jest.mock("services/generate-once-per-node-headump-generator");
jest.mock("aws-sdk");
jest.mock("node-oom-heapdump");
jest.mock("config/feature-flags");

describe("workers-health-monitor", () => {
@@ -88,6 +90,28 @@ describe("workers-health-monitor", () => {
expect(stopHealthcheck).toBeCalled();
});

it("should charge heapdump generation on OOM", async () => {
intervals.push(...startMonitorOnWorker(logger, {
iAmAliveInervalMsec: 10,
dumpIntervalMsec: 1000,
lowHeapAvailPct: 10
}));
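// simulate the master charging this worker by invoking the "message" listener registered via process.on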
(process.on as jest.Mock).mock.calls[0][1]("heapdump_on_crash");

expect(oom).toBeCalledWith({
path: expect.stringMatching(/^\/tmp\/dump_heap_oom_[0-9]+$/)
});
});

it("should not charge heapdump generation on OOM by default", async () => {
intervals.push(...startMonitorOnWorker(logger, {
iAmAliveInervalMsec: 10,
dumpIntervalMsec: 1000,
lowHeapAvailPct: 10
}));
expect(oom).not.toBeCalled();
});

it("should generate core and heap dumps", async () => {
intervals.push(...startMonitorOnWorker(logger, {
iAmAliveInervalMsec: 1000,
@@ -144,8 +168,45 @@ describe("workers-health-monitor", () => {
try {
jest.advanceTimersByTime(50);

expect(cluster.workers[1]!.send).not.toBeCalled();
expect(cluster.workers[2]!.send).not.toBeCalled();
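// with crash-heapdump charging enabled, one worker may legitimately receive "heapdump_on_crash"; any other send should still fail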
try {
expect(cluster.workers[1]!.send).not.toBeCalled();
} catch (err) {
// eslint-disable-next-line jest/no-conditional-expect
expect(cluster.workers[1]!.send).toBeCalledWith("heapdump_on_crash");
}
try {
expect(cluster.workers[2]!.send).not.toBeCalled();
} catch (err) {
// eslint-disable-next-line jest/no-conditional-expect
expect(cluster.workers[2]!.send).toBeCalledWith("heapdump_on_crash");
}

} finally {
clearInterval(workerSendingImAliveToMasterInterval);
}
});

it("charges one worker to trigger headump on oom", async () => {
intervals.push(startMonitorOnMaster(logger, {
pollIntervalMsecs: 10,
workerStartupTimeMsecs: 20,
workerUnresponsiveThresholdMsecs: 40,
numberOfWorkersThreshold: 1
}));

const workerSendingImAliveToMasterInterval = setInterval(() => {
if ((cluster.workers[1]!.on as jest.Mock).mock.calls.length > 0) {
(cluster.workers[1]!.on as jest.Mock).mock.calls[0][1]();
(cluster.workers[2]!.on as jest.Mock).mock.calls[0][1]();
}
}, 9);

try {
jest.advanceTimersByTime(50);

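// exactly one of the two workers should have been charged with "heapdump_on_crash"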
const sendMsgs = (cluster.workers[1]!.send as jest.Mock).mock.calls.map(call => call[0]).join(",") +
(cluster.workers[2]!.send as jest.Mock).mock.calls.map(call => call[0]).join(",");
expect(sendMsgs).toStrictEqual("heapdump_on_crash");

} finally {
clearInterval(workerSendingImAliveToMasterInterval);
@@ -168,6 +229,8 @@ describe("workers-health-monitor", () => {

const DUMP_READY_FILEPATH = "/tmp/dump_core.123.ready";
const DUMP_UPLOAD_FILEPATH = "/tmp/dump_core.123.ready.uploadinprogress";
const DUMP_OOM_FILEPATH = "/tmp/dump_heap_oom_1.heapsnapshot";
const DUMP_OOM_READY_FILEPATH = "/tmp/dump_heap_oom_1.heapsnapshot.ready";

const deleteFileSafe = (path: string) => {
try {
@@ -181,11 +244,30 @@
beforeEach(() => {
deleteFileSafe(DUMP_READY_FILEPATH);
deleteFileSafe(DUMP_UPLOAD_FILEPATH);
deleteFileSafe(DUMP_OOM_FILEPATH);
deleteFileSafe(DUMP_OOM_READY_FILEPATH);
});

afterEach(() => {
deleteFileSafe(DUMP_READY_FILEPATH);
deleteFileSafe(DUMP_UPLOAD_FILEPATH);
deleteFileSafe(DUMP_OOM_FILEPATH);
deleteFileSafe(DUMP_OOM_READY_FILEPATH);
});

it("should prepare oom heapdump for uploading on worker crash", async () => {
intervals.push(startMonitorOnMaster(logger, {
pollIntervalMsecs: 10,
workerStartupTimeMsecs: 20,
workerUnresponsiveThresholdMsecs: 40,
numberOfWorkersThreshold: 1
}));

jest.advanceTimersByTime(50);

fs.writeFileSync(DUMP_OOM_FILEPATH, "foo");
(cluster.workers[1]!.on as jest.Mock).mock.calls.find(call => call[0] === "exit")![1]();
expect(fs.existsSync(DUMP_OOM_READY_FILEPATH)).toBeTruthy();
});

it("should upload dumps to S3", async () => {
42 changes: 39 additions & 3 deletions src/util/workers-health-monitor.ts
@@ -11,8 +11,14 @@ import { envVars } from "config/env";
import { GenerateOnceCoredumpGenerator } from "services/generate-once-coredump-generator";
import { GenerateOncePerNodeHeadumpGenerator } from "services/generate-once-per-node-headump-generator";
import { booleanFlag, BooleanFlags } from "config/feature-flags";
import oom from "node-oom-heapdump";

const CONF_SHUTDOWN_MSG = "shutdown";
const SHUTDOWN_MSG = "shutdown";
const HEAPDUMP_ON_CRASH_MSG = "heapdump_on_crash";

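// node-oom-heapdump appends ".heapsnapshot" to this path when the dump is written; the master
// looks for `${path}.heapsnapshot` in its "exit" handler below.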
const generateHeapdumpPathOnOom = (pid: string) => {
return `/tmp/dump_heap_oom_${pid}`;
};

export const startMonitorOnWorker = (parentLogger: Logger, workerConfig: {
iAmAliveInervalMsec: number,
@@ -54,10 +60,18 @@ export const startMonitorOnWorker = (parentLogger: Logger, workerConfig: {

process.on("message", (msg: string) => {
logger.info(`worker received a message: ${msg}`);
if (msg === CONF_SHUTDOWN_MSG) {
if (msg === SHUTDOWN_MSG) {
logger.warn("shutdown received, stop healthcheck");
stopHealthcheck();
}
if (msg === HEAPDUMP_ON_CRASH_MSG) {
if (dumpsFlagValue) {
logger.warn("charging heapdump on crash");
oom({
path: generateHeapdumpPathOnOom(process.pid.toString())
});
}
}
});

const workerPingingServerInterval = setInterval(() => {
@@ -113,6 +127,11 @@ export const startMonitorOnMaster = (parentLogger: Logger, config: {
liveWorkers[workerPid] = Date.now();
});
worker.on("exit", (code, signal) => {
const maybeOomHeapdumpPath = generateHeapdumpPathOnOom(workerPid.toString()) + ".heapsnapshot";
if (fs.existsSync(maybeOomHeapdumpPath)) {
logger.info(`found ${maybeOomHeapdumpPath}, preparing for upload`);
fs.renameSync(maybeOomHeapdumpPath, maybeOomHeapdumpPath + ".ready");
}
if (signal) {
logger.warn(`worker was killed by signal: ${signal}, code=${code}`);
} else if (code !== 0) {
@@ -150,6 +169,22 @@ export const startMonitorOnMaster = (parentLogger: Logger, config: {
}
};

// Given that heapdump generation eats a lot of memory and CPU, charge only one worker. Otherwise, if 2 or more
// workers crash, that would put the whole node at risk.
let workerToReportOnCrashPid: string | undefined;
const maybeChargeWorkerToGenerateHeapdumpOnCrash = () => {
if (areWorkersReady() && !workerToReportOnCrashPid && Object.keys(registeredWorkers).length > 0) {
const pids = Object.keys(registeredWorkers);
workerToReportOnCrashPid = pids[Math.floor(Math.random() * pids.length)];
const worker = cluster.workers[workerToReportOnCrashPid];
if (!worker) {
workerToReportOnCrashPid = undefined;
return;
}
worker.send(HEAPDUMP_ON_CRASH_MSG);
}
};

const maybeRemoveDeadWorkers = () => {
if (areWorkersReady()) {
logger.info(`removing dead workers`);
@@ -177,7 +212,7 @@ export const startMonitorOnMaster = (parentLogger: Logger, config: {
nLiveWorkers
}, `send shutdown signal to all workers`);
for (const worker of Object.values(cluster.workers)) {
worker?.send(CONF_SHUTDOWN_MSG);
worker?.send(SHUTDOWN_MSG);
}
logRunningProcesses(logger);
} else {
@@ -231,6 +266,7 @@ export const startMonitorOnMaster = (parentLogger: Logger, config: {
return setInterval(() => {
registerNewWorkers(); // must be called periodically to make sure we pick up new/respawned workers
maybeSetupWorkersReadyAt();
maybeChargeWorkerToGenerateHeapdumpOnCrash();
maybeRemoveDeadWorkers();
maybeSendShutdownToAllWorkers();
maybeUploadeDumpFiles();
55 changes: 55 additions & 0 deletions yarn.lock
@@ -648,6 +648,21 @@
semver "^7.3.5"
tar "^6.1.11"

"@mapbox/node-pre-gyp@^1.0.11":
version "1.0.11"
resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
dependencies:
detect-libc "^2.0.0"
https-proxy-agent "^5.0.0"
make-dir "^3.1.0"
node-fetch "^2.6.7"
nopt "^5.0.0"
npmlog "^5.0.1"
rimraf "^3.0.2"
semver "^7.3.5"
tar "^6.1.11"

"@nodelib/fs.scandir@2.1.5":
version "2.1.5"
resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5"
@@ -2133,6 +2148,14 @@ chownr@^2.0.0:
resolved "https://registry.npmjs.org/chownr/-/chownr-2.0.0.tgz#15bfbe53d2eab4cf70f18a8cd68ebe5b3cb1dece"
integrity sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==

chrome-remote-interface@^0.33.0:
version "0.33.0"
resolved "https://registry.yarnpkg.com/chrome-remote-interface/-/chrome-remote-interface-0.33.0.tgz#9140b5612ee5cdc39212cd0296d3b61ea881c47a"
integrity sha512-tv/SgeBfShXk43fwFpQ9wnS7mOCPzETnzDXTNxCb6TqKOiOeIfbrJz+2NAp8GmzwizpKa058wnU1Te7apONaYg==
dependencies:
commander "2.11.x"
ws "^7.2.0"

ci-info@^3.2.0:
version "3.3.0"
resolved "https://registry.yarnpkg.com/ci-info/-/ci-info-3.3.0.tgz#b4ed1fb6818dea4803a55c623041f9165d2066b2"
@@ -2253,6 +2276,11 @@ combined-stream@^1.0.8:
dependencies:
delayed-stream "~1.0.0"

commander@2.11.x:
version "2.11.0"
resolved "https://registry.yarnpkg.com/commander/-/commander-2.11.0.tgz#157152fd1e7a6c8d98a5b715cf376df928004563"
integrity sha512-b0553uYA5YAEGgyYIGYROzKQ7X5RAqedkfjiZxwi0kL1g3bOaBNNZfYkzt/CL0umgD5wc9Jec2FbB98CjkMRvQ==

commander@^2.19.0:
version "2.20.3"
resolved "https://registry.yarnpkg.com/commander/-/commander-2.20.3.tgz#fd485e84c03eb4881c20722ba48035e8531aeb33"
@@ -5509,6 +5537,11 @@ nan@^2.13.2, nan@^2.14.0:
resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19"
integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ==

nan@^2.17.0:
version "2.18.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.18.0.tgz#26a6faae7ffbeb293a39660e88a76b82e30b7554"
integrity sha512-W7tfG7vMOGtD30sHoZSSc/JVYiyDPEyQVso/Zz+/uQd0B0L46gtC+pHha5FFMRpil6fm/AoEcRWyOVi4+E/f8w==

nanoid@^3.3.6:
version "3.3.6"
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.6.tgz#443380c856d6e9f9824267d960b4236ad583ea4c"
@@ -5619,6 +5652,18 @@ node-int64@^0.4.0:
resolved "https://registry.yarnpkg.com/node-int64/-/node-int64-0.4.0.tgz#87a9065cdb355d3182d8f94ce11188b825c68a3b"
integrity sha1-h6kGXNs1XTGC2PlM4RGIuCXGijs=

node-oom-heapdump@^3.0.3:
version "3.0.3"
resolved "https://registry.yarnpkg.com/node-oom-heapdump/-/node-oom-heapdump-3.0.3.tgz#73e336b20266a6aef331700c0e4e34a631517da8"
integrity sha512-0KZNXIAozEmkPyg1LcQdHybwl81NGYjJYQyWByFnBrfJXRI2Qjm72ih2RFIM9lhXdJn3EVb4rFGui2tDkbuPtA==
dependencies:
"@mapbox/node-pre-gyp" "^1.0.11"
bindings "^1.5.0"
chrome-remote-interface "^0.33.0"
nan "^2.17.0"
require-main-filename "^2.0.0"
ws "^8.13.0"

node-releases@^2.0.2:
version "2.0.2"
resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.2.tgz#7139fe71e2f4f11b47d4d2986aaf8c48699e0c01"
@@ -8122,11 +8167,21 @@ write-file-atomic@^3.0.0:
signal-exit "^3.0.2"
typedarray-to-buffer "^3.1.5"

ws@^7.2.0:
version "7.5.9"
resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.9.tgz#54fa7db29f4c7cec68b1ddd3a89de099942bb591"
integrity sha512-F+P9Jil7UiSKSkppIiD94dN07AwvFixvLIj1Og1Rl9GGMuNipJnV9JzjD6XuqmAeiswGvUmNLjr5cFuXwNS77Q==

ws@^7.4.6:
version "7.5.7"
resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.7.tgz#9e0ac77ee50af70d58326ecff7e85eb3fa375e67"
integrity sha512-KMvVuFzpKBuiIXW3E4u3mySRO2/mCHSyZDJQM5NQ9Q9KHWHWh0NHgfbRMLLrceUK5qAL4ytALJbpRMjixFZh8A==

ws@^8.13.0:
version "8.14.1"
resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.1.tgz#4b9586b4f70f9e6534c7bb1d3dc0baa8b8cf01e0"
integrity sha512-4OOseMUq8AzRBI/7SLMUwO+FEDnguetSk7KMb1sHwvF2w2Wv5Hoj0nlifx8vtGsftE/jWHojPy8sMMzYLJ2G/A==

x-xss-protection@1.3.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/x-xss-protection/-/x-xss-protection-1.3.0.tgz#3e3a8dd638da80421b0e9fff11a2dbe168f6d52c"