Skip to content

Commit

Permalink
NONE: generate heapdump on crash (#2433)
Browse files Browse the repository at this point in the history
  • Loading branch information
bgvozdev authored Sep 18, 2023
1 parent 84bd063 commit e084e12
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 5 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
"minimatch": "^5.0.1",
"moment": "^2.29.4",
"moo": "^0.5.0",
"node-oom-heapdump": "^3.0.3",
"node_extra_ca_certs_mozilla_bundle": "^1.0.5",
"optional-require": "^1.1.8",
"pg": "^8.7.3",
Expand Down
86 changes: 84 additions & 2 deletions src/util/workers-health-monitor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import AWS from "aws-sdk";
import { waitUntil } from "test/utils/wait-until";
import { booleanFlag, BooleanFlags } from "config/feature-flags";
import { when } from "jest-when";
import oom from "node-oom-heapdump";
jest.mock("cluster", () => {
const workers = {
1: { send: jest.fn(), on: jest.fn(), process: { pid: 1 } },
Expand All @@ -22,6 +23,7 @@ jest.mock("utils/healthcheck-stopper");
jest.mock("services/generate-once-coredump-generator");
jest.mock("services/generate-once-per-node-headump-generator");
jest.mock("aws-sdk");
jest.mock("node-oom-heapdump");
jest.mock("config/feature-flags");

describe("workers-health-monitor", () => {
Expand Down Expand Up @@ -88,6 +90,28 @@ describe("workers-health-monitor", () => {
expect(stopHealthcheck).toBeCalled();
});

// Verifies that a worker arms node-oom-heapdump when it receives the
// "heapdump_on_crash" message.
// NOTE(review): relies on the dumps feature flag being enabled and on
// process.on being mocked by setup outside this view — confirm.
it("should charge heapdump generation on OOM", async () => {
  intervals.push(...startMonitorOnWorker(logger, {
    iAmAliveInervalMsec: 10,
    dumpIntervalMsec: 1000,
    lowHeapAvailPct: 10
  }));

  // Look the handler up by event name rather than by registration order, so the
  // test keeps working if the monitor subscribes to other process events first.
  const onMessage = (process.on as jest.Mock).mock.calls.find(call => call[0] === "message")![1];
  onMessage("heapdump_on_crash");

  expect(oom).toBeCalledWith({
    path: expect.stringMatching(/^\/tmp\/dump_heap_oom_[0-9]+$/)
  });
});

// Starting the worker monitor without delivering any "heapdump_on_crash"
// message must leave node-oom-heapdump untouched.
it("should not charge heapdump generation on OOM by default", async () => {
  const workerConfig = {
    iAmAliveInervalMsec: 10,
    dumpIntervalMsec: 1000,
    lowHeapAvailPct: 10
  };
  const startedIntervals = startMonitorOnWorker(logger, workerConfig);
  intervals.push(...startedIntervals);

  expect(oom).not.toBeCalled();
});

it("should generate core and heap dumps", async () => {
intervals.push(...startMonitorOnWorker(logger, {
iAmAliveInervalMsec: 1000,
Expand Down Expand Up @@ -144,8 +168,45 @@ describe("workers-health-monitor", () => {
try {
jest.advanceTimersByTime(50);

expect(cluster.workers[1]!.send).not.toBeCalled();
expect(cluster.workers[2]!.send).not.toBeCalled();
try {
expect(cluster.workers[1]!.send).not.toBeCalled();
} catch (err) {
// eslint-disable-next-line jest/no-conditional-expect
expect(cluster.workers[1]!.send).toBeCalledWith("heapdump_on_crash");
}
try {
expect(cluster.workers[2]!.send).not.toBeCalled();
} catch (err) {
// eslint-disable-next-line jest/no-conditional-expect
expect(cluster.workers[2]!.send).toBeCalledWith("heapdump_on_crash");
}

} finally {
clearInterval(workerSendingImAliveToMasterInterval);
}
});

it("charges one worker to trigger headump on oom", async () => {
intervals.push(startMonitorOnMaster(logger, {
pollIntervalMsecs: 10,
workerStartupTimeMsecs: 20,
workerUnresponsiveThresholdMsecs: 40,
numberOfWorkersThreshold: 1
}));

const workerSendingImAliveToMasterInterval = setInterval(() => {
if ((cluster.workers[1]!.on as jest.Mock).mock.calls.length > 0) {
(cluster.workers[1]!.on as jest.Mock).mock.calls[0][1]();
(cluster.workers[2]!.on as jest.Mock).mock.calls[0][1]();
}
}, 9);

try {
jest.advanceTimersByTime(50);

const sendMsgs = (cluster.workers[1]!.send as jest.Mock).mock.calls.map(call => call[0]).join(",") +
(cluster.workers[2]!.send as jest.Mock).mock.calls.map(call => call[0]).join(",");
expect(sendMsgs).toStrictEqual("heapdump_on_crash");

} finally {
clearInterval(workerSendingImAliveToMasterInterval);
Expand All @@ -168,6 +229,8 @@ describe("workers-health-monitor", () => {

const DUMP_READY_FILEPATH = "/tmp/dump_core.123.ready";
const DUMP_UPLOAD_FILEPATH = "/tmp/dump_core.123.ready.uploadinprogress";
const DUMP_OOM_FILEPATH = "/tmp/dump_heap_oom_1.heapsnapshot";
const DUMP_OOM_READY_FILEPATH = "/tmp/dump_heap_oom_1.heapsnapshot.ready";

const deleteFileSafe = (path: string) => {
try {
Expand All @@ -181,11 +244,30 @@ describe("workers-health-monitor", () => {
// Start every test from a clean slate: remove any dump files left behind in /tmp.
beforeEach(() => {
  const leftovers = [
    DUMP_READY_FILEPATH,
    DUMP_UPLOAD_FILEPATH,
    DUMP_OOM_FILEPATH,
    DUMP_OOM_READY_FILEPATH
  ];
  for (const filePath of leftovers) {
    deleteFileSafe(filePath);
  }
});

// Clean up after every test so dump files written by it do not leak into the next one.
afterEach(() => {
  const producedFiles = [
    DUMP_READY_FILEPATH,
    DUMP_UPLOAD_FILEPATH,
    DUMP_OOM_FILEPATH,
    DUMP_OOM_READY_FILEPATH
  ];
  for (const filePath of producedFiles) {
    deleteFileSafe(filePath);
  }
});

// A heapsnapshot left behind by a worker must end up as "*.ready" once the
// master's "exit" handler for that worker runs.
it("should prepare oom heapdump for uploading on worker crash", async () => {
  const masterConfig = {
    pollIntervalMsecs: 10,
    workerStartupTimeMsecs: 20,
    workerUnresponsiveThresholdMsecs: 40,
    numberOfWorkersThreshold: 1
  };
  intervals.push(startMonitorOnMaster(logger, masterConfig));

  jest.advanceTimersByTime(50);

  // Simulate the dump the worker with pid 1 would have written before dying.
  fs.writeFileSync(DUMP_OOM_FILEPATH, "foo");

  const workerOnMock = cluster.workers[1]!.on as jest.Mock;
  const exitRegistration = workerOnMock.mock.calls.find(call => call[0] === "exit")!;
  const onExit = exitRegistration[1];
  onExit();

  expect(fs.existsSync(DUMP_OOM_READY_FILEPATH)).toBeTruthy();
});

it("should upload dumps to S3", async () => {
Expand Down
42 changes: 39 additions & 3 deletions src/util/workers-health-monitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,14 @@ import { envVars } from "config/env";
import { GenerateOnceCoredumpGenerator } from "services/generate-once-coredump-generator";
import { GenerateOncePerNodeHeadumpGenerator } from "services/generate-once-per-node-headump-generator";
import { booleanFlag, BooleanFlags } from "config/feature-flags";
import oom from "node-oom-heapdump";

const CONF_SHUTDOWN_MSG = "shutdown";
const SHUTDOWN_MSG = "shutdown";
const HEAPDUMP_ON_CRASH_MSG = "heapdump_on_crash";

/**
 * Builds the base path under /tmp for the out-of-memory heapdump of a process.
 *
 * @param pid - process id, already converted to a string
 * @returns the path `/tmp/dump_heap_oom_<pid>`
 */
const generateHeapdumpPathOnOom = (pid: string) => "/tmp/dump_heap_oom_" + pid;

export const startMonitorOnWorker = (parentLogger: Logger, workerConfig: {
iAmAliveInervalMsec: number,
Expand Down Expand Up @@ -54,10 +60,18 @@ export const startMonitorOnWorker = (parentLogger: Logger, workerConfig: {

process.on("message", (msg: string) => {
logger.info(`worker received a message: ${msg}`);
if (msg === CONF_SHUTDOWN_MSG) {
if (msg === SHUTDOWN_MSG) {
logger.warn("shutdown received, stop healthcheck");
stopHealthcheck();
}
if (msg === HEAPDUMP_ON_CRASH_MSG) {
if (dumpsFlagValue) {
logger.warn("charging heapdump on crash");
oom({
path: generateHeapdumpPathOnOom(process.pid.toString())
});
}
}
});

const workerPingingServerInterval = setInterval(() => {
Expand Down Expand Up @@ -113,6 +127,11 @@ export const startMonitorOnMaster = (parentLogger: Logger, config: {
liveWorkers[workerPid] = Date.now();
});
worker.on("exit", (code, signal) => {
const maybeOomHeapdumpPath = generateHeapdumpPathOnOom(workerPid.toString()) + ".heapsnapshot";
if (fs.existsSync(maybeOomHeapdumpPath)) {
logger.info(`found ${maybeOomHeapdumpPath}, prepare for uploading`);
fs.renameSync(maybeOomHeapdumpPath, maybeOomHeapdumpPath + ".ready");
}
if (signal) {
logger.warn(`worker was killed by signal: ${signal}, code=${code}`);
} else if (code !== 0) {
Expand Down Expand Up @@ -150,6 +169,22 @@ export const startMonitorOnMaster = (parentLogger: Logger, config: {
}
};

// Generating a heapdump eats a lot of memory and CPU, so only ONE worker is charged to produce it on crash.
// Otherwise, if two or more workers crashed, the whole node would be put at risk.
let workerToReportOnCrashPid: string | undefined;

/**
 * Picks one registered worker at random and sends it HEAPDUMP_ON_CRASH_MSG.
 * Does nothing while workers are not ready, when a worker has already been charged,
 * or when no workers are registered.
 */
const maybeChargeWorkerToGenerateHeapdumpOnCrash = () => {
  if (!areWorkersReady() || workerToReportOnCrashPid) {
    return;
  }

  // NOTE(review): assumes registeredWorkers is keyed by worker pid, as the naming here implies — confirm.
  const pids = Object.keys(registeredWorkers);
  if (pids.length === 0) {
    return;
  }

  const candidatePid = pids[Math.floor(Math.random() * pids.length)];
  // cluster.workers is keyed by the cluster-assigned worker id, not by the OS pid, so the worker is located
  // by comparing pids instead of indexing cluster.workers with the pid.
  const worker = Object.values(cluster.workers).find(candidate => String(candidate?.process.pid) === candidatePid);
  if (!worker) {
    // Leave workerToReportOnCrashPid unset so that a later call can pick another worker.
    return;
  }

  workerToReportOnCrashPid = candidatePid;
  worker.send(HEAPDUMP_ON_CRASH_MSG);
};

const maybeRemoveDeadWorkers = () => {
if (areWorkersReady()) {
logger.info(`removing dead workers`);
Expand Down Expand Up @@ -177,7 +212,7 @@ export const startMonitorOnMaster = (parentLogger: Logger, config: {
nLiveWorkers
}, `send shutdown signal to all workers`);
for (const worker of Object.values(cluster.workers)) {
worker?.send(CONF_SHUTDOWN_MSG);
worker?.send(SHUTDOWN_MSG);
}
logRunningProcesses(logger);
} else {
Expand Down Expand Up @@ -231,6 +266,7 @@ export const startMonitorOnMaster = (parentLogger: Logger, config: {
return setInterval(() => {
registerNewWorkers(); // must be called periodically to make sure we pick up new/respawned workers
maybeSetupWorkersReadyAt();
maybeChargeWorkerToGenerateHeapdumpOnCrash();
maybeRemoveDeadWorkers();
maybeSendShutdownToAllWorkers();
maybeUploadeDumpFiles();
Expand Down
55 changes: 55 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,21 @@
semver "^7.3.5"
tar "^6.1.11"

"@mapbox/node-pre-gyp@^1.0.11":
version "1.0.11"
resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
dependencies:
detect-libc "^2.0.0"
https-proxy-agent "^5.0.0"
make-dir "^3.1.0"
node-fetch "^2.6.7"
nopt "^5.0.0"
npmlog "^5.0.1"
rimraf "^3.0.2"
semver "^7.3.5"
tar "^6.1.11"

"@nodelib/fs.scandir@2.1.5":
version "2.1.5"
resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5"
Expand Down Expand Up @@ -2133,6 +2148,14 @@ chownr@^2.0.0:
resolved "https://registry.npmjs.org/chownr/-/chownr-2.0.0.tgz#15bfbe53d2eab4cf70f18a8cd68ebe5b3cb1dece"
integrity sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==

chrome-remote-interface@^0.33.0:
version "0.33.0"
resolved "https://registry.yarnpkg.com/chrome-remote-interface/-/chrome-remote-interface-0.33.0.tgz#9140b5612ee5cdc39212cd0296d3b61ea881c47a"
integrity sha512-tv/SgeBfShXk43fwFpQ9wnS7mOCPzETnzDXTNxCb6TqKOiOeIfbrJz+2NAp8GmzwizpKa058wnU1Te7apONaYg==
dependencies:
commander "2.11.x"
ws "^7.2.0"

ci-info@^3.2.0:
version "3.3.0"
resolved "https://registry.yarnpkg.com/ci-info/-/ci-info-3.3.0.tgz#b4ed1fb6818dea4803a55c623041f9165d2066b2"
Expand Down Expand Up @@ -2253,6 +2276,11 @@ combined-stream@^1.0.8:
dependencies:
delayed-stream "~1.0.0"

commander@2.11.x:
version "2.11.0"
resolved "https://registry.yarnpkg.com/commander/-/commander-2.11.0.tgz#157152fd1e7a6c8d98a5b715cf376df928004563"
integrity sha512-b0553uYA5YAEGgyYIGYROzKQ7X5RAqedkfjiZxwi0kL1g3bOaBNNZfYkzt/CL0umgD5wc9Jec2FbB98CjkMRvQ==

commander@^2.19.0:
version "2.20.3"
resolved "https://registry.yarnpkg.com/commander/-/commander-2.20.3.tgz#fd485e84c03eb4881c20722ba48035e8531aeb33"
Expand Down Expand Up @@ -5509,6 +5537,11 @@ nan@^2.13.2, nan@^2.14.0:
resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19"
integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ==

nan@^2.17.0:
version "2.18.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.18.0.tgz#26a6faae7ffbeb293a39660e88a76b82e30b7554"
integrity sha512-W7tfG7vMOGtD30sHoZSSc/JVYiyDPEyQVso/Zz+/uQd0B0L46gtC+pHha5FFMRpil6fm/AoEcRWyOVi4+E/f8w==

nanoid@^3.3.6:
version "3.3.6"
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.6.tgz#443380c856d6e9f9824267d960b4236ad583ea4c"
Expand Down Expand Up @@ -5619,6 +5652,18 @@ node-int64@^0.4.0:
resolved "https://registry.yarnpkg.com/node-int64/-/node-int64-0.4.0.tgz#87a9065cdb355d3182d8f94ce11188b825c68a3b"
integrity sha1-h6kGXNs1XTGC2PlM4RGIuCXGijs=

node-oom-heapdump@^3.0.3:
version "3.0.3"
resolved "https://registry.yarnpkg.com/node-oom-heapdump/-/node-oom-heapdump-3.0.3.tgz#73e336b20266a6aef331700c0e4e34a631517da8"
integrity sha512-0KZNXIAozEmkPyg1LcQdHybwl81NGYjJYQyWByFnBrfJXRI2Qjm72ih2RFIM9lhXdJn3EVb4rFGui2tDkbuPtA==
dependencies:
"@mapbox/node-pre-gyp" "^1.0.11"
bindings "^1.5.0"
chrome-remote-interface "^0.33.0"
nan "^2.17.0"
require-main-filename "^2.0.0"
ws "^8.13.0"

node-releases@^2.0.2:
version "2.0.2"
resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.2.tgz#7139fe71e2f4f11b47d4d2986aaf8c48699e0c01"
Expand Down Expand Up @@ -8122,11 +8167,21 @@ write-file-atomic@^3.0.0:
signal-exit "^3.0.2"
typedarray-to-buffer "^3.1.5"

ws@^7.2.0:
version "7.5.9"
resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.9.tgz#54fa7db29f4c7cec68b1ddd3a89de099942bb591"
integrity sha512-F+P9Jil7UiSKSkppIiD94dN07AwvFixvLIj1Og1Rl9GGMuNipJnV9JzjD6XuqmAeiswGvUmNLjr5cFuXwNS77Q==

ws@^7.4.6:
version "7.5.7"
resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.7.tgz#9e0ac77ee50af70d58326ecff7e85eb3fa375e67"
integrity sha512-KMvVuFzpKBuiIXW3E4u3mySRO2/mCHSyZDJQM5NQ9Q9KHWHWh0NHgfbRMLLrceUK5qAL4ytALJbpRMjixFZh8A==

ws@^8.13.0:
version "8.14.1"
resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.1.tgz#4b9586b4f70f9e6534c7bb1d3dc0baa8b8cf01e0"
integrity sha512-4OOseMUq8AzRBI/7SLMUwO+FEDnguetSk7KMb1sHwvF2w2Wv5Hoj0nlifx8vtGsftE/jWHojPy8sMMzYLJ2G/A==

x-xss-protection@1.3.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/x-xss-protection/-/x-xss-protection-1.3.0.tgz#3e3a8dd638da80421b0e9fff11a2dbe168f6d52c"
Expand Down

0 comments on commit e084e12

Please sign in to comment.