Skip to content

Commit

Permalink
expand support for CHPL_LAUNCHER_GPUS_PER_NODE
Browse files Browse the repository at this point in the history
Signed-off-by: Jade Abraham <jade.abraham@hpe.com>
  • Loading branch information
jabraham17 committed Oct 29, 2024
1 parent 336a42d commit 7a0253f
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 11 deletions.
34 changes: 34 additions & 0 deletions runtime/src/launch/slurm-gasnetrun_common/slurm-gasnetrun_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#define CHPL_NODELIST_FLAG "--nodelist"
#define CHPL_PARTITION_FLAG "--partition"
#define CHPL_EXCLUDE_FLAG "--exclude"
#define CHPL_GPUS_PER_NODE_FLAG "--gpus-per-node"

#define CHPL_LPN_VAR "LOCALES_PER_NODE"

Expand All @@ -54,6 +55,7 @@ static char* walltime = NULL;
static char* nodelist = NULL;
static char* partition = NULL;
static char* exclude = NULL;
static char* gpusPerNode = NULL;
char slurmFilename[FILENAME_MAX];

/* copies of binary to run per node */
Expand Down Expand Up @@ -137,6 +139,12 @@ static void genNumLocalesOptions(FILE* slurmFile, sbatchVersion sbatch,
exclude = getenv("CHPL_LAUNCHER_EXCLUDE");
}

// command line gpus per node takes precedence over env var
if (!gpusPerNode) {
gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE");
}


if (walltime)
fprintf(slurmFile, "#SBATCH --time=%s\n", walltime);
if (nodelist)
Expand All @@ -145,6 +153,8 @@ static void genNumLocalesOptions(FILE* slurmFile, sbatchVersion sbatch,
fprintf(slurmFile, "#SBATCH --partition=%s\n", partition);
if (exclude)
fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude);
if (gpusPerNode)
fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode);
switch (sbatch) {
case slurm: {
fprintf(slurmFile, "#SBATCH --nodes=%d\n", numNodes);
Expand Down Expand Up @@ -237,6 +247,11 @@ static char* chpl_launch_create_command(int argc, char* argv[],
exclude = getenv("CHPL_LAUNCHER_EXCLUDE");
}

// command line gpus per node takes precedence over env var
if (!gpusPerNode) {
gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE");
}

// request exclusive node access by default, but allow user to override
nodeAccessEnv = getenv("CHPL_LAUNCHER_NODE_ACCESS");
if (nodeAccessEnv == NULL || strcmp(nodeAccessEnv, "exclusive") == 0) {
Expand Down Expand Up @@ -316,6 +331,8 @@ static char* chpl_launch_create_command(int argc, char* argv[],
len += snprintf(iCom+len, sizeof(iCom)-len, "--partition=%s ", partition);
if(exclude)
len += snprintf(iCom+len, sizeof(iCom)-len, "--exclude=%s ", exclude);
if(gpusPerNode)
len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ", gpusPerNode);
if(projectString && strlen(projectString) > 0)
len += snprintf(iCom+len, sizeof(iCom)-len, "--account=%s ",
projectString);
Expand Down Expand Up @@ -410,6 +427,16 @@ int chpl_launch_handle_arg(int argc, char* argv[], int argNum,
exclude = &(argv[argNum][strlen(CHPL_EXCLUDE_FLAG)+1]);
return 1;
}

// handle --gpus-per-node <gpus> or --gpus-per-node=<gpus>
if (!strcmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG)) {
gpusPerNode = argv[argNum+1];
return 2;
} else if (!strncmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG"=", strlen(CHPL_GPUS_PER_NODE_FLAG))) {
gpusPerNode = &(argv[argNum][strlen(CHPL_GPUS_PER_NODE_FLAG)+1]);
return 1;
}

return 0;
}

Expand Down Expand Up @@ -441,6 +468,13 @@ const argDescTuple_t* chpl_launch_get_help(void) {
{ "",
"(or use $CHPL_LAUNCHER_EXCLUDE)"
},
{
CHPL_GPUS_PER_NODE_FLAG " <gpus>",
"specify the number of GPUs per node"
},
{ "",
"(or use $CHPL_LAUNCHER_GPUS_PER_NODE)"
},
{ NULL, NULL },
};
return args;
Expand Down
45 changes: 34 additions & 11 deletions runtime/src/launch/slurm-srun/launch-slurm-srun.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
#define CHPL_NODELIST_FLAG "--nodelist"
#define CHPL_PARTITION_FLAG "--partition"
#define CHPL_EXCLUDE_FLAG "--exclude"
#define CHPL_GPUS_PER_NODE_FLAG "--gpus-per-node"


static char* debug = NULL;
static char* walltime = NULL;
Expand All @@ -46,6 +48,7 @@ static char* nodelist = NULL;
static char* partition = NULL;
static char* reservation = NULL;
static char* exclude = NULL;
static char* gpusPerNode = NULL;

char slurmFilename[FILENAME_MAX];

Expand Down Expand Up @@ -234,7 +237,6 @@ static char* chpl_launch_create_command(int argc, char* argv[],
char* outputfn = getenv("CHPL_LAUNCHER_SLURM_OUTPUT_FILENAME");
char* errorfn = getenv("CHPL_LAUNCHER_SLURM_ERROR_FILENAME");
char* nodeAccessEnv = getenv("CHPL_LAUNCHER_NODE_ACCESS");
char* gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE");
char* memEnv = getenv("CHPL_LAUNCHER_MEM");
const char* nodeAccessStr = NULL;
const char* memStr = NULL;
Expand Down Expand Up @@ -304,6 +306,11 @@ static char* chpl_launch_create_command(int argc, char* argv[],
exclude = getenv("CHPL_LAUNCHER_EXCLUDE");
}

// command line gpus per node takes precedence over env var
if (!gpusPerNode) {
gpusPerNode = getenv("CHPL_LAUNCHER_GPUS_PER_NODE");
}

reservation = getenv("SLURM_RESERVATION");

// request exclusive node access by default, but allow user to override
Expand Down Expand Up @@ -407,6 +414,11 @@ static char* chpl_launch_create_command(int argc, char* argv[],
fprintf(slurmFile, "#SBATCH --exclude=%s\n", exclude);
}

// Set the gpus per node if it was specified
if (gpusPerNode) {
fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode);
}

// If needed a constraint can be specified with the env var CHPL_LAUNCHER_CONSTRAINT
if (constraint) {
fprintf(slurmFile, "#SBATCH --constraint=%s\n", constraint);
Expand All @@ -417,11 +429,6 @@ static char* chpl_launch_create_command(int argc, char* argv[],
fprintf(slurmFile, "#SBATCH --account=%s\n", account);
}

// set gpus-per-node if one was provided
if (gpusPerNode && strlen(gpusPerNode) > 0) {
fprintf(slurmFile, "#SBATCH --gpus-per-node=%s\n", gpusPerNode);
}

// set the output file name to either the user specified
// name or to the binaryName.<jobID>.out if none specified
if (outputfn != NULL) {
Expand Down Expand Up @@ -556,6 +563,12 @@ static char* chpl_launch_create_command(int argc, char* argv[],
len += snprintf(iCom+len, sizeof(iCom)-len, "--exclude=%s ", exclude);
}

// Set the gpus per node if it was specified
if (gpusPerNode) {
len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ",
gpusPerNode);
}

// set any constraints
if (constraint) {
len += snprintf(iCom+len, sizeof(iCom)-len, "--constraint=%s ", constraint);
Expand All @@ -566,11 +579,6 @@ static char* chpl_launch_create_command(int argc, char* argv[],
len += snprintf(iCom+len, sizeof(iCom)-len, "--account=%s ", account);
}

// set gpus-per-node if one was provided
if (gpusPerNode && strlen(gpusPerNode) > 0) {
len += snprintf(iCom+len, sizeof(iCom)-len, "--gpus-per-node=%s ", gpusPerNode);
}

// add the (possibly wrapped) binary name
len += snprintf(iCom+len, sizeof(iCom)-len, "%s %s",
chpl_get_real_binary_wrapper(), chpl_get_real_binary_name());
Expand Down Expand Up @@ -685,6 +693,15 @@ int chpl_launch_handle_arg(int argc, char* argv[], int argNum,
return 1;
}

// handle --gpus-per-node <gpus> or --gpus-per-node=<gpus>
if (!strcmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG)) {
gpusPerNode = argv[argNum+1];
return 2;
} else if (!strncmp(argv[argNum], CHPL_GPUS_PER_NODE_FLAG"=", strlen(CHPL_GPUS_PER_NODE_FLAG))) {
gpusPerNode = &(argv[argNum][strlen(CHPL_GPUS_PER_NODE_FLAG)+1]);
return 1;
}

// handle --generate-sbatch-script
if (!strcmp(argv[argNum], CHPL_GENERATE_SBATCH_SCRIPT)) {
generate_sbatch_script = 1;
Expand Down Expand Up @@ -729,6 +746,12 @@ const argDescTuple_t* chpl_launch_get_help(void) {
{ "",
"(or use $CHPL_LAUNCHER_EXCLUDE)"
},
{ CHPL_GPUS_PER_NODE_FLAG " <gpus>",
"specify number of gpus per node"
},
{ "",
"(or use $CHPL_LAUNCHER_GPUS_PER_NODE)"
},
{ NULL, NULL },
};
return args;
Expand Down

0 comments on commit 7a0253f

Please sign in to comment.