forked from cdt-data-science/cluster-scripts
-
Notifications
You must be signed in to change notification settings - Fork 1
/
interactive_gpu
executable file
·127 lines (104 loc) · 3.9 KB
/
interactive_gpu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/bin/bash
set -e

# Pick the default wall-clock limit for the session. Most hosts get eight
# hours; the MLP cluster head node is the exception because that cluster caps
# jobs at two hours.
case "$HOSTNAME" in
  uhtred.inf.ed.ac.uk)
    default_time="02:00:00"
    ;;
  *)
    default_time="08:00:00"
    ;;
esac

# srun options applied when the user passes no arguments of their own.
# Kept as a single space-separated string because it is also printed verbatim
# in the welcome message.
default_args="--time=${default_time} --mem=14000 --cpus-per-task=4 --gres=gpu:1"
#######################################
# Print the welcome banner: a short primer on useful srun options plus the
# default options this script adds.
# Globals:   default_args (read) - interpolated into the banner.
# Arguments: none
# Outputs:   the banner on stdout.
#######################################
print_welcome () {
  # Unquoted heredoc delimiter on purpose: $default_args must be expanded.
  cat << EOM
=============================== WELCOME MESSAGE ===============================
It's good practice to set a few parameters when using srun (this command is
essentially an alias for 'srun --pty bash'). We have added some default
parameters to maximise your chance of getting an interactive session. See the
Slurm Documentation for further info about parameters for srun:
https://slurm.schedmd.com/srun.html
Default args added:
[$default_args]
Useful arg examples:
* --time=08:00:00 terminate job after 8 hours (good
practice and courteous)
* --time=1-04:00:30 terminate job after 1 day, 4 hours,
and 30 seconds
* --gres=gpu:1 give me 1 gpu
* --nodelist=charles19 put me on a specific node
* --mem=4000 give me 4G of RAM (note this is not GPU memory)
* --cpus-per-task=1 give me 1 cpu
* --partition=... there is likely a partition explicitly for
interactive jobs (called *-Interactive).
You are welcome to run interactive
jobs on other partitions if the
Interactive partition is full. To
get the full list of partitions, run "sinfo"
For more information about node configuration, see the computing support docs:
http://computing.help.inf.ed.ac.uk/cluster-computing
===============================================================================
EOM
}
#######################################
# Print the command-line usage summary for this script.
# Arguments: none ($0 of the running script is interpolated into the text).
# Outputs:   the usage text on stdout.
#######################################
print_usage () {
  # Unquoted heredoc delimiter on purpose: $0 must be expanded.
  cat << EOM
Identical to interactive, except gets you a node with a gpu too.
A shortcut to get you an interactive session. Will attempt to get a session on
a node which is in a partition with 'interactive' in its name. You can hit
ctrl-c at any time, and it will try another partition if there is one
available to try.
Usage: $0 [-h] [other args for srun]
Arguments:
-h
Print a friendly starter for 10
[other args for srun]
see documentation for srun:
https://slurm.schedmd.com/srun.html
You can specify arguments to this command exactly as you would for
srun. If you specify any arguments, no default arguments will be used.
EOM
}
#######################################
# Decide whether the caller asked for this script's help text.
# Only an argument that is exactly "-h" counts. getopts is deliberately not
# used here: it walks every character of long options, so srun arguments such
# as --nodelist=charles19 (which contains an "h") would be mistaken for -h,
# and every other character would be reported as an illegal option.
# Arguments: the script's command-line arguments.
# Returns:   0 if "-h" is present, 1 otherwise.
#######################################
_help_requested() {
  local arg
  for arg in "$@"; do
    if [ "${arg}" = "-h" ]; then
      return 0
    fi
  done
  return 1
}

if _help_requested "$@"; then
  print_usage
  print_welcome
  # Exit status 1 is kept from the original behaviour of the help path.
  exit 1
fi
#######################################
# Ask srun for an interactive bash session.
# Runs `srun ... --test-only` first (announced as a wait-time estimate), then
# echoes the command and launches the real session.
# Arguments: srun options, forwarded unchanged with one word per argument.
# Outputs:   progress messages on stdout, plus whatever srun prints.
# Returns:   exit status of the final srun invocation.
#######################################
get_interactive_session() {
  # A local array keeps each argument intact: values containing spaces are
  # not re-split, bracketed node lists are not glob-expanded, and nothing
  # leaks into the global scope.
  local -a srun_args=("$@")

  echo
  echo "Doing a --test-only to estimate wait time..."
  srun "${srun_args[@]}" --test-only --pty bash
  echo
  echo "Running the following command to get you an interactive session:"
  # [*] joins the arguments with spaces for display only.
  echo "srun ${srun_args[*]} --pty bash"
  echo "..."
  srun "${srun_args[@]}" --pty bash
}
# ------------------------------- Main dispatch -------------------------------
# With user-supplied arguments: forward them to srun as-is and stop.
# Without arguments: use the defaults, trying each partition whose name
# contains "interactive" before falling back to the default partition.

if [ $# -gt 0 ]; then
  # If the user supplies args, don't set any default args. "$@" forwards every
  # argument as its own word, so values with spaces or glob characters survive.
  get_interactive_session "$@"
  exit 0  # <-------------- EXIT 1 (user specifies args)
fi

# default_args is a space-separated option string; split it once into an array
# so it can be forwarded without further word splitting or globbing.
read -r -a args <<< "${default_args}"

# grep exits non-zero when nothing matches. Under `set -e` that would abort the
# script on this assignment and make the fallback below unreachable, hence the
# explicit `|| true`.
interactive_partitions=$(sinfo -o %R -h | grep -i interactive || true)

if [ -z "${interactive_partitions}" ]; then
  echo "No partitions found with 'interactive' in their name"
  echo "Attempting to get a session on the default partition"
  get_interactive_session "${args[@]}"
  exit 0  # <-------------- EXIT 2 (no args, no interactive partitions)
fi

# Word splitting is intended here: one partition name per whitespace-separated
# token of the sinfo output.
# shellcheck disable=SC2086
for partition in ${interactive_partitions}; do
  # A failed or interrupted attempt must not abort the script; just move on to
  # the next candidate partition.
  if get_interactive_session "--partition=${partition}" "${args[@]}"; then
    exit 0  # <-------------- EXIT 3 (no args, interactive partition found)
  fi
done

# Will only get here if all else fails:
# tried interactive partitions, trying the default partition
get_interactive_session "${args[@]}"
exit 0  # <-------------- EXIT 4 (no args, int part found but default used)