|
a |
|
b/tools/slurm_train.sh |
|
|
1 |
#!/usr/bin/env bash
#
# Launch tools/train.py through srun with the "slurm" launcher.
#
# Usage:
#   [GPUS=8] [GPUS_PER_NODE=8] [CPUS_PER_TASK=5] [SRUN_ARGS="..."] \
#     slurm_train.sh PARTITION JOB_NAME CONFIG [PY_ARGS...]
#
# Positional arguments:
#   PARTITION  value given to `srun -p`
#   JOB_NAME   value given to `srun --job-name`
#   CONFIG     first argument handed to tools/train.py
#   PY_ARGS    every argument from the fourth one on, forwarded to train.py
#
# Environment variables (default in parentheses):
#   GPUS (8)           -> srun --ntasks
#   GPUS_PER_NODE (8)  -> srun --gres=gpu:N and --ntasks-per-node
#   CPUS_PER_TASK (5)  -> srun --cpus-per-task
#   SRUN_ARGS ("")     -> extra srun flags, split on whitespace

# Fail early with a usage message instead of building a malformed srun command.
if (( $# < 3 )); then
  printf 'Usage: %s PARTITION JOB_NAME CONFIG [PY_ARGS...]\n' "${0##*/}" >&2
  exit 2
fi

# Random value in [12000, 31999], exported so child processes inherit it.
# NOTE(review): presumably consumed by the training processes as the
# rendezvous port -- confirm against tools/train.py.
export MASTER_PORT=$((12000 + RANDOM % 20000))

# Trace every command from here on so the final srun invocation is visible.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}
# Any arguments from the fourth one on are captured here. An array keeps each
# argument intact, including ones that contain whitespace or glob characters.
PY_ARGS=("${@:4}")

# The parent of this script's directory is prepended to PYTHONPATH for the
# srun invocation only.
# SRUN_ARGS is intentionally left unquoted: it holds a whitespace-separated
# list of extra flags that must be split into separate words.
# shellcheck disable=SC2086
PYTHONPATH="$(dirname "$0")/..:${PYTHONPATH:-}" \
srun -p "${PARTITION}" \
  --job-name="${JOB_NAME}" \
  --gres=gpu:"${GPUS_PER_NODE}" \
  --ntasks="${GPUS}" \
  --ntasks-per-node="${GPUS_PER_NODE}" \
  --cpus-per-task="${CPUS_PER_TASK}" \
  --kill-on-bad-exit=1 \
  ${SRUN_ARGS} \
  python -u tools/train.py "${CONFIG}" --launcher="slurm" "${PY_ARGS[@]}"