Diff of /tools/slurm_train.sh [000000] .. [6d389a]

Switch to unified view

a b/tools/slurm_train.sh
1
#!/usr/bin/env bash
#
# Launch tools/train.py on a Slurm cluster through srun.
#
# Usage:
#   [GPUS=8] [GPUS_PER_NODE=8] [CPUS_PER_TASK=5] [SRUN_ARGS="..."] \
#     tools/slurm_train.sh PARTITION JOB_NAME CONFIG [PY_ARGS...]
#
# Positional arguments:
#   PARTITION  passed to `srun -p`
#   JOB_NAME   passed to `srun --job-name`
#   CONFIG     first argument handed to tools/train.py
#   PY_ARGS    every argument from the fourth one on, forwarded to train.py
#
# Environment (all optional):
#   GPUS           value for --ntasks (default 8)
#   GPUS_PER_NODE  value for --gres=gpu:N and --ntasks-per-node (default 8)
#   CPUS_PER_TASK  value for --cpus-per-task (default 5)
#   SRUN_ARGS      extra srun options, whitespace-separated (default empty)
#
# Note: tools/train.py is resolved relative to the current directory.

# Fail early with a readable message instead of letting srun choke on
# empty option values.
if (( $# < 3 )); then
  printf 'Usage: %s PARTITION JOB_NAME CONFIG [PY_ARGS...]\n' "${0##*/}" >&2
  exit 2
fi

# Random port in [12000, 31999], exported so child processes can read it.
export MASTER_PORT=$((12000 + RANDOM % 20000))

# Trace every command so the final srun invocation shows up in the job log.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}

# Keep the forwarded arguments as an array so that arguments containing
# whitespace or glob characters reach train.py unchanged.
PY_ARGS=("${@:4}")

# SRUN_ARGS is a single string holding several options; splitting it on
# whitespace is intentional.
# shellcheck disable=SC2206
SRUN_EXTRA=(${SRUN_ARGS})

# Prepend the directory above this script to PYTHONPATH for the srun call only.
PYTHONPATH="$(dirname "$0")/..:${PYTHONPATH:-}" \
srun -p "${PARTITION}" \
    --job-name="${JOB_NAME}" \
    --gres=gpu:"${GPUS_PER_NODE}" \
    --ntasks="${GPUS}" \
    --ntasks-per-node="${GPUS_PER_NODE}" \
    --cpus-per-task="${CPUS_PER_TASK}" \
    --kill-on-bad-exit=1 \
    "${SRUN_EXTRA[@]}" \
    python -u tools/train.py "${CONFIG}" --launcher="slurm" "${PY_ARGS[@]}"