Diff of /tools/slurm_train [000000] .. [6d389a]

Switch to unified view

a b/tools/slurm_train
1
#!/usr/bin/env bash
#
# Launch tools/train.py on a Slurm cluster through srun.
#
# Command-line options are declared below as Python argparse calls and handed
# to the `argparse` shell function provided by argparse.bash, which lives next
# to this script. The rest of the script reads the parsed values from the
# upper-cased shell variables PARTITION, GPUS, GPUS_PER_NODE, CPUS_PER_TASK,
# JOB_NAME, SRUN_ARGS, CONFIG and PY_ARGS.
# NOTE(review): presumably argparse.bash is what sets those variables and
# reads ARGPARSE_DESCRIPTION -- confirm against argparse.bash.
#
# Environment (used as option defaults): PARTITION, GPUS, GPUS_PER_NODE,
# CPUS_PER_TASK, SRUN_ARGS.

ARGPARSE_DESCRIPTION="Train recognizer on slurm cluster"

# Quote the path so the helper is still found when this script is installed
# in a directory whose name contains spaces or glob characters.
source "$(dirname -- "$0")/argparse.bash" || exit 1

# The heredoc delimiter is unquoted on purpose: the ${VAR:-default}
# expressions are expanded by the shell before the text reaches argparse, so
# environment variables become the option defaults.
argparse "$@" <<EOF || exit 1
parser.add_argument('-p', '--partition', default='${PARTITION:-""}', type=str,
                    help='partition (default: env PARTITION)')
parser.add_argument('--gpus', default=${GPUS:-8}, type=int,
                    help='number of gpus to use (default: env GPUS, or 8)')
parser.add_argument('--gpus_per_node', default=${GPUS_PER_NODE:-8}, type=int,
                    help='gpus per node (default: env GPUS_PER_NODE, or 8)')
parser.add_argument('--cpus_per_task', default=${CPUS_PER_TASK:-5}, type=int,
                    help='cpus per task (default: env CPUS_PER_TASK, or 5)')
parser.add_argument('-j', '--job_name', default='', type=str,
                    help='name of job (default: config filename)')
parser.add_argument('--srun_args', default='${SRUN_ARGS:-""}', type=str,
                    help='extra args for srun (default: env SRUN_ARGS)')
parser.add_argument('config', type=str,
                    help='config file')
parser.add_argument('py_args', nargs=argparse.REMAINDER,
                    help='extra args to tools/train.py (can be empty)')
EOF

# Strip any leading directory components from the config path.
BARECONFIG="${CONFIG##*/}"

# When no job name was supplied, fall back to the config file name with its
# last extension removed.
if [[ -z "${JOB_NAME}" ]]; then
    JOB_NAME="${BARECONFIG%.*}"
fi

# Report the effective settings before anything is submitted.
printf 'partition: %s\n' "${PARTITION}"
printf 'jobname: %s\n' "${JOB_NAME}"
printf 'config: %s\n' "${CONFIG}"
printf 'gpus: %s\n' "${GPUS}"
printf 'gpus per node: %s\n' "${GPUS_PER_NODE}"
printf 'cpus per task: %s\n' "${CPUS_PER_TASK}"
# These two are expanded with [@], so echo prints every element separated by
# single spaces (and nothing after the label when there are no elements).
echo srun args: "${SRUN_ARGS[@]}"
echo py args: "${PY_ARGS[@]}"
#######################################
# Assemble the srun command line that launches tools/train.py.
#
# The command is built once, as an array, so that the preview printed below
# and the command that is actually executed cannot drift apart, and so that
# a config path or training argument containing whitespace or glob
# characters reaches tools/train.py as a single, unmodified argument.
#
# Globals:
#   PARTITION, JOB_NAME, GPUS, GPUS_PER_NODE, CPUS_PER_TASK, SRUN_ARGS,
#   CONFIG, PY_ARGS (read)
#   TRAIN_CMD (written): array holding the full command, one word per element
# Arguments:
#   None
# Outputs:
#   None
#######################################
build_train_command() {
    # SRUN_ARGS is one string that may carry several srun options, so it is
    # intentionally left unquoted to be split into separate words.
    # PY_ARGS is expanded as an array, one element per argument.
    # shellcheck disable=SC2206
    TRAIN_CMD=(
        srun -p "${PARTITION}"
        --job-name="${JOB_NAME}"
        --gres="gpu:${GPUS_PER_NODE}"
        --ntasks="${GPUS}"
        --ntasks-per-node="${GPUS_PER_NODE}"
        --cpus-per-task="${CPUS_PER_TASK}"
        --kill-on-bad-exit=1
        ${SRUN_ARGS}
        python -u tools/train.py "${CONFIG}" --launcher="slurm" "${PY_ARGS[@]}"
    )
}

build_train_command

# Show the command before running it.
echo command to run:
echo ----
echo "${TRAIN_CMD[@]}"
echo ----
echo

# Run the job; as the last command, its exit status becomes the script's.
"${TRAIN_CMD[@]}"