experiments/enqueue_all.sbatch
#!/bin/bash
#SBATCH -p dgx2q # partition (queue)
#SBATCH -N 1 # number of nodes
#SBATCH -c 4 # number of cores
#SBATCH -w g001
#SBATCH --gres=gpu:1
#       #SBATCH --mem 128G # memory pool for all cores   # Removed due to bug in Slurm 20.02.5
#SBATCH -t 4-0:00 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR

ulimit -s 10240

module purge
module load slurm/20.02.7
module load cuda11.0/blas/11.0.3
module load cuda11.0/fft/11.0.3
module load cuda11.0/nsight/11.0.3
module load cuda11.0/profiler/11.0.3
module load cuda11.0/toolkit/11.0.3

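# Derive the per-process thread count from the Slurm allocation, falling back to 4 if unset.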
if [ -n "$SLURM_CPUS_PER_TASK" ]; then
  omp_threads=$SLURM_CPUS_PER_TASK
else
  omp_threads=4
fi
export OMP_NUM_THREADS=$omp_threads        # OpenMP, NumPy
export MKL_NUM_THREADS=$omp_threads        # Intel MKL
export NUMEXPR_NUM_THREADS=$omp_threads    # NumExpr
# export OPENBLAS_NUM_THREADS=2            # Using OpenBLAS?
# export VECLIB_MAXIMUM_THREADS=2          # Apple Accelerate / vecLib

export PYTHONPATH=$PWD
echo "Starting jobs"
srun python experiments/train_predictors_for_model.py "$SLURM_ARRAY_TASK_ID" "$EXPERIMENT_MODEL"
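
# Hypothetical usage (not part of the original script): since the job reads
# $SLURM_ARRAY_TASK_ID and $EXPERIMENT_MODEL, it would typically be submitted
# as a job array with the model name exported into the environment, e.g.:
#   sbatch --array=0-9 --export=ALL,EXPERIMENT_MODEL=<model_name> experiments/enqueue_all.sbatch
# The array range and <model_name> are placeholders, not values taken from the source.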