|
a |
|
b/tasks_base/train.pbs.template |
|
|
#!/bin/sh
### PBS batch job: feeds script.{0}.sh to a Docker container started with nscc-docker

### The following requests all resources on 1 DGX-1 node
#PBS -l select=1:ncpus=40:ngpus=8:mem=160G

### The "select=1" specifies the number of nodes
### The "ncpus=40:ngpus=8" asks for access to all 8 GPU cards
### If you request fewer than 8 GPUs then make the ncpus value
### five times the ngpus value, e.g. select=1:ncpus=5:ngpus=1

### Specify amount of time required
### values less than 4 hours go into a higher priority queue
#PBS -l walltime=23:59:59

### Specify DGX queue
#PBS -q dgx

### Specify project code
### e.g. 41000001 was the pilot project code
### Personal is your personal lifetime allowance
### Job will not submit unless this is changed
#PBS -P 12001577

### Specify name for job
#PBS -N train_base_{0}

### Standard output by default goes to file $PBS_JOBNAME.o$PBS_JOBID
### Standard error by default goes to file $PBS_JOBNAME.e$PBS_JOBID
### To merge standard output and error use the following
#PBS -j oe

### Start of commands to be run

# Docker image to use for container
# To see available images run command: nscc-docker images
# If image is not present, email help@nscc.sg to request pulling image into repository on all DGX nodes
image="nvcr.io/nvidia/pytorch:20.01-py3"

# Change to directory where job was submitted; abort with cd's exit status if that fails
cd "$PBS_O_WORKDIR" || exit $?
# Note that a Docker container starts in a different directory from the one the job runs in,
# so you will also need to change to the correct directory inside the container

# The "nscc-docker run $image" command runs the following Docker command:
#    nvidia-docker -u $UID:$GID -v /home:/home -v /raid:/raid --rm -i --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 $image /bin/sh
# See full list of options with "nscc-docker run -h"
# Pass the commands that you wish to run inside the container on the standard input
# Edit file script.{0}.sh as required
# "$image" is quoted so the image name always reaches nscc-docker as a single argument
nscc-docker run "$image" < script.{0}.sh # > stdout.$PBS_JOBID 2> stderr.$PBS_JOBID