[d986f2]: shell_scripts/job_starter.sh

#!/bin/bash
#wrapper for cluster_runner_meddec.sh which copies job-specific, frequently changing files (e.g. configs.py) before the actual bsub job
#is submitted, since the job might pend in the queue before execution --> hazard of job-specific files being unintentionally changed during the queue wait time.
#positional args:
# -arg #1 identifies the folder name of the dataset-related code (e.g. >toy_exp< or >lidc_exp<) within the code source directory
# -arg #2 is the experiment name and the first part of the job name
# optional args and flags:
# -c / --create: (flag) whether to create the exp, i.e., if this is a new start of the exp with configs etc from source dir.
# -f / --folds FOLDS: (option) fold(s) to run on (FOLDS is a single int or a quoted string of space-separated ints), default None (--> set to all folds in config)
# -m / --mode MODE: (option) string, one of "train", "train_test", "test", defaults to "train_test"
# -p / --exp_parent_dir: (option) name of parent_dir rel to dataset folder on cluster. exp_dir is exp_parent_dir/exp_name, if not given defaults to "experiments"
# -q / --queue: (option) which queue (-q parameter for bsub) to send job to. default: gputest. others: gputest-short (max 5h jobs).
# -w / --which: (option) same as argument -m to bsub; host or host list (string separated by space) to send the job to.
# use nodenameXX where XX==nr of node or nodenameXX,nodenameYY,... or nodename[XX-YY]. nodename is e.g. e132-comp.
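# -R / --resource: (option) extra bsub resource/requirement string, appended verbatim to the bsub command.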
# --gmem: (option) how much gpu memory to request for the job (in gigabytes), defaults to 11. Currently, the smaller nodes have 11.9G, the larger ones 31.7G.
# --resume: (flag) requires an explicit --folds argument; if set, resumes from the checkpoint in exp_dir/fold_x/last_state.pth.
# --no_parallel: (flag) if set, folds won't start as parallel jobs on cluster, but run sequentially in one job.
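# example call (illustrative only; "my_exp" is a placeholder experiment name, adjust folds/queue/gmem to your setup):
#   bash job_starter.sh lidc_exp my_exp --create --folds "0 1 2" --mode train_test --queue gputest --gmem 11.9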
dataset_name="${1}"
exp_name="${2}"
#arguments that were not passed (e.g. ${7} if no seventh argument was given) are null.
if [ ! -z "${20}" ]; then #-z checks for the null string; 2 positional args plus all options amount to at most 19 arguments
echo "Error: Received too many arguments."
exit 1
fi
#parse optional args/flags: each handled option is shifted away so that the next option moves up to position ${3}
while [ ${#} -gt 2 ]; do
case "${3}" in
-c|--create)
create_exp="c"
shift
;;
-f|--folds)
folds="${4}"
shift; shift
;;
-m|--mode)
mode="${4}"
shift; shift
;;
-p|--exp_parent_dir)
exp_parent_dir="${4}"
shift; shift
;;
-q|--queue)
queue="${4}"
shift; shift
;;
-w|--which)
which="${4}"
shift; shift
;;
-R|--resource)
resource="${4}"
shift; shift
;;
--gmem)
gmem="${4}"
shift; shift
;;
--resume)
resume=true
shift
;;
--no_parallel)
no_parallel=true
shift
;;
*)
echo "Invalid argument/option passed: ${3}"
exit 1
;;
esac
done
# default values
if [ -z "${exp_parent_dir}" ]; then
exp_parent_dir="experiments"
fi
if [ -z "${mode}" ]; then
mode="train_test"
fi
if [ -z "${queue}" ]; then
queue="gputest"
fi
if [ -z "${gmem}" ]; then
gmem="11"
fi
root_dir=/home/ramien #assumes /home/ramien exists
#medicaldetectiontoolkit
source_dir=${root_dir}/mdt-public
dataset_abs_path=${source_dir}/experiments/${dataset_name} #dataset_name is the first argument passed to this script
exp_parent_dir=/datasets/datasets_ramien/${dataset_name}/${exp_parent_dir}
exp_dir=${exp_parent_dir}/${exp_name}
#activate virtualenv that has all the packages:
source_dl="module load python/3.7.0; module load gcc/7.2.0; source ${root_dir}/.virtualenvs/mdt/bin/activate;"
eval ${source_dl}
# directly from prep node:
create_cmd="python ${source_dir}/exec.py --server_env --mode create_exp --exp_dir ${exp_dir} --exp_source ${dataset_abs_path};"
#if create_exp, check if would overwrite existing exp_dir
if [ ! -z "${create_exp}" ] && [ "${create_exp}" = "c" ]; then #-n doesn't work as a replacement for ! -z here
if [ -d ${exp_dir} ]; then
echo "Please confirm to overwrite exp ${exp_name} settings, (Y/n): "; read confirmation
if ([ "${confirmation}" = "y" ] || [ "${confirmation}" = "yes" ] || [ "${confirmation}" = "Y" ] || [ -z "${confirmation}" ]); then
echo "Overwriting ${exp_name}"
else
echo "Exiting due to overwrite denial. Adjust options."
exit
fi
fi
#echo "opts: name ${exp_name}, ${source_dir}/exec.py --server_env --mode create_exp --exp_dir ${exp_dir} --exp_source ${dataset_abs_path}"
echo "Creating ${exp_name}"
eval ${create_cmd}
else
if [ ! -d ${exp_dir} ]; then
echo "Experiment directory ${exp_dir} does not exist."
echo "Run create_exp? (Y/n): "; read confirmation
if ([ "${confirmation}" = "y" ] || [ "${confirmation}" = "yes" ] || [ "${confirmation}" = "Y" ] || [ -z "${confirmation}" ]); then
echo "Creating ${exp_name}"
eval ${create_cmd}
fi
fi
fi
#if not create_exp, check if would overwrite existing folds (possibly valuable trained params!)
if [ -z ${create_exp} ] && ([ ${mode} = "train" ] || [ ${mode} = "train_test" ]) && [ -z "${resume}" ]; then
for f in ${folds}; do #if folds is null this check won't apply and folds will be quietly overwritten.
if [ -d ${exp_dir}/fold_${f} ]; then #-d checks if is dir
echo "please confirm to overwrite fold_${f}, (Y/n):"; read confirmation
if ([ "${confirmation}" = "y" ] || [ "${confirmation}" = "yes" ] || [ "${confirmation}" = "Y" ] || [ -z "${confirmation}" ]); then
echo "Overwriting "${exp_name}/fold_${f}
else
echo "Exiting due to overwrite denial. Adjust options."
exit
fi
fi
done
fi
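# assemble base bsub options: send a job report mail (-N), target queue, and request one GPU in exclusive-process mode with ${gmem}G of GPU memory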
bsub_opts="bsub -N -q ${queue} -gpu num=1:j_exclusive=yes:mode=exclusive_process:gmem=${gmem}G"
if [ ! -z "$resource" ]; then
bsub_opts=$bsub_opts $resource
fi
if [ ! -z ${which} ]; then
bsub_opts="${bsub_opts} -m ${which}"
fi
#----- parallel/separate fold jobs (each fold in a single job) -----------
if [ ! -z "${folds}" ] && [ -z ${no_parallel} ]; then #WHY do i need to convert to string again?
for f in ${folds}; do
out_file=${exp_dir}/logs/fold_${f}_lsf_output.out
bsub_opts="$bsub_opts -J '${dataset_name} ${exp_name} fold ${f} ${mode}' -oo '${out_file}'"
eval "${bsub_opts} sh cluster_runner_meddec.sh ${source_dir} ${exp_dir} ${dataset_abs_path} ${mode} ${f} ${resume}"
done
#----- consecutive folds job (all folds in one single job) -----------
else
if [ ! -z "${resume}" ]; then
echo "You need to explicitly specify folds if you would like to resume from a checkpoint. Exiting."
exit 1
fi
out_file=${exp_dir}/logs/lsf_output.out
bsub_opts="$bsub_opts -J '${dataset_name} ${exp_name} folds ${folds} ${mode}' -oo '${out_file}'"
eval "${bsub_opts} sh cluster_runner_meddec.sh ${source_dir} ${exp_dir} ${dataset_abs_path} ${mode} ${folds} ${resume}"
echo "Started in no parallel, folds:" ${folds}
fi