#!/bin/bash
#
# File: scripts/training_data_tradeoff.sh
#
# Runs a training-data-size sweep: for every fraction in `train_sizes` and
# every seed in `seeds`, the CRF and the BiLSTM-CRF sample-training scripts
# are invoked once each on the selected corpus.
#
# Usage: run from the repository root (the python scripts are referenced by
# relative path):
#   bash scripts/training_data_tradeoff.sh
#
# NOTE(review): a failing run does not abort the sweep (no `set -e`); the
# remaining size/seed combinations are still attempted.

CORPUS=ons # (ons|i2b2|nursing)

# Activate the conda environment that provides the project's dependencies.
source activate deidentify

# Disable MKL multithreading as it will actually slow down spaCy tokenization.
# The variable MKL reads is MKL_NUM_THREADS; a misspelled name is silently
# ignored and leaves multithreading enabled.
export MKL_NUM_THREADS=1
# Specify GPU to run on
export CUDA_VISIBLE_DEVICES=0

# Fraction of training data to use
train_sizes=(0.1 0.25 0.4 0.55 0.7 0.85 1)
# Random seeds for sampling the training data. The number of seeds corresponds
# to the number of repetitions for each training size.
seeds=(42 43 44)

for size in "${train_sizes[@]}"; do
    for seed in "${seeds[@]}"; do
        echo "========= size: $size - seed: $seed ========="

        # CRF baseline. `subset_training` and `liu_2015` are positional
        # arguments of the training script (presumably run id and feature
        # set -- verify against run_crf_training_sample.py).
        python deidentify/methods/crf/run_crf_training_sample.py "$CORPUS" subset_training liu_2015 --train_sample_frac="$size" --random_seed="$seed"

        # BiLSTM-CRF model, trained on the same fraction with the same seed.
        python deidentify/methods/bilstmcrf/run_bilstmcrf_training_sample.py "$CORPUS" subset_training --train_sample_frac="$size" --random_seed="$seed"
    done
done