b/src/hyperparameter_tuning.py
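
"""
Hyperparameter tuning for the BERT / BioBERT NER models: runs a grid search over
batch size, learning rate, optimizer and number of epochs, evaluating every
configuration with 4-fold cross-validation and keeping the configuration with the
best mean F1 score across the validation folds.
"""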

from transformers import BertTokenizer, BertForSequenceClassification
from utils.BertArchitecture import BertNER, BioBertNER
from utils.training import train_loop
from utils.dataloader import Dataloader
import argparse
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from torch.optim import SGD
from torch.optim import Adam

def get_label_descriptions(transfer_learning, type):
    """
    This function returns the correct label descriptions according to the
    current model architecture in use.

    Parameters:
    transfer_learning (bool): Whether we train BioBERT for transfer learning or not.
    type (string): Human-readable annotation type, e.g. 'Medical Condition'.

    Returns:
    tuple:
        - label_to_ids (dict): A dictionary mapping labels to their respective IDs.
        - ids_to_label (dict): A dictionary mapping IDs back to their respective labels.
        - type (string): The normalized annotation type used in the label names.
    """
    if not transfer_learning:
        if type == 'Medical Condition':
            type = 'MEDCOND'
        elif type == 'Symptom':
            type = 'SYMPTOM'
        elif type == 'Medication':
            type = 'MEDICATION'
        elif type == 'Vital Statistic':
            type = 'VITALSTAT'
        elif type == 'Measurement Value':
            type = 'MEASVAL'
        elif type == 'Negation Cue':
            type = 'NEGATION'
        elif type == 'Medical Procedure':
            type = 'PROCEDURE'
        else:
            raise ValueError('Type of annotation needs to be one of the following: Medical Condition, Symptom, Medication, Vital Statistic, Measurement Value, Negation Cue, Medical Procedure')
    else:
        if type != 'Medical Condition':
            raise ValueError('Type of annotation needs to be Medical Condition when using BioBERT as baseline.')
        type = 'DISEASE'

    label_to_ids = {
        'B-' + type: 0,
        'I-' + type: 1,
        'O': 2
    }

    ids_to_label = {
        0: 'B-' + type,
        1: 'I-' + type,
        2: 'O'
    }
    return label_to_ids, ids_to_label, type
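
# For illustration: get_label_descriptions(False, 'Symptom') would return
#   ({'B-SYMPTOM': 0, 'I-SYMPTOM': 1, 'O': 2},
#    {0: 'B-SYMPTOM', 1: 'I-SYMPTOM', 2: 'O'},
#    'SYMPTOM')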

def initialize_model(transfer_learning):
    """
    Initializes the model architecture according to whether we would like to use BioBERT
    or not.

    Parameters:
    transfer_learning (bool): Whether we train BioBERT for transfer learning or not.

    Returns:
    model (BertNER | BioBertNER)
    """
    if not transfer_learning:
        model = BertNER(3)  # O, B-, I- -> 3 labels
    else:
        model = BioBertNER(3)
    return model
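
# Note: BertNER and BioBertNER come from utils.BertArchitecture; both are assumed to be
# token-classification heads over a BERT-style encoder with 3 output labels (O, B-, I-).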

def train_fold(transfer_learning, train_idx, val_idx, batch_size, learning_rate, optimizer_name, epoch, type):
    """
    Trains a single cross-validation fold during hyperparameter tuning.

    Parameters:
    transfer_learning (bool): Whether we train BioBERT for transfer learning or not.
    train_idx (array-like): Indices of the current split used for training.
    val_idx (array-like): Indices of the current split used for validation.
    batch_size (int): Batch size used for training.
    learning_rate (float): Learning rate used for the optimizer.
    optimizer_name (string): Name of the optimizer to be used (SGD or Adam).
    epoch (int): Number of epochs used for training.
    type (string): Normalized annotation type, as returned by get_label_descriptions.

    Returns:
    tuple:
        - train_res (dict): A dictionary containing the results obtained during training.
        - test_res (dict): A dictionary containing the results obtained during testing.
    """
    model = initialize_model(transfer_learning)

    if optimizer_name == 'SGD':
        optimizer = SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    else:
        optimizer = Adam(model.parameters(), lr=learning_rate)

    # Samplers restrict training and evaluation to the indices of the current fold.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)

    parameters = {
        "model": model,
        "train_dataset": data,
        "eval_dataset": data,
        "optimizer": optimizer,
        "batch_size": batch_size,
        "epochs": epoch,
        "train_sampler": train_subsampler,
        "eval_sampler": val_subsampler,
        "type": type
    }

    train_res, test_res = train_loop(**parameters, verbose=False)
    return train_res, test_res
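
# Note: train_fold reads the module-level `data` dataset loaded further below; the fold's
# train/val samplers are passed through to train_loop, which presumably uses them to
# restrict its DataLoaders to the current fold's indices.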

parser = argparse.ArgumentParser(
    description='This script is used to optimize the hyperparameters of either the pretrained BioBERT or the base BERT.')
# A boolean flag is used here because argparse's type=bool does not convert strings as expected.
parser.add_argument('-tr', '--transfer_learning', action='store_true', default=False,
                    help='Choose whether the BioBERT model should be used as baseline or not.')
parser.add_argument('-t', '--type', type=str, required=True,
                    help='Specify the type of annotation to process. Type of annotation needs to be one of the following: Medical Condition, Symptom, Medication, Vital Statistic, Measurement Value, Negation Cue, Medical Procedure')

args = parser.parse_args()

if args.type not in ['Medical Condition', 'Symptom', 'Medication', 'Vital Statistic', 'Measurement Value', 'Negation Cue', 'Medical Procedure']:
    raise ValueError('Type of annotation needs to be one of the following: Medical Condition, Symptom, Medication, Vital Statistic, Measurement Value, Negation Cue, Medical Procedure')

#-----hyperparameter grids-----#

batch_sizes = [8, 16]    #[8, 16, 32]
learning_rates = [0.1]   #[0.1, 0.01, 0.001, 0.0001]
optimizers = ['SGD']     #['SGD', 'Adam']
epochs = [1]             #[5, 10]
max_tokens = 128

label_to_ids, ids_to_label, type = get_label_descriptions(args.transfer_learning, args.type)
dataloader = Dataloader(label_to_ids, ids_to_label, args.transfer_learning, max_tokens, type)
data = dataloader.load_dataset(full=True)

best_f1_score = 0
best_param_grid = {
    "batch_size": 0,
    "learning_rate": 0,
    "epochs": 0,
    "optimizer": "",
    "max_tokens": 0
}
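
# Exhaustive grid search: every combination of batch size, learning rate, optimizer and
# epoch count is evaluated with 4-fold cross-validation, and the combination with the
# highest mean validation F1 score is kept in best_param_grid.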

for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        for optimizer_name in optimizers:
            for epoch in epochs:
                kf = KFold(n_splits=4, shuffle=True, random_state=7)
                test_f1_scores = []
                for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
                    train_res, test_res = train_fold(args.transfer_learning, train_idx, val_idx, batch_size, learning_rate, optimizer_name, epoch, type)
                    test_f1_scores.append(test_res['avg_f1_score'])
                    print(f"Finished fold {fold+1} of 4!")
                avg_f1 = sum(test_f1_scores) / len(test_f1_scores)
                if avg_f1 > best_f1_score:
                    best_f1_score = avg_f1
                    best_param_grid = {
                        "batch_size": batch_size,
                        "learning_rate": learning_rate,
                        "epochs": epoch,
                        "optimizer": optimizer_name,
                        "max_tokens": max_tokens
                    }
                    print(f"Found new best f1 score: {best_f1_score}")
                    print(best_param_grid)

print("-------FINAL RESULTS-------")
print(f"Best f1 score: {best_f1_score}")
print(best_param_grid)
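
# Example invocation (assuming the script is run from the repository root):
#   python src/hyperparameter_tuning.py --type "Medical Condition" --transfer_learning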