
src/hyperparameter_tuning.py
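"""
Hyperparameter tuning for the BERT/BioBERT NER models: runs a grid search over
batch size, learning rate, optimizer and number of epochs, scoring each
combination with 4-fold cross-validation on the selected annotation type.
"""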
from transformers import BertTokenizer, BertForSequenceClassification
from utils.BertArchitecture import BertNER, BioBertNER
from utils.training import train_loop
from utils.dataloader import Dataloader
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from torch.optim import SGD
from torch.optim import Adam
import argparse

def get_label_descriptions(transfer_learning, type):
    """
    Returns the correct label descriptions for the model architecture
    currently in use.

    Parameters:
    transfer_learning (bool): Whether we train BioBERT for transfer learning or not.
    type (str): The annotation type to process (e.g. 'Medical Condition').

    Returns:
    tuple:
        - label_to_ids (dict): A dictionary mapping labels to their respective IDs.
        - ids_to_label (dict): A dictionary mapping IDs back to their respective labels.
        - type (str): The abbreviated entity tag used in the BIO labels.
    """
    if not transfer_learning:
        if type == 'Medical Condition':
            type = 'MEDCOND'
        elif type == 'Symptom':
            type = 'SYMPTOM'
        elif type == 'Medication':
            type = 'MEDICATION'
        elif type == 'Vital Statistic':
            type = 'VITALSTAT'
        elif type == 'Measurement Value':
            type = 'MEASVAL'
        elif type == 'Negation Cue':
            type = 'NEGATION'
        elif type == 'Medical Procedure':
            type = 'PROCEDURE'
        else:
            raise ValueError('Type of annotation needs to be one of the following: Medical Condition, Symptom, Medication, Vital Statistic, Measurement Value, Negation Cue, Medical Procedure')
    else:
        if not type == 'Medical Condition':
            raise ValueError('Type of annotation needs to be Medical Condition when using BioBERT as baseline.')
        type = 'DISEASE'

    label_to_ids = {
        'B-' + type: 0,
        'I-' + type: 1,
        'O': 2
    }

    ids_to_label = {
        0: 'B-' + type,
        1: 'I-' + type,
        2: 'O'
    }
    return label_to_ids, ids_to_label, type

def initialize_model(transfer_learning):
    """
    Initializes the model architecture according to whether we would like to use BioBERT
    or not.

    Parameters:
    transfer_learning (bool): Whether we train BioBERT for transfer learning or not.

    Returns:
    model (BertNER | BioBertNER): The freshly initialized model.
    """
    if not transfer_learning:
        model = BertNER(3)  # O, B-, I- -> 3 label classes
    else:
        model = BioBertNER(3)
    return model

def train_fold(transfer_learning, train_idx, val_idx, batch_size, learning_rate, optimizer_name, epoch, type):
    """
    Trains a single fold during hyperparameter tuning.

    Parameters:
    transfer_learning (bool): Whether we train BioBERT for transfer learning or not.
    train_idx (array-like): Indices of the samples used for training in the current split.
    val_idx (array-like): Indices of the samples used for validation in the current split.
    batch_size (int): Batch size used for training.
    learning_rate (float): Learning rate used for the optimizer.
    optimizer_name (str): Name of the optimizer to be used (SGD or Adam).
    epoch (int): Number of epochs used for training.
    type (str): The abbreviated entity tag returned by get_label_descriptions.

    Returns:
    tuple:
        - train_res (dict): A dictionary containing the results obtained during training.
        - test_res (dict): A dictionary containing the results obtained during testing.
    """
    model = initialize_model(transfer_learning)

    if optimizer_name == 'SGD':
        optimizer = SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    else:
        optimizer = Adam(model.parameters(), lr=learning_rate)

    # The samplers restrict the module-level dataset `data` to the current fold's indices.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)

    parameters = {
        "model": model,
        "train_dataset": data,
        "eval_dataset": data,
        "optimizer": optimizer,
        "batch_size": batch_size,
        "epochs": epoch,
        "train_sampler": train_subsampler,
        "eval_sampler": val_subsampler,
        "type": type
    }

    train_res, test_res = train_loop(**parameters, verbose=False)
    return train_res, test_res

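#-----command line arguments-----#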
parser = argparse.ArgumentParser(
        description='This script is used to optimize the hyperparameters of either the pretrained BioBERT or the base BERT.')
# argparse's type=bool would treat any non-empty string (including "False") as True,
# so the flag is exposed via store_true instead.
parser.add_argument('-tr', '--transfer_learning', action='store_true', default=False,
                    help='Choose whether the BioBERT model should be used as baseline or not.')
parser.add_argument('-t', '--type', type=str, required=True,
                    help='Specify the type of annotation to process. Type of annotation needs to be one of the following: Medical Condition, Symptom, Medication, Vital Statistic, Measurement Value, Negation Cue, Medical Procedure')

args = parser.parse_args()

if args.type not in ['Medical Condition', 'Symptom', 'Medication', 'Vital Statistic', 'Measurement Value', 'Negation Cue', 'Medical Procedure']:
    raise ValueError('Type of annotation needs to be one of the following: Medical Condition, Symptom, Medication, Vital Statistic, Measurement Value, Negation Cue, Medical Procedure')

#-----hyperparameter grids-----#

# Reduced grids for quick runs; the commented-out values span the full search space.
batch_sizes = [8, 16]  #[8,16,32]
learning_rates = [0.1] #[0.1, 0.01, 0.001, 0.0001]
optimizers = ['SGD']  #['SGD', 'Adam']
epochs = [1] #[5, 10]
max_tokens = 128

label_to_ids, ids_to_label, type = get_label_descriptions(args.transfer_learning, args.type)
dataloader = Dataloader(label_to_ids, ids_to_label, args.transfer_learning, max_tokens, type)
data = dataloader.load_dataset(full=True)

best_f1_score = 0
best_param_grid = {
    "batch_size": 0,
    "learning_rate": 0,
    "epochs": 0,
    "optimizer": "",
    "max_tokens": 0
}

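# Exhaustive grid search: every combination of the grids above is evaluated with
# 4-fold cross-validation, and the combination with the highest mean F1 score on
# the validation folds is kept in best_param_grid.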
for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        for optimizer_name in optimizers:
            for epoch in epochs:
                kf = KFold(n_splits=4, shuffle=True, random_state=7)
                test_f1_scores = []
                for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
                    train_res, test_res = train_fold(args.transfer_learning, train_idx, val_idx, batch_size, learning_rate, optimizer_name, epoch, type)
                    test_f1_scores.append(test_res['avg_f1_score'])
                    print(f"Finished fold {fold+1} of 4!")
                local_best_f1 = sum(test_f1_scores) / len(test_f1_scores)  # mean F1 across the folds
                if local_best_f1 > best_f1_score:
                    best_f1_score = local_best_f1
                    best_param_grid = {
                        "batch_size": batch_size,
                        "learning_rate": learning_rate,
                        "epochs": epoch,
                        "optimizer": optimizer_name,
                        "max_tokens": max_tokens
                    }
                    print(f"Found new best f1 score: {best_f1_score}")
                    print(best_param_grid)

print("-------FINAL RESULTS-------")
print(f"Best f1 score: {best_f1_score}")
print(best_param_grid)