Diff of /main.py [000000] .. [87f2bb]

Switch to unified view

a b/main.py
1
import sys
2
3
import torch
4
import numpy as np
5
6
import consts
7
from datasets import ContrastiveDataset, SimpleDataset
8
from torch.utils.data import DataLoader
9
from transformers import AutoModel
10
11
from data import process_raw_data
12
from SNN_model import BERT_Arch, SiameseNeuralNetwork
13
from SNN_training import train_siamese_network
14
from projection import construct_train_matrix, extract_prototypes, project_to_dissimilarity_space
15
from SVM_model import ensemble_of_classifiers
16
from sklearn.model_selection import train_test_split
17
18
from transformers import BertTokenizerFast
19
20
21
def predict(projected_test, classifiers_list, categories_order):
    """Return, for each test sample, the category of the most confident classifier.

    Every classifier is asked for ``predict_proba(projected_test)`` and only
    the second column (the probability of class == 1) is kept. For each
    sample the classifier with the highest such probability wins, and the
    entry of ``categories_order`` at that classifier's position is returned.

    Args:
        projected_test: Sample matrix handed unchanged to each classifier's
            ``predict_proba``.
        classifiers_list: Non-empty sequence of fitted binary classifiers
            exposing ``predict_proba``. Position ``i`` is paired with
            ``categories_order[i]``.
        categories_order: Category labels, positionally aligned with
            ``classifiers_list``. Any array-like is accepted (list, tuple,
            NumPy array); it is converted with ``np.asarray`` so that the
            lookup below is always positional.

    Returns:
        A NumPy array holding one category label per test sample.

    Raises:
        ValueError: If ``classifiers_list`` is empty.
    """
    if len(classifiers_list) == 0:
        raise ValueError("classifiers_list must contain at least one classifier")

    # predict_proba returns the probability for class == 0 and for
    # class == 1, so we keep only the class == 1 column of each classifier.
    pred_y = np.vstack([
        classifier.predict_proba(projected_test)[:, 1]
        for classifier in classifiers_list
    ])  # (num_classifiers, num_samples_test)

    # A plain list cannot be indexed with an index array, so normalise first.
    categories = np.asarray(categories_order)
    highest_predictions = categories[np.argmax(pred_y, axis=0)]

    # Kept on purpose: the calling script does not print the result itself.
    print(pred_y)
    print(highest_predictions)

    return highest_predictions
34
35
36
37
if __name__ == '__main__':
    # Script entry point. Pipeline, in order:
    #   1. load and split the data,
    #   2. tokenize every split and wrap it in datasets / loaders,
    #   3. train the Siamese network on top of a frozen pretrained encoder,
    #   4. select prototypes and project the data with the trained network,
    #   5. fit the classifier ensemble and predict for seen and unseen categories.

    if len(sys.argv) < 2:
        # Fail with a usage hint instead of a bare IndexError.
        sys.exit("Usage: python main.py <data_path>")
    data_path = sys.argv[1]

    # ------------- Data --------------------------

    data, test_unseen_categories = process_raw_data(data_path)

    # 70% train / 30% held out, stratified by label.
    train_text, temp_text, train_labels, temp_labels = train_test_split(
        data['description'], data['labels'],
        random_state=42,
        test_size=0.3,
        stratify=data['labels'])

    # The held-out 30% is halved into validation and test (15% / 15% overall).
    val_text, test_text, val_labels, test_labels = train_test_split(
        temp_text, temp_labels,
        random_state=42,
        test_size=0.5,
        stratify=temp_labels)

    # Unseen categories get their own 80% / 20% stratified split.
    unseen_train_text, unseen_test_text, unseen_train_labels, unseen_test_labels = train_test_split(
        test_unseen_categories['description'], test_unseen_categories['labels'],
        random_state=42,
        test_size=0.2,
        stratify=test_unseen_categories['labels'])

    # ------------- Tokenization ------------------

    # Load the tokenizer matching the pretrained model named in consts.
    model_name = consts.model_name
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    # Tokenize and encode every split; each one is padded / truncated to the
    # same fixed length taken from consts.
    texts = [train_text, val_text, test_text, unseen_train_text, unseen_test_text]
    tokens_texts = [
        tokenizer.batch_encode_plus(text.tolist(),
                                    max_length=consts.MAX_SENTENCE_LENGTh,
                                    padding='max_length',
                                    truncation=True)
        for text in texts
    ]

    train_tokinized, val_tokinized, test_tokinized, unseen_train_tokinized, unseen_test_tokinized = tokens_texts

    def convert_to_tensors(encoded, labels):
        """Turn one tokenized split and its labels into (seq, mask, y) tensors."""
        seq = torch.tensor(encoded['input_ids'])
        mask = torch.tensor(encoded['attention_mask'])
        y = torch.tensor(labels.tolist())

        return seq, mask, y

    train_seq, train_mask, train_y = convert_to_tensors(train_tokinized, train_labels)
    val_seq, val_mask, val_y = convert_to_tensors(val_tokinized, val_labels)
    test_seq, test_mask, test_y = convert_to_tensors(test_tokinized, test_labels)

    unseen_train_seq, unseen_train_mask, unseen_train_y = convert_to_tensors(unseen_train_tokinized, unseen_train_labels)
    unseen_test_seq, unseen_test_mask, unseen_test_y = convert_to_tensors(unseen_test_tokinized, unseen_test_labels)

    # ------------- Datasets and loaders ----------

    # ContrastiveDataset feeds the Siamese training; SimpleDataset feeds the
    # prototype selection / projection steps further down.
    train_set = ContrastiveDataset(train_seq, train_mask, train_y)
    val_set = ContrastiveDataset(val_seq, val_mask, val_y)
    # NOTE(review): test_set / testLoader are built but not used below; kept
    # in case ContrastiveDataset construction has side effects - confirm.
    test_set = ContrastiveDataset(test_seq, test_mask, test_y)

    train_set_simple = SimpleDataset(train_seq, train_mask, train_y)
    test_set_simple = SimpleDataset(test_seq, test_mask, test_y)
    unseen_train_set_simple = SimpleDataset(unseen_train_seq, unseen_train_mask, unseen_train_y)
    unseen_test_set_simple = SimpleDataset(unseen_test_seq, unseen_test_mask, unseen_test_y)

    trainLoader = DataLoader(train_set, batch_size=32, shuffle=True, drop_last=False, num_workers=0)
    valLoader = DataLoader(val_set, batch_size=32, shuffle=True, drop_last=False, num_workers=0)
    testLoader = DataLoader(test_set, batch_size=10, shuffle=False, drop_last=False, num_workers=0)

    # The "simple" loaders are never shuffled, so batches come out in the
    # same order as the underlying split.
    trainLoader_simple = DataLoader(train_set_simple, batch_size=32, shuffle=False, drop_last=False, num_workers=0)
    testLoader_simple = DataLoader(test_set_simple, batch_size=64, shuffle=False, drop_last=False, num_workers=0)
    unseen_trainLoader_simple = DataLoader(unseen_train_set_simple, batch_size=64, shuffle=False, drop_last=False,
                                           num_workers=0)
    unseen_testLoader_simple = DataLoader(unseen_test_set_simple, batch_size=64, shuffle=False, drop_last=False,
                                          num_workers=0)

    # ------------- Train SNN --------------------------

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")

    # Load the pretrained encoder named in consts, e.g. 'bert-base-uncased' or
    # 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'.
    bert = AutoModel.from_pretrained(model_name)

    # Freeze all the pretrained parameters.
    for param in bert.parameters():
        param.requires_grad = False

    # Pass the pretrained encoder to our own architecture.
    bert_arch = BERT_Arch(bert)

    SNN_model = SiameseNeuralNetwork(bert_arch).to(device)
    num_epochs = 30

    train_loss_history, val_loss_history, similarities_list = train_siamese_network(
        SNN_model,
        dataloaders={"train": trainLoader, "val": valLoader},
        num_epochs=num_epochs,
        device=device)

    # Not used further down; the unpacking still checks that four entries
    # were returned.
    non_matching_similarity, matching_similarity, val_non_matching_similarity, val_matching_similarity = similarities_list

    # ----------------- Prototypes Selection ------------------------

    # Presumably the number of prototypes to extract - confirm against
    # projection.extract_prototypes. Shared by the seen and unseen runs.
    prototypes_arg = 100

    train_matrix = construct_train_matrix(SNN_model, trainLoader_simple)
    prototypes_list = extract_prototypes(prototypes_arg, trainLoader_simple, train_labels, train_matrix)

    # ---------------- Data Projection ------------------------------

    projected_train = project_to_dissimilarity_space(trainLoader_simple, SNN_model, prototypes_list)

    # ----------------- SVM Ensemble -----------------------------------

    classifiers, categories_order = ensemble_of_classifiers(projected_train, train_labels)

    # ------------------ Test: Seen categories ---------------------
    projected_test = project_to_dissimilarity_space(testLoader_simple, SNN_model, prototypes_list)
    preds = predict(projected_test, classifiers, categories_order)

    # ------------------ Test: Unseen categories ---------------------
    # Same steps as above, reusing the already-trained SNN_model but with
    # prototypes and classifiers built from the unseen-category train split.
    unseen_train_matrix = construct_train_matrix(SNN_model, unseen_trainLoader_simple)
    unseen_prototypes_list = extract_prototypes(prototypes_arg, unseen_trainLoader_simple, unseen_train_labels,
                                                unseen_train_matrix)
    unseen_projected_train = project_to_dissimilarity_space(unseen_trainLoader_simple, SNN_model,
                                                            unseen_prototypes_list)
    unseen_classifiers, unseen_categories_order = ensemble_of_classifiers(unseen_projected_train, unseen_train_labels)
    unseen_projected_test = project_to_dissimilarity_space(unseen_testLoader_simple, SNN_model, unseen_prototypes_list)
    unseen_preds = predict(unseen_projected_test, unseen_classifiers, unseen_categories_order)
178
179
180