b/main.py
|
|
import sys

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AutoModel, BertTokenizerFast

import consts
from data import process_raw_data
from datasets import ContrastiveDataset, SimpleDataset
from projection import construct_train_matrix, extract_prototypes, project_to_dissimilarity_space
from SNN_model import BERT_Arch, SiameseNeuralNetwork
from SNN_training import train_siamese_network
from SVM_model import ensemble_of_classifiers
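# Pipeline: the raw data is split into seen and unseen categories, tokenized
# with a BERT tokenizer, and used to train a Siamese network on top of a frozen
# BERT encoder. Prototypes are then selected from the training set, every
# sample is projected into the dissimilarity space defined by those prototypes,
# and an ensemble of SVM classifiers is fit on the projected training data and
# evaluated on both seen and unseen categories.
#
# Usage: python main.py <data_path>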
|
|
|
def predict(projected_test, classifiers_list, categories_order):
    """Assign each projected test sample the category whose classifier
    returns the highest positive-class probability."""
    pred_y = []

    for classifier in classifiers_list:
        # predict_proba returns probabilities for class==0 and class==1;
        # keep only the probability of class==1
        pred_y.append(classifier.predict_proba(projected_test)[:, 1])

    pred_y = np.vstack(pred_y)  # (num_classifiers, num_samples_test)
    highest_predictions = categories_order[np.argmax(pred_y, axis=0)]
    print(pred_y)
    print(highest_predictions)

    return highest_predictions
|
|
if __name__ == '__main__':
|
|
    data_path = sys.argv[1]

    # ------------- Data --------------------------

    data, test_unseen_categories = process_raw_data(data_path)

    train_text, temp_text, train_labels, temp_labels = train_test_split(data['description'], data['labels'],
                                                                        random_state=42,
                                                                        test_size=0.3,
                                                                        stratify=data['labels'])

    val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                    random_state=42,
                                                                    test_size=0.5,
                                                                    stratify=temp_labels)

    unseen_train_text, unseen_test_text, unseen_train_labels, unseen_test_labels = train_test_split(
        test_unseen_categories['description'], test_unseen_categories['labels'],
        random_state=42,
        test_size=0.2,
        stratify=test_unseen_categories['labels'])
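    # The seen categories end up as a stratified 70/15/15 train/val/test split
    # (30% is held out, then split evenly into validation and test); the unseen
    # categories are split 80/20, and their larger portion is used below only
    # for prototype selection and classifier fitting, never for SNN training.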
|
|
    # Tokenization

    # Load the BERT tokenizer
    model_name = consts.model_name
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    # Tokenize and encode the sequences in each split
    texts = [train_text, val_text, test_text, unseen_train_text, unseen_test_text]
    tokens_texts = []

    for text in texts:
        tokens_texts.append(
            tokenizer.batch_encode_plus(text.tolist(), max_length=consts.MAX_SENTENCE_LENGTh, padding='max_length',
                                        truncation=True))

    train_tokenized, val_tokenized, test_tokenized, unseen_train_tokenized, unseen_test_tokenized = tokens_texts
|
|
    def convert_to_tensors(data, labels):
        # Wrap the tokenizer output and the labels as tensors for the Dataset classes
        seq = torch.tensor(data['input_ids'])
        mask = torch.tensor(data['attention_mask'])
        y = torch.tensor(labels.tolist())

        return seq, mask, y

    train_seq, train_mask, train_y = convert_to_tensors(train_tokenized, train_labels)
    val_seq, val_mask, val_y = convert_to_tensors(val_tokenized, val_labels)
    test_seq, test_mask, test_y = convert_to_tensors(test_tokenized, test_labels)

    unseen_train_seq, unseen_train_mask, unseen_train_y = convert_to_tensors(unseen_train_tokenized, unseen_train_labels)
    unseen_test_seq, unseen_test_mask, unseen_test_y = convert_to_tensors(unseen_test_tokenized, unseen_test_labels)
|
|
    train_set = ContrastiveDataset(train_seq, train_mask, train_y)
    val_set = ContrastiveDataset(val_seq, val_mask, val_y)
    test_set = ContrastiveDataset(test_seq, test_mask, test_y)

    train_set_simple = SimpleDataset(train_seq, train_mask, train_y)
    test_set_simple = SimpleDataset(test_seq, test_mask, test_y)
    unseen_train_set_simple = SimpleDataset(unseen_train_seq, unseen_train_mask, unseen_train_y)
    unseen_test_set_simple = SimpleDataset(unseen_test_seq, unseen_test_mask, unseen_test_y)

    trainLoader = DataLoader(train_set, batch_size=32, shuffle=True, drop_last=False, num_workers=0)
    valLoader = DataLoader(val_set, batch_size=32, shuffle=True, drop_last=False, num_workers=0)
    testLoader = DataLoader(test_set, batch_size=10, shuffle=False, drop_last=False, num_workers=0)

    trainLoader_simple = DataLoader(train_set_simple, batch_size=32, shuffle=False, drop_last=False, num_workers=0)
    testLoader_simple = DataLoader(test_set_simple, batch_size=64, shuffle=False, drop_last=False, num_workers=0)
    unseen_trainLoader_simple = DataLoader(unseen_train_set_simple, batch_size=64, shuffle=False, drop_last=False,
                                           num_workers=0)
    unseen_testLoader_simple = DataLoader(unseen_test_set_simple, batch_size=64, shuffle=False, drop_last=False,
                                          num_workers=0)
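    # The shuffled ContrastiveDataset loaders drive the Siamese training below
    # (ContrastiveDataset presumably yields the sample pairs for the contrastive
    # objective); the SimpleDataset loaders stay unshuffled so the sample order
    # remains aligned with the label arrays during prototype selection and
    # dissimilarity-space projection.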
|
|
    # -------------- Parameters --------------------
    model_name = consts.model_name

    # ------------- Train SNN --------------------------

    # Get the cpu or gpu device for training
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))

    # Load the pretrained BERT model
    bert = AutoModel.from_pretrained(
        model_name)  # e.g. 'bert-base-uncased' or 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'

    # Freeze all the BERT parameters
    for param in bert.parameters():
        param.requires_grad = False

    # Pass the pre-trained BERT to our defined architecture
    bert_arch = BERT_Arch(bert)

    SNN_model = SiameseNeuralNetwork(bert_arch).to(device)
    num_epochs = 30

    train_loss_history, val_loss_history, similarities_list = train_siamese_network(SNN_model,
                                                                                     dataloaders={"train": trainLoader,
                                                                                                  "val": valLoader},
                                                                                     num_epochs=num_epochs,
                                                                                     device=device)

    non_matching_similarity, matching_similarity, val_non_matching_similarity, val_matching_similarity = similarities_list
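    # The names suggest these are the matching / non-matching pair similarities
    # tracked for the train and validation sets; they are unpacked here for
    # inspection but not used further in this script.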
|
|
    # ----------------- Prototypes Selection ------------------------

    train_matrix = construct_train_matrix(SNN_model, trainLoader_simple)
    prototypes_list = extract_prototypes(100, trainLoader_simple, train_labels, train_matrix)

    # ---------------- Data Projection ------------------------------

    projected_train = project_to_dissimilarity_space(trainLoader_simple, SNN_model, prototypes_list)

    # ----------------- SVM Ensemble -----------------------------------

    classifiers, categories_order = ensemble_of_classifiers(projected_train, train_labels)
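    # Each projected sample is represented by its SNN dissimilarities to the
    # selected prototypes. ensemble_of_classifiers fits one binary SVM per
    # category on this representation, and predict() above picks the category
    # whose classifier returns the highest positive-class probability.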
|
|
    # ------------------ Test: Seen categories ---------------------
    projected_test = project_to_dissimilarity_space(testLoader_simple, SNN_model, prototypes_list)
    preds = predict(projected_test, classifiers, categories_order)
|
|
    # ------------------ Test: Unseen categories ---------------------
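    # The trained SNN is reused unchanged as a dissimilarity measure: handling
    # unseen categories only requires selecting new prototypes and fitting new
    # SVMs on their "train" portion, with no further network training.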
|
|
    unseen_train_matrix = construct_train_matrix(SNN_model, unseen_trainLoader_simple)
    unseen_prototypes_list = extract_prototypes(100, unseen_trainLoader_simple, unseen_train_labels,
                                                unseen_train_matrix)
    unseen_projected_train = project_to_dissimilarity_space(unseen_trainLoader_simple, SNN_model,
                                                            unseen_prototypes_list)
    unseen_classifiers, unseen_categories_order = ensemble_of_classifiers(unseen_projected_train, unseen_train_labels)
    unseen_projected_test = project_to_dissimilarity_space(unseen_testLoader_simple, SNN_model, unseen_prototypes_list)
    unseen_preds = predict(unseen_projected_test, unseen_classifiers, unseen_categories_order)