import pickle

import torch
from simpletransformers.classification import (MultiLabelClassificationArgs,
                                               MultiLabelClassificationModel)
from sklearn.metrics import accuracy_score, hamming_loss, classification_report, roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np

# Load the training split and build one free-text field from the three report
# sections; keep only the text and the multi-label target column.
train_df = pd.read_csv('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/train_data_for_model2.csv')
train_df['Combined Diagnosis'] = (train_df['Diagnosis']
                                  + train_df['Gross Description']
                                  + train_df['Microscopic Description'])
train_df = train_df[['Combined Diagnosis', 'Primary Site of Cancer']]
|
|
# Load the held-out test split the same way as the training split: concatenate
# the three report sections into one text field, keep text + target only.
eval_df = pd.read_csv('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/test_data_for_model2.csv')
eval_df['Combined Diagnosis'] = (eval_df['Diagnosis']
                                 + eval_df['Gross Description']
                                 + eval_df['Microscopic Description'])
eval_df = eval_df[['Combined Diagnosis', 'Primary Site of Cancer']]
|
# Accumulator for per-run summary metrics; one row is appended per epoch
# setting at the bottom of the training loop.
metric_columns = ['Model Type', 'Model Name', 'Epoch', 'Overall Accuracy',
                  'Overall AUROC Score', 'Hamming Loss', 'Eval Loss']
eval_metrics_df = pd.DataFrame(columns=metric_columns)
# Multi-hot Encoding for Train Data

# Drop NA rows and renumber so positional and label indexing agree below.
train_df = train_df.dropna(subset=['Primary Site of Cancer']).reset_index(drop=True)

# Normalise the target: a report may list several comma-separated primary
# sites; turn each cell into a set of stripped, upper-cased site names.
# (Vectorized apply replaces the original per-row iterrows loop.)
train_df['Primary Site of Cancer'] = (
    train_df['Primary Site of Cancer']
    .astype('str')
    .apply(lambda val: {s.strip().upper() for s in val.split(",")})
)

# Fit the binarizer ONCE on the training targets and reuse the resulting
# encoding (the original called fit() and then fit_transform(), fitting twice).
# `cols` is the fixed label vocabulary reused for the eval split and the model.
mlb = MultiLabelBinarizer()
encoded = mlb.fit_transform(train_df['Primary Site of Cancer'])
cols = ["%s" % c for c in mlb.classes_]

# Merge original text with multi-hot encodings.
df = pd.DataFrame(encoded, columns=mlb.classes_)
train_df = pd.concat([train_df[['Combined Diagnosis']], df], axis=1)

print(cols)

# Collapse the per-label columns into a single tuple-valued `labels` column —
# the format simpletransformers expects for multi-label training.
count = len(cols)
train_df['labels'] = df.apply(tuple, axis=1)
train_df = train_df[['Combined Diagnosis', 'labels']]
# Multi-hot Encoding for Test Data

# Drop NA rows. BUG FIX: the index must be renumbered here as it is for the
# train split — the original kept the old labels, so the iterrows label `i`
# was then used positionally (.iloc[i]) and the later axis-1 concat with the
# freshly 0-indexed encoding frame misaligned rows whenever a row was dropped.
eval_df = eval_df.dropna(subset=['Primary Site of Cancer']).reset_index(drop=True)

# Same target normalisation as the training split: comma-separated sites
# become a set of stripped, upper-cased site names.
eval_df['Primary Site of Cancer'] = (
    eval_df['Primary Site of Cancer']
    .astype('str')
    .apply(lambda val: {s.strip().upper() for s in val.split(",")})
)

# Encode with the binarizer fitted on the TRAIN labels (transform only, so the
# label vocabulary stays identical across splits). Kept as its own frame
# because the per-label accuracy step inside the training loop reuses it.
eval_df_individual_labels = pd.DataFrame(
    mlb.transform(eval_df['Primary Site of Cancer']), columns=cols)

# Merge original text with multi-hot encodings, then collapse the label
# columns into the tuple-valued `labels` column simpletransformers expects.
eval_df = pd.concat([eval_df[['Combined Diagnosis']], eval_df_individual_labels], axis=1)
eval_df['labels'] = eval_df_individual_labels.apply(tuple, axis=1)
eval_df = eval_df[['Combined Diagnosis', 'labels']]
# Train and evaluate one model per epoch setting (currently just 2 epochs).
for n in [2]:

    curr_epoch = "Epoch" + str(n)

    # Configure model args
    model_args = MultiLabelClassificationArgs(num_train_epochs=n)
    model_args.evaluate_during_training_steps = -1
    model_args.save_eval_checkpoints = False
    model_args.save_model_every_epoch = False
    model_args.learning_rate = 1e-5
    model_args.manual_seed = 4
    model_args.multiprocessing_chunksize = 5000
    model_args.no_cache = True
    model_args.reprocess_input_data = True
    model_args.train_batch_size = 16
    model_args.gradient_accumulation_steps = 2
    model_args.use_multiprocessing = True
    model_args.overwrite_output_dir = True
    model_args.labels_list = cols

    # model
    model_type = "roberta"
    model_name = "roberta-large"

    # Create Transformer Model (CPU only; `count` = number of labels).
    model = MultiLabelClassificationModel(model_type, model_name, num_labels=count,
                                          use_cuda=False, args=model_args)

    if __name__ == '__main__':

        # Train the model and keep a pickled copy for later reuse.
        # BUG FIX: use a context manager so the file handle is closed.
        model.train_model(train_df)
        with open('model2.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)

        prediction_df = eval_df['Combined Diagnosis'].values.tolist()

        # Predict output
        prediction, outputs = model.predict(prediction_df)
        outputs_df = pd.DataFrame(outputs, columns=cols)
        prediction_df = pd.DataFrame(prediction, columns=cols)

        # Save raw outputs to csv file
        filename_prefix = "/Users/aakansha/Desktop/Model2/" + model_name + "_outputs_df"
        filename = "%s.csv" % filename_prefix
        outputs_df.to_csv(filename)

        # Save true and predicted labels to csv file
        combined_cols_df = pd.concat([eval_df, prediction_df], axis=1)
        filename_prefix = "/Users/aakansha/Desktop/Model2/" + model_name + "_combined_cols_df"
        filename = "%s.csv" % filename_prefix
        combined_cols_df.to_csv(filename)

        # Calculate individual label accuracies. Selecting the (duplicated)
        # sorted column names pulls the true and predicted column for each
        # label side by side, so columns come in true/pred pairs.
        label_accuracy_df = pd.concat([eval_df_individual_labels, prediction_df], axis=1)
        new_acc_cols_order = np.unique(
            np.array(list(zip(eval_df_individual_labels.columns, prediction_df.columns))).flatten())
        label_accuracy_df = label_accuracy_df[new_acc_cols_order]

        # BUG FIX: the original reassigned `count` here (to 2 * number of
        # labels), which would mis-size the model on any later iteration.
        paired_count = len(label_accuracy_df.columns)
        i = 0
        colnames = []
        accuracies = []
        auroc = []

        while i < paired_count:
            actualValue = label_accuracy_df.iloc[:, i].values
            predictedValue = label_accuracy_df.iloc[:, i + 1].values
            acc = accuracy_score(actualValue, predictedValue)
            # roc_auc_score raises ValueError when the true column contains a
            # single class; temporary fix until a more balanced dataset exists.
            # (BUG FIX: catch ValueError specifically, not a bare except.)
            try:
                auroc_score = roc_auc_score(actualValue, predictedValue)
            except ValueError:
                auroc_score = 0
            colnames.append(label_accuracy_df.columns[i])
            accuracies.append(acc)
            auroc.append(auroc_score)
            i += 2

        accuracy_auroc_df = pd.DataFrame(list(zip(colnames, accuracies, auroc)),
                                         columns=['Site', 'Accuracy', 'AUROC Score'])

        # Evaluate model
        result, model_outputs, wrong_predictions = model.eval_model(eval_df)

        # Stack the tuple-valued labels into a (rows, labels) matrix for the
        # sklearn metrics below.
        eval_df_multi_hot_encodings = [np.asarray(val) for val in eval_df['labels']]
        eval_df_true = np.array(eval_df_multi_hot_encodings)
        prediction_data = np.array(prediction)

        # Calculate overall metrics
        overall_acc = accuracy_score(eval_df_true, prediction_data)
        # Same single-class ValueError guard as above.
        try:
            overall_auroc = roc_auc_score(eval_df_true, prediction_data)
        except ValueError:
            overall_auroc = 0
        # BUG FIX: the original assigned to the name `hamming_loss`, shadowing
        # the imported function and crashing any later loop iteration.
        hamming = hamming_loss(eval_df_true, prediction_data)

        target_names = cols
        other_metrics_report = classification_report(eval_df_true, prediction_data,
                                                     target_names=target_names,
                                                     output_dict=True)
        classification_report_df = pd.DataFrame(other_metrics_report).transpose()

        # Combine individual accuracies with the classification report.
        classification_report_df = classification_report_df.reset_index()
        accuracy_auroc_df = accuracy_auroc_df.reset_index()
        classification_report_df_final = pd.concat([classification_report_df,
                                                    accuracy_auroc_df['Accuracy'],
                                                    accuracy_auroc_df['AUROC Score']], axis=1)

        # Save classification report to csv file
        filename_prefix = "/Users/aakansha/Desktop/Model2/" + model_name + "_classification_metrics_df"
        filename = "%s.csv" % filename_prefix
        classification_report_df_final.to_csv(filename)

        # Append this run's summary metrics and save to csv file
        metrics_data = [model_type, model_name, curr_epoch, overall_acc, overall_auroc,
                        hamming, result['eval_loss']]
        eval_metrics_df.loc[len(eval_metrics_df)] = metrics_data
        filename_prefix = "/Users/aakansha/Desktop/Model2/" + model_name + "_eval_metrics_df"
        filename = "%s.csv" % filename_prefix
        eval_metrics_df.to_csv(filename)