# model2.py
1
import pickle
2
3
import torch
4
from simpletransformers.classification import (MultiLabelClassificationArgs, MultiLabelClassificationModel)
5
from sklearn.metrics import accuracy_score, hamming_loss, classification_report, roc_auc_score
6
from sklearn.preprocessing import MultiLabelBinarizer
7
import pandas as pd
8
import numpy as np
9
10
# Load the training split and build one free-text field per report by
# joining its three narrative sections.
train_df = pd.read_csv('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/train_data_for_model2.csv')
train_df['Combined Diagnosis'] = (train_df['Diagnosis']
                                  + train_df['Gross Description']
                                  + train_df['Microscopic Description'])
train_df = train_df[['Combined Diagnosis', 'Primary Site of Cancer']]

# Identical preparation for the held-out evaluation split.
eval_df = pd.read_csv('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/test_data_for_model2.csv')
eval_df['Combined Diagnosis'] = (eval_df['Diagnosis']
                                 + eval_df['Gross Description']
                                 + eval_df['Microscopic Description'])
eval_df = eval_df[['Combined Diagnosis', 'Primary Site of Cancer']]

# Accumulates one summary-metrics row per trained configuration.
eval_metrics_df = pd.DataFrame(columns=['Model Type', 'Model Name', 'Epoch',
                                        'Overall Accuracy', 'Overall AUROC Score',
                                        'Hamming Loss', 'Eval Loss'])
22
# ---- Multi-hot encoding for the training data ----

# Drop rows with no target label and re-index so positional access and the
# RangeIndex of the transformed label frame below stay aligned.
train_df = train_df.dropna(subset=['Primary Site of Cancer']).reset_index(drop=True)

# Normalise each target cell into a set of upper-cased site names.
# A cell may list several comma-separated primary sites.
train_df['Primary Site of Cancer'] = (
    train_df['Primary Site of Cancer']
    .astype('str')
    .apply(lambda cell: {part.strip().upper() for part in cell.split(",")})
)

# Fit the binarizer ONCE on the training label sets (the original called
# fit() and then fit_transform(), refitting on the same data for no gain).
# `cols` is the fixed label vocabulary, reused for the eval split and the
# model configuration.
mlb = MultiLabelBinarizer()
encodings = mlb.fit_transform(train_df['Primary Site of Cancer'])
cols = [str(c) for c in mlb.classes_]

# One 0/1 column per site, row-aligned with the report text.
df = pd.DataFrame(encodings, columns=mlb.classes_)

# Merge original text with multi-hot encodings.
train_df = pd.concat([train_df[['Combined Diagnosis']], df], axis=1)

print(cols)

# simpletransformers expects the multi-label target as a single 'labels'
# column holding one tuple of 0/1 values per row.
count = len(cols)
train_df['labels'] = train_df[cols].apply(tuple, axis=1)

train_df = train_df[['Combined Diagnosis', 'labels']]
64
65
# ---- Multi-hot encoding for the evaluation data ----

# Drop rows with no target label. reset_index(drop=True) is required here
# (the original omitted it): the loop below and the pd.concat with the
# RangeIndexed label frame both assume positions match labels, so dropping
# any NA row without re-indexing silently misaligned text and labels.
eval_df = eval_df.dropna(subset=['Primary Site of Cancer']).reset_index(drop=True)

# Normalise each target cell into a set of upper-cased site names,
# mirroring the training-data preprocessing exactly.
eval_df['Primary Site of Cancer'] = (
    eval_df['Primary Site of Cancer']
    .astype('str')
    .apply(lambda cell: {part.strip().upper() for part in cell.split(",")})
)

# Reuse the binarizer fitted on the training labels (transform only, no
# refit) so both splits share the same label vocabulary and column order.
eval_df_individual_labels = pd.DataFrame(
    mlb.transform(eval_df['Primary Site of Cancer']), columns=cols)

# Merge original text with multi-hot encodings.
eval_df = pd.concat([eval_df[['Combined Diagnosis']], eval_df_individual_labels], axis=1)

# Collapse the per-site 0/1 columns into the single tuple-valued 'labels'
# column expected by simpletransformers.
eval_df['labels'] = eval_df[cols].apply(tuple, axis=1)

eval_df = eval_df[['Combined Diagnosis', 'labels']]
101
102
# Train and evaluate one model per epoch setting. The __main__ guard sits
# inside the loop so the heavy work is skipped in child processes spawned
# by the simpletransformers multiprocessing workers (macOS spawn start
# method re-imports this module).
for n in [2]:

    curr_epoch = "Epoch" + str(n)

    # Configure model args
    model_args = MultiLabelClassificationArgs(num_train_epochs=n)
    model_args.evaluate_during_training_steps = -1
    model_args.save_eval_checkpoints = False
    model_args.save_model_every_epoch = False
    model_args.learning_rate = 1e-5
    model_args.manual_seed = 4
    model_args.multiprocessing_chunksize = 5000
    model_args.no_cache = True
    model_args.reprocess_input_data = True
    model_args.train_batch_size = 16
    model_args.gradient_accumulation_steps = 2
    model_args.use_multiprocessing = True
    model_args.overwrite_output_dir = True
    model_args.labels_list = cols

    # Pretrained transformer to fine-tune.
    model_type = "roberta"
    model_name = "roberta-large"

    # Create the multi-label classification model (CPU only).
    # `count` is the number of sites computed during training preprocessing.
    model = MultiLabelClassificationModel(model_type, model_name, num_labels=count, use_cuda=False, args=model_args)

    if __name__ == '__main__':

        # Train the model
        model.train_model(train_df)
        # Use a context manager so the file handle is closed (the original
        # leaked it). NOTE(review): pickled models are unsafe to load from
        # untrusted sources; model.save_model() is the safer exchange format.
        with open('model2.pkl', 'wb') as pkl_file:
            pickle.dump(model, pkl_file)

        texts_to_predict = eval_df['Combined Diagnosis'].values.tolist()

        # Predict: `prediction` is the 0/1 label matrix, `outputs` the raw
        # per-label scores.
        prediction, outputs = model.predict(texts_to_predict)
        outputs_df = pd.DataFrame(outputs, columns=cols)
        prediction_df = pd.DataFrame(prediction, columns=cols)

        # Save raw outputs to csv file
        filename = "/Users/aakansha/Desktop/Model2/" + model_name + "_outputs_df.csv"
        outputs_df.to_csv(filename)

        # Save true and predicted labels to csv file
        combined_cols_df = pd.concat([eval_df, prediction_df], axis=1)
        filename = "/Users/aakansha/Desktop/Model2/" + model_name + "_combined_cols_df.csv"
        combined_cols_df.to_csv(filename)

        # Calculate individual label accuracies. Truth and prediction frames
        # share column names, so re-selecting the concatenated frame by the
        # sorted unique names interleaves each site's actual/predicted
        # columns as adjacent pairs.
        label_accuracy_df = pd.concat([eval_df_individual_labels, prediction_df], axis=1)
        new_acc_cols_order = np.unique(
            np.array(list(zip(eval_df_individual_labels.columns, prediction_df.columns))).flatten())
        label_accuracy_df = label_accuracy_df[new_acc_cols_order]

        # Use a distinct name instead of clobbering `count` (the original
        # reassigned it, so a second loop iteration would have built its
        # model with num_labels = 2 * the real label count).
        paired_col_count = len(label_accuracy_df.columns)
        colnames = []
        accuracies = []
        auroc = []

        for i in range(0, paired_col_count, 2):
            actualValue = label_accuracy_df.iloc[:, i].values
            predictedValue = label_accuracy_df.iloc[:, i + 1].values
            acc = accuracy_score(actualValue, predictedValue)
            # roc_auc_score raises ValueError when only one class is present
            # for a label in the eval split; report 0 until the dataset is
            # balanced enough for this not to occur.
            try:
                auroc_score = roc_auc_score(actualValue, predictedValue)
            except ValueError:
                auroc_score = 0
            colnames.append(label_accuracy_df.columns[i])
            accuracies.append(acc)
            auroc.append(auroc_score)

        accuracy_auroc_df = pd.DataFrame(list(zip(colnames, accuracies, auroc)),
                                         columns=['Site', 'Accuracy', 'AUROC Score'])

        # Evaluate model
        result, model_outputs, wrong_predictions = model.eval_model(eval_df)

        # Stack the tuple-valued 'labels' column back into a 2-D 0/1 array.
        eval_df_true = np.array(eval_df['labels'].tolist())
        prediction_data = np.array(prediction)

        # Calculate overall metrics
        overall_acc = accuracy_score(eval_df_true, prediction_data)
        # Same single-class caveat as above; catch only ValueError rather
        # than the original bare except.
        try:
            overall_auroc = roc_auc_score(eval_df_true, prediction_data)
        except ValueError:
            overall_auroc = 0
        # Distinct name so the imported hamming_loss function is not
        # shadowed (the original assignment broke any later call to it).
        overall_hamming = hamming_loss(eval_df_true, prediction_data)

        other_metrics_report = classification_report(eval_df_true, prediction_data,
                                                     target_names=cols, output_dict=True)
        classification_report_df = pd.DataFrame(other_metrics_report).transpose()

        # Combine individual accuracies with the classification report.
        classification_report_df = classification_report_df.reset_index()
        accuracy_auroc_df = accuracy_auroc_df.reset_index()
        classification_report_df_final = pd.concat([classification_report_df,
                                                    accuracy_auroc_df['Accuracy'],
                                                    accuracy_auroc_df['AUROC Score']], axis=1)

        # Save classification report to csv file
        filename = "/Users/aakansha/Desktop/Model2/" + model_name + "_classification_metrics_df.csv"
        classification_report_df_final.to_csv(filename)

        # Append this run's summary row and persist the metrics table.
        metrics_data = [model_type, model_name, curr_epoch, overall_acc, overall_auroc,
                        overall_hamming, result['eval_loss']]
        eval_metrics_df.loc[len(eval_metrics_df)] = metrics_data
        filename = "/Users/aakansha/Desktop/Model2/" + model_name + "_eval_metrics_df.csv"
        eval_metrics_df.to_csv(filename)