Diff of /model2.py [000000] .. [ec103b]

Switch to side-by-side view

--- a
+++ b/model2.py
@@ -0,0 +1,230 @@
+import pickle
+
+import torch
+from simpletransformers.classification import (MultiLabelClassificationArgs, MultiLabelClassificationModel)
+from sklearn.metrics import accuracy_score, hamming_loss, classification_report, roc_auc_score
+from sklearn.preprocessing import MultiLabelBinarizer
+import pandas as pd
+import numpy as np
+
# Load the train/test report datasets and build the single free-text
# input column the classifier is trained on.

def _load_reports(path):
    """Read a report CSV and combine the three narrative fields into
    one 'Combined Diagnosis' column; keep only text + target columns."""
    frame = pd.read_csv(path)
    frame['Combined Diagnosis'] = (frame['Diagnosis']
                                   + frame['Gross Description']
                                   + frame['Microscopic Description'])
    return frame[['Combined Diagnosis', 'Primary Site of Cancer']]

train_df = _load_reports('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/train_data_for_model2.csv')
eval_df = _load_reports('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/test_data_for_model2.csv')

# Accumulates one summary-metrics row per trained model/epoch setting.
eval_metrics_df = pd.DataFrame(columns=['Model Type', 'Model Name', 'Epoch', 'Overall Accuracy', 'Overall AUROC Score',
                                        'Hamming Loss', 'Eval Loss'])
# ---- Multi-hot encoding for the training data ----

# Drop rows with no target label and re-index so that positional access
# (row i) matches the index label i used below and by pd.concat.
train_df = train_df.dropna(subset=['Primary Site of Cancer']).reset_index(drop=True)

# A report may list several primary sites separated by commas; normalise
# each target into a set of stripped, upper-cased site names.
train_df['Primary Site of Cancer'] = (
    train_df['Primary Site of Cancer']
    .astype('str')
    .apply(lambda value: {part.strip().upper() for part in value.split(",")})
)

# Fit the binarizer and transform in one call. The original called
# mlb.fit() and then mlb.fit_transform(), fitting the same data twice.
mlb = MultiLabelBinarizer()
encodings_df = pd.DataFrame(mlb.fit_transform(train_df['Primary Site of Cancer']),
                            columns=mlb.classes_)
cols = ["%s" % c for c in mlb.classes_]

# Merge original text with the multi-hot encodings.
train_df = pd.concat([train_df[['Combined Diagnosis']], encodings_df], axis=1)

print(cols)
# Number of distinct labels; reused later as the model's num_labels.
count = len(cols)

# Collapse the per-site 0/1 columns into one tuple-valued 'labels'
# column (simpletransformers expects one multi-hot tuple per row).
# Replaces the original O(n*k) iterrows/while loop.
train_df['labels'] = [tuple(row) for row in encodings_df.itertuples(index=False, name=None)]

train_df = train_df[['Combined Diagnosis', 'labels']]
+
# ---- Multi-hot encoding for the test data ----

# Drop rows with no target label.
# BUG FIX: the original did not reset the index after dropna() (the
# train-data path did), so the positional .iloc[i] lookups in the loop
# below and the index-aligned pd.concat were misaligned whenever any
# eval row had been dropped (wrong labels / NaN-padded rows).
eval_df = eval_df.dropna(subset=['Primary Site of Cancer']).reset_index(drop=True)

# Convert datatype of elements to string
eval_df['Primary Site of Cancer'] = eval_df['Primary Site of Cancer'].astype('str')

# Split comma-separated sites into a stripped, upper-cased set per report.
eval_df['Primary Site of Cancer'] = eval_df['Primary Site of Cancer'].apply(
    lambda value: {part.strip().upper() for part in value.split(",")}
)

# Transform with the binarizer fitted on the training labels (sites not
# seen during training are ignored by mlb.transform with a warning).
eval_df_individual_labels = pd.DataFrame(mlb.transform(eval_df['Primary Site of Cancer']), columns=cols)

# Merge original text with the multi-hot encodings.
eval_df = pd.concat([eval_df[['Combined Diagnosis']], eval_df_individual_labels], axis=1)

# Collapse the per-site 0/1 columns into one tuple-valued 'labels'
# column, replacing the original O(n*k) iterrows/while loop.
eval_df['labels'] = [tuple(row) for row in eval_df_individual_labels.itertuples(index=False, name=None)]

eval_df = eval_df[['Combined Diagnosis', 'labels']]
+
# Train and evaluate one model per epoch setting; metrics and
# predictions for each run are written out as CSV files.
for n in [2]:

    curr_epoch = "Epoch" + str(n)

    # Configure model args
    model_args = MultiLabelClassificationArgs(num_train_epochs=n)
    model_args.evaluate_during_training_steps = -1
    model_args.save_eval_checkpoints = False
    model_args.save_model_every_epoch = False
    model_args.learning_rate = 1e-5
    model_args.manual_seed = 4
    model_args.multiprocessing_chunksize = 5000
    model_args.no_cache = True
    model_args.reprocess_input_data = True
    model_args.train_batch_size = 16
    model_args.gradient_accumulation_steps = 2
    model_args.use_multiprocessing = True
    model_args.overwrite_output_dir = True
    model_args.labels_list = cols

    # model
    model_type = "roberta"
    model_name = "roberta-large"

    # Create Transformer Model (CPU-only; `count` is the label count
    # computed during train-data encoding)
    model = MultiLabelClassificationModel(model_type, model_name, num_labels=count, use_cuda=False, args=model_args)

    # Guarded so simpletransformers' multiprocessing workers can
    # re-import this module without re-running training.
    if __name__ == '__main__':

        # Train the model.
        model.train_model(train_df)
        # BUG FIX: use a context manager so the pickle file handle is
        # closed (the original leaked it). NOTE(review): pickle is not a
        # safe interchange format -- only unpickle from trusted sources;
        # each loop iteration overwrites the same file.
        with open('model2.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)

        # Predict on the evaluation texts.
        eval_texts = eval_df['Combined Diagnosis'].values.tolist()
        prediction, outputs = model.predict(eval_texts)
        outputs_df = pd.DataFrame(outputs, columns=cols)
        prediction_df = pd.DataFrame(prediction, columns=cols)

        # Save raw model outputs to csv file.
        outputs_df.to_csv("/Users/aakansha/Desktop/Model2/" + model_name + "_outputs_df.csv")

        # Save true and predicted labels to csv file.
        combined_cols_df = pd.concat([eval_df, prediction_df], axis=1)
        combined_cols_df.to_csv("/Users/aakansha/Desktop/Model2/" + model_name + "_combined_cols_df.csv")

        # Interleave true/predicted columns (duplicate names select both
        # columns, giving true-then-predicted pairs per site).
        label_accuracy_df = pd.concat([eval_df_individual_labels, prediction_df], axis=1)
        new_acc_cols_order = np.unique(
            np.array(list(zip(eval_df_individual_labels.columns, prediction_df.columns))).flatten())
        label_accuracy_df = label_accuracy_df[new_acc_cols_order]

        # Per-label accuracy / AUROC over adjacent (true, predicted)
        # column pairs.
        # BUG FIX: the original reassigned the global `count` here
        # (to 2 * number of labels), corrupting num_labels for any later
        # iteration of the epoch loop; use a local name instead.
        n_paired_cols = len(label_accuracy_df.columns)
        colnames = []
        accuracies = []
        auroc = []
        for col_idx in range(0, n_paired_cols, 2):
            actual_values = label_accuracy_df.iloc[:, col_idx].values
            predicted_values = label_accuracy_df.iloc[:, col_idx + 1].values
            acc = accuracy_score(actual_values, predicted_values)
            # temporary fix, try-except block will be removed in the
            # future with a more balanced dataset: roc_auc_score raises
            # ValueError when the true labels contain only one class.
            try:
                auroc_score = roc_auc_score(actual_values, predicted_values)
            except ValueError:
                auroc_score = 0
            colnames.append(label_accuracy_df.columns[col_idx])
            accuracies.append(acc)
            auroc.append(auroc_score)

        accuracy_auroc_df = pd.DataFrame(list(zip(colnames, accuracies, auroc)),
                                         columns=['Site', 'Accuracy', 'AUROC Score'])

        # Evaluate model (for eval_loss).
        result, model_outputs, wrong_predictions = model.eval_model(eval_df)

        # Build the true multi-hot matrix from the tuple-valued labels.
        eval_df_true = np.array([np.asarray(v) for v in eval_df['labels']])
        prediction_data = np.array(prediction)

        # Calculate overall metrics.
        overall_acc = accuracy_score(eval_df_true, prediction_data)
        # temporary fix, try-except block will be removed in the future
        # with a more balanced dataset.
        # BUG FIX: was a bare `except:` that silently swallowed every
        # error (including KeyboardInterrupt); catch only the expected
        # single-class ValueError.
        try:
            overall_auroc = roc_auc_score(eval_df_true, prediction_data)
        except ValueError:
            overall_auroc = 0
        # BUG FIX: the original assigned the result to the name
        # `hamming_loss`, shadowing the sklearn function and raising a
        # TypeError on any subsequent loop iteration.
        hamming = hamming_loss(eval_df_true, prediction_data)

        other_metrics_report = classification_report(eval_df_true, prediction_data, target_names=cols,
                                                     output_dict=True)
        classification_report_df = pd.DataFrame(other_metrics_report).transpose()

        # Combine individual accuracies into the classification report.
        classification_report_df = classification_report_df.reset_index()
        accuracy_auroc_df = accuracy_auroc_df.reset_index()
        classification_report_df_final = pd.concat([classification_report_df, accuracy_auroc_df['Accuracy'],
                                                    accuracy_auroc_df['AUROC Score']], axis=1)

        # Save classification report to csv file.
        classification_report_df_final.to_csv(
            "/Users/aakansha/Desktop/Model2/" + model_name + "_classification_metrics_df.csv")

        # Append this run's summary metrics and save.
        eval_metrics_df.loc[len(eval_metrics_df)] = [model_type, model_name, curr_epoch, overall_acc,
                                                     overall_auroc, hamming, result['eval_loss']]
        eval_metrics_df.to_csv("/Users/aakansha/Desktop/Model2/" + model_name + "_eval_metrics_df.csv")
\ No newline at end of file