--- a
+++ b/model2.py
@@ -0,0 +1,230 @@
+import pickle
+
+import torch
+from simpletransformers.classification import (MultiLabelClassificationArgs,
+                                               MultiLabelClassificationModel)
+from sklearn.metrics import accuracy_score, hamming_loss, classification_report, roc_auc_score
+from sklearn.preprocessing import MultiLabelBinarizer
+import pandas as pd
+import numpy as np
+
+train_df = pd.read_csv('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/train_data_for_model2.csv')
+train_df['Combined Diagnosis'] = train_df['Diagnosis'] + train_df['Gross Description'] \
+    + train_df['Microscopic Description']
+train_df = train_df[['Combined Diagnosis', 'Primary Site of Cancer']]
+
+eval_df = pd.read_csv('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/test_data_for_model2.csv')
+eval_df['Combined Diagnosis'] = eval_df['Diagnosis'] + eval_df['Gross Description'] \
+    + eval_df['Microscopic Description']
+eval_df = eval_df[['Combined Diagnosis', 'Primary Site of Cancer']]
+
+eval_metrics_df = pd.DataFrame(columns=['Model Type', 'Model Name', 'Epoch', 'Overall Accuracy',
+                                        'Overall AUROC Score', 'Hamming Loss', 'Eval Loss'])
+
+# Multi-hot Encoding for Train Data
+
+# Drop NA rows
+train_df = train_df.dropna(subset=['Primary Site of Cancer']).reset_index(drop=True)
+
+# Convert datatype of elements to string
+train_df['Primary Site of Cancer'] = train_df['Primary Site of Cancer'].astype('str')
+
+# Split the comma-separated sites of each report into a set of normalized site names
+train_df['Primary Site of Cancer'] = train_df['Primary Site of Cancer'].apply(
+    lambda val: {s.strip().upper() for s in val.split(",")})
+
+# Initialize MultiLabelBinarizer and learn the label vocabulary from the train set
+mlb = MultiLabelBinarizer()
+mlb.fit(train_df['Primary Site of Cancer'])
+cols = [str(c) for c in mlb.classes_]
+
+# Transform the train labels into multi-hot encodings (the binarizer is already fitted)
+df = pd.DataFrame(mlb.transform(train_df['Primary Site of Cancer']), columns=cols)
+
+# Merge original text with multi-hot encodings
+train_df = pd.concat([train_df[['Combined Diagnosis']], df], axis=1)
+
+print(cols)
+
+# Collapse the per-site columns into a single tuple-valued 'labels' column
+count = len(cols)
+train_df['labels'] = df.apply(tuple, axis=1)
+train_df = train_df[['Combined Diagnosis', 'labels']]
+
+# Multi-hot Encoding for Test Data
+
+# Drop NA rows (reset the index so the concat below stays aligned)
+eval_df = eval_df.dropna(subset=['Primary Site of Cancer']).reset_index(drop=True)
+
+# Convert datatype of elements to string
+eval_df['Primary Site of Cancer'] = eval_df['Primary Site of Cancer'].astype('str')
+
+# Split the comma-separated sites of each report into a set of normalized site names
+eval_df['Primary Site of Cancer'] = eval_df['Primary Site of Cancer'].apply(
+    lambda val: {s.strip().upper() for s in val.split(",")})
+
+# Encode the eval labels with the binarizer fitted on the train set
+eval_df_individual_labels = pd.DataFrame(mlb.transform(eval_df['Primary Site of Cancer']), columns=cols)
+
+# Merge original text with multi-hot encodings
+eval_df = pd.concat([eval_df[['Combined Diagnosis']], eval_df_individual_labels], axis=1)
+
+# Collapse the per-site columns into a single tuple-valued 'labels' column
+eval_df['labels'] = eval_df_individual_labels.apply(tuple, axis=1)
+eval_df = eval_df[['Combined Diagnosis', 'labels']]
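+
+# Illustrative sketch (not part of the pipeline): MultiLabelBinarizer maps each
+# set of sites to one multi-hot row, with columns in sorted class order. The toy
+# vocabulary below is hypothetical:
+#   toy = MultiLabelBinarizer().fit([{'BREAST', 'LUNG'}, {'COLON'}])
+#   list(toy.classes_)           # ['BREAST', 'COLON', 'LUNG']
+#   toy.transform([{'LUNG'}])    # [[0, 0, 1]]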
+
+for n in [2]:
+
+    curr_epoch = "Epoch" + str(n)
+
+    # Configure model args
+    model_args = MultiLabelClassificationArgs(num_train_epochs=n)
+    model_args.evaluate_during_training_steps = -1
+    model_args.save_eval_checkpoints = False
+    model_args.save_model_every_epoch = False
+    model_args.learning_rate = 1e-5
+    model_args.manual_seed = 4
+    model_args.multiprocessing_chunksize = 5000
+    model_args.no_cache = True
+    model_args.reprocess_input_data = True
+    model_args.train_batch_size = 16
+    model_args.gradient_accumulation_steps = 2
+    model_args.use_multiprocessing = True
+    model_args.overwrite_output_dir = True
+    model_args.labels_list = cols
+
+    # Model selection
+    model_type = "roberta"
+    model_name = "roberta-large"
+
+    # Create Transformer Model
+    model = MultiLabelClassificationModel(model_type, model_name, num_labels=count, use_cuda=False,
+                                          args=model_args)
+
+    if __name__ == '__main__':
+
+        # Train the model and persist it
+        model.train_model(train_df)
+        with open('model2.pkl', 'wb') as f:
+            pickle.dump(model, f)
+
+        to_predict = eval_df['Combined Diagnosis'].values.tolist()
+
+        # Predict output
+        prediction, outputs = model.predict(to_predict)
+        outputs_df = pd.DataFrame(outputs, columns=cols)
+        prediction_df = pd.DataFrame(prediction, columns=cols)
+
+        # Save raw model outputs to csv file
+        filename = "/Users/aakansha/Desktop/Model2/" + model_name + "_outputs_df.csv"
+        outputs_df.to_csv(filename)
+
+        # Save true and predicted labels to csv file
+        combined_cols_df = pd.concat([eval_df, prediction_df], axis=1)
+        filename = "/Users/aakansha/Desktop/Model2/" + model_name + "_combined_cols_df.csv"
+        combined_cols_df.to_csv(filename)
+
+        # Calculate individual label accuracies, pairing each true column with its prediction
+        colnames = []
+        accuracies = []
+        auroc = []
+
+        for c in cols:
+            actualValue = eval_df_individual_labels[c].values
+            predictedValue = prediction_df[c].values
+            acc = accuracy_score(actualValue, predictedValue)
+            # temporary fix, try-except block will be removed in the future with a more balanced dataset
+            try:
+                auroc_score = roc_auc_score(actualValue, predictedValue)
+            except ValueError:
+                auroc_score = 0
+            colnames.append(c)
+            accuracies.append(acc)
+            auroc.append(auroc_score)
+
+        accuracy_auroc_df = pd.DataFrame(list(zip(colnames, accuracies, auroc)),
+                                         columns=['Site', 'Accuracy', 'AUROC Score'])
+
+        # Evaluate model
+        result, model_outputs, wrong_predictions = model.eval_model(eval_df)
+
+        # Stack the tuple-valued labels into a 2-D array for the sklearn metrics
+        eval_df_true = np.array(eval_df['labels'].tolist())
+        prediction_data = np.array(prediction)
+
+        # Calculate metrics
+        overall_acc = accuracy_score(eval_df_true, prediction_data)
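+
+        # Note on the metrics here (the illustration values are hypothetical):
+        # for multi-hot arrays, accuracy_score is subset accuracy, counting a row
+        # only if every label matches, while hamming_loss is the fraction of
+        # individual label mismatches.
+        #   y_true = np.array([[1, 0], [1, 1]]); y_pred = np.array([[1, 1], [1, 1]])
+        #   accuracy_score(y_true, y_pred)   # 0.5  (first row has one wrong label)
+        #   hamming_loss(y_true, y_pred)     # 0.25 (1 mismatched label out of 4)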
+        # temporary fix, try-except block will be removed in the future with a more balanced dataset
+        try:
+            overall_auroc = roc_auc_score(eval_df_true, prediction_data)
+        except ValueError:
+            overall_auroc = 0
+        # use a distinct name so the imported hamming_loss function is not shadowed
+        hamming = hamming_loss(eval_df_true, prediction_data)
+
+        other_metrics_report = classification_report(eval_df_true, prediction_data, target_names=cols,
+                                                     output_dict=True)
+        classification_report_df = pd.DataFrame(other_metrics_report).transpose()
+
+        # Combine individual accuracies with the classification report
+        classification_report_df = classification_report_df.reset_index()
+        accuracy_auroc_df = accuracy_auroc_df.reset_index()
+        classification_report_df_final = pd.concat([classification_report_df, accuracy_auroc_df['Accuracy'],
+                                                    accuracy_auroc_df['AUROC Score']], axis=1)
+
+        # Save classification report to csv file
+        filename = "/Users/aakansha/Desktop/Model2/" + model_name + "_classification_metrics_df.csv"
+        classification_report_df_final.to_csv(filename)
+
+        # Save overall metrics to csv file
+        metrics_data = [model_type, model_name, curr_epoch, overall_acc, overall_auroc, hamming,
+                        result['eval_loss']]
+        eval_metrics_df.loc[len(eval_metrics_df)] = metrics_data
+        filename = "/Users/aakansha/Desktop/Model2/" + model_name + "_eval_metrics_df.csv"
+        eval_metrics_df.to_csv(filename)
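+
+# Usage sketch (hypothetical; not exercised by the script above): the pickled
+# model written during training can be reloaded for inference on new reports.
+# The input text below is made up.
+#   with open('model2.pkl', 'rb') as f:
+#       loaded_model = pickle.load(f)
+#   preds, raw_outputs = loaded_model.predict(["combined diagnosis text for one report"])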