# PredictionModel.py
# Import packages
from collections import Counter
import numpy as np
import re

import pandas as pd
from datetime import datetime
import us

import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (10, 10)

from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance

final_table1 = pd.read_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table1.csv")
# final_table1['dropout_percentage_all'].median()  # = 7.6602

'''
Earlier cleaning pass on final_table, kept for reference:

final_table.isnull().sum()
# NA counts: Allocation 116, Completion Date 31, Primary Purpose 5,
# Minimum Age 9, Maximum Age 536, RUCA 219
final_table.drop(columns='Maximum Age', inplace=True)
final_table.drop(columns='Allocation', inplace=True)
final_table.drop(columns='Zipcode', inplace=True)
final_table.drop(columns='New Completion Date', inplace=True)
final_table.drop(columns='Completion Date', inplace=True)
final_table['Primary Purpose'].fillna(final_table['Primary Purpose'].mode()[0], inplace=True)
final_table['length_of_trial'].fillna(final_table['length_of_trial'].median(), inplace=True)
final_table['Minimum Age'].fillna(final_table['Minimum Age'].median(), inplace=True)
final_table['RUCA2.0'].fillna(final_table['RUCA2.0'].median(), inplace=True)
'''

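# Optional: derive the threshold from the data rather than hard-coding the median
# (sketch; 'dropout_median' is not used by the original pipeline below):
dropout_median = final_table1['dropout_percentage_all'].astype(float).median()  # ~= 7.6602
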
# Label trials above the median dropout percentage as high-dropout (1), the rest as 0
final_table1['Dropout'] = np.where(final_table1['dropout_percentage_all'].astype(float) > 7.660191019, 1, 0)

# Categorical feature columns (the numerical columns are scaled separately below)
categorical_columns = final_table1[['Allocation', 'Trial Phase', 'Overall Status', 'Primary Purpose', 'City', 'State',
                                    'Gender']]

# Convert the categorical data into numerical data
le = preprocessing.LabelEncoder()
X_2 = categorical_columns.apply(le.fit_transform)

# Apply OneHotEncoder
enc = preprocessing.OneHotEncoder()
enc.fit(X_2)
onehotlabels = enc.transform(X_2).toarray()

# Create a DataFrame with the one-hot encoded columns
onehot_df = pd.DataFrame(onehotlabels, columns=enc.get_feature_names_out(categorical_columns.columns))

# Concatenate the one-hot encoded DataFrame with the original DataFrame 'final_table1'
final_table1_encoded = pd.concat([final_table1, onehot_df], axis=1)
final_table1_encoded = final_table1_encoded.drop(categorical_columns.columns, axis=1)

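# Note: because the columns are label-encoded first, the one-hot feature names come out
# as integer codes (e.g. 'State_5') rather than category strings. A minimal alternative
# sketch (assumes scikit-learn >= 1.0; not used by the pipeline below) that encodes the
# raw string columns directly and keeps readable names:
enc_direct = preprocessing.OneHotEncoder(handle_unknown='ignore')
direct_labels = enc_direct.fit_transform(categorical_columns).toarray()
direct_df = pd.DataFrame(direct_labels,
                         columns=enc_direct.get_feature_names_out(categorical_columns.columns))
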
# Scale the numeric columns using StandardScaler (this applies only to numerical variables)
numeric_columns = ['Cleaned_Zipcodes', 'RUCA2.0', 'length_of_trial', 'Minimum Age', 'New Start Date']

for column in numeric_columns:
    final_table1_encoded[column] = pd.to_numeric(final_table1_encoded[column], errors='coerce')
numeric_data = final_table1_encoded[numeric_columns]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)
final_table1_encoded[numeric_columns] = scaled_data

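# pd.to_numeric(errors='coerce') silently turns unparseable values into NaN; a quick
# check of how many values were coerced in each column (sketch, additive only):
print("NaNs after numeric coercion:\n", final_table1_encoded[numeric_columns].isna().sum())
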
# Drop identifiers, raw dates, and the raw dropout percentage (which would leak the target)
excess_columns = final_table1_encoded[['nct_id', 'dropout_percentage_all', 'Clinical Title', 'Start Date', 'Completion Date',
                                       'Zipcode', 'New Start Date', 'New Completion Date']]

final_table1_encoded = final_table1_encoded.drop(excess_columns.columns, axis=1)  # table has been finalized

# Split the data into train and test sets
X_df = final_table1_encoded.drop('Dropout', axis=1)
y_df = final_table1_encoded['Dropout']
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.25, random_state=2)
print(Counter(y_train), Counter(y_test))  # check the class balance in each split

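# Optional sketch (not the original split): stratify=y_df keeps the class ratio equal in
# both splits, and re-fitting the scaler on training rows only avoids test-set leakage.
# The numeric columns here were already scaled once above, so this is illustrative only:
kept_numeric = [c for c in numeric_columns if c in X_df.columns]
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(X_df, y_df, test_size=0.25,
                                                  random_state=2, stratify=y_df)
X_tr_s, X_te_s = X_tr_s.copy(), X_te_s.copy()
fold_scaler = StandardScaler()
X_tr_s[kept_numeric] = fold_scaler.fit_transform(X_tr_s[kept_numeric])
X_te_s[kept_numeric] = fold_scaler.transform(X_te_s[kept_numeric])
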
# Logistic regression
# Fit the model
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Generate predictions
predictions = model.predict(X_test)
predictions_proba = model.predict_proba(X_test)

# Distribution of predicted probabilities for the positive class
plt.hist(predictions_proba[:, 1])
plt.show()

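# Note: model.predict thresholds the positive-class probability at 0.5; lowering the
# threshold trades precision for recall (sketch, illustrative only):
preds_low_threshold = (predictions_proba[:, 1] >= 0.4).astype(int)
print('Recall at 0.4 threshold: %.3f' % recall_score(y_test, preds_low_threshold))
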
# Confusion matrix
cm = confusion_matrix(y_test, predictions)
plot_confusion_matrix(conf_mat=cm, show_absolute=True)
plt.show()
tn, fp, fn, tp = cm.ravel()

# Metrics computed by hand from the confusion matrix
# Accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy: %.3f' % accuracy)

# Recall / sensitivity / true positive rate
recall = sensitivity = tpr = tp / (tp + fn)
print('Recall: %.3f' % recall)

# Precision
precision = tp / (tp + fp)
print('Precision: %.3f' % precision)

# Specificity / negative recall / true negative rate (= 1 - false positive rate)
specificity = tn / (tn + fp)
print('Specificity: %.3f' % specificity)

# F1 score
f1 = 2 * (precision * recall) / (precision + recall)
print('F1: %.3f' % f1)

# The same metrics via sklearn, as a cross-check
# Accuracy
accuracy = accuracy_score(y_test, predictions)
print('Accuracy: %.3f' % accuracy)

# Recall
recall = recall_score(y_test, predictions)
print('Recall: %.3f' % recall)

# Precision
precision = precision_score(y_test, predictions)
print('Precision: %.3f' % precision)

# The F1 score is the harmonic mean of precision and recall
f1 = f1_score(y_test, predictions)
print('F1: %.3f' % f1)

# AUROC = area under the receiver operating characteristic curve
roc_auc = roc_auc_score(y_test, predictions_proba[:, 1])
print('AUROC: %.3f' % roc_auc)

print(classification_report(y_test, predictions))

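# Optional sketch: roc_curve and auc are imported above but never used; plotting the
# logistic-regression ROC curve with them (illustrative, not in the original script):
fpr_lr, tpr_lr, _ = roc_curve(y_test, predictions_proba[:, 1])
plt.plot(fpr_lr, tpr_lr, label='Logistic regression (AUC = %.3f)' % auc(fpr_lr, tpr_lr))
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()
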
# Random forest
# Fit the model
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

# Generate predictions
predictions = model.predict(X_test)
predictions_proba = model.predict_proba(X_test)

# Confusion matrix for the random forest
cm = confusion_matrix(y_test, predictions)
plot_confusion_matrix(conf_mat=cm, show_absolute=True)
plt.show()

# Classification report
print(classification_report(y_test, predictions))

# Metrics
# Accuracy
accuracy = accuracy_score(y_test, predictions)
print('Accuracy: %.3f' % accuracy)

# Recall
recall = recall_score(y_test, predictions)
print('Recall: %.3f' % recall)

# Precision
precision = precision_score(y_test, predictions)
print('Precision: %.3f' % precision)

# The F1 score is the harmonic mean of precision and recall
f1 = f1_score(y_test, predictions)
print('F1: %.3f' % f1)

# AUROC
roc_auc = roc_auc_score(y_test, predictions_proba[:, 1])
print('AUROC: %.3f' % roc_auc)

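# Optional sketch: cross_validate is imported above but never used; 5-fold cross-validation
# gives a more stable estimate of the random forest than a single split (illustrative):
cv_results = cross_validate(RandomForestClassifier(n_estimators=100),
                            X_df, y_df, cv=5, scoring=['accuracy', 'f1', 'roc_auc'])
for metric in ('test_accuracy', 'test_f1', 'test_roc_auc'):
    print('%s: %.3f +/- %.3f' % (metric, cv_results[metric].mean(), cv_results[metric].std()))
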
# Feature importance using a tree-based classifier (XGBoost)
print(X_train['RUCA2.0'].value_counts())  # quick look at the RUCA distribution
model = XGBClassifier(enable_categorical=True)
model.fit(X_train, y_train)
fig, ax = plt.subplots(figsize=(10, 10))
plot_importance(model, ax=ax)
plt.show()

# Sanity check: NaN counts per column in each split
print("NaN values in X_train:", X_train.isna().sum())
print("NaN values in X_test:", X_test.isna().sum())