Diff of /PredictionModel.py [000000] .. [6e0d8e]

Switch to unified view

a b/PredictionModel.py
# Import packages
from collections import Counter
import numpy as np
import collections, numpy
import mlxtend
import matplotlib
import re
# NOTE(review): removed `import pd` — there is no package named `pd`; that line
# raised ModuleNotFoundError at startup. pandas is imported as `pd` below.
import pandas as pd
from datetime import datetime
import us
from matplotlib import pyplot
import matplotlib.pyplot as plt
# Default figure size for every plot in this script.
matplotlib.rcParams['figure.figsize'] = (10, 10)

from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
# Load the pre-built feature table.
# NOTE(review): absolute user-specific path — consider a relative path or a CLI
# argument so the script runs on other machines.
final_table1 = pd.read_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table1.csv")
# final_table1['dropout_percentage_all'].median() # = 7.6602

'''
final_table.isnull().sum()
# allocation has 116 NA, completion date has 31, primary purpose has 5, minimum is 9, maximum is 536, ruca is 219
final_table.drop(columns='Maximum Age', inplace=True)
final_table.drop(columns='Allocation', inplace=True)
final_table.drop(columns='Zipcode', inplace=True)
final_table.drop(columns='New Completion Date', inplace=True)
final_table.drop(columns='Completion Date', inplace=True)
final_table['Primary Purpose'].fillna(final_table['Primary Purpose'].mode()[0], inplace=True)
final_table['length_of_trial'].fillna(final_table['length_of_trial'].median(), inplace=True)
final_table['Minimum Age'].fillna(final_table['Minimum Age'].median(), inplace=True)
final_table['RUCA2.0'].fillna(final_table['RUCA2.0'].median(), inplace=True)
'''

# Binary target: 1 = above-median dropout. The threshold is the median of
# dropout_percentage_all (see the comment above: 7.6602), hard-coded so the
# labeling is stable across data revisions.
MEDIAN_DROPOUT = 7.660191019
# .astype(float) replaces the slower element-wise .apply(lambda x: float(x));
# both coerce the column to float before comparison.
final_table1['Dropout'] = np.where(
    final_table1['dropout_percentage_all'].astype(float) > MEDIAN_DROPOUT, 1, 0)
# Categorical predictors to one-hot encode.
categorical_columns = final_table1[['Allocation', 'Trial Phase', 'Overall Status', 'Primary Purpose', 'City', 'State',
                                    'Gender']]
# numerical column names and then categorical

# One-hot encode the raw string categories directly.
# NOTE(review): the previous LabelEncoder -> OneHotEncoder two-step was
# redundant (OneHotEncoder accepts string columns since sklearn 0.20) and
# label-encoding first made get_feature_names_out produce opaque names like
# "State_4" instead of "State_CA".
enc = preprocessing.OneHotEncoder()
onehotlabels = enc.fit_transform(categorical_columns).toarray()

# Create a DataFrame with the one-hot encoded columns, named after the
# original column + category value.
onehot_df = pd.DataFrame(onehotlabels,
                         columns=enc.get_feature_names_out(categorical_columns.columns))

# Concatenate the one-hot encoded DataFrame with the original DataFrame
# 'final_table1', then drop the raw categorical columns they replace.
final_table1_encoded = pd.concat([final_table1, onehot_df], axis=1)
final_table1_encoded = final_table1_encoded.drop(categorical_columns.columns, axis=1)
# Scale the numeric predictors using StandardScaler. This is only for
# numerical variables.
# NOTE(review): 'Cleaned_Zipcodes' is an identifier-like code — scaling it as a
# continuous quantity is questionable; confirm it is meant to be a feature.
numeric_columns = ['Cleaned_Zipcodes', 'RUCA2.0', 'length_of_trial', 'Minimum Age', 'New Start Date']

for column in numeric_columns:
    # errors='coerce' turns unparseable entries into NaN instead of raising.
    final_table1_encoded[column] = pd.to_numeric(final_table1_encoded[column], errors='coerce')

numeric_data = final_table1_encoded[numeric_columns]
# Fill the NaNs introduced by coercion with each column's median:
# StandardScaler propagates NaN, and the sklearn classifiers below cannot fit
# on NaN inputs (the NaN-count prints at the end of this script were
# diagnosing exactly this).
numeric_data = numeric_data.fillna(numeric_data.median())

# Scale the numeric columns to zero mean / unit variance.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)
final_table1_encoded[numeric_columns] = scaled_data

# Columns that are identifiers, raw dates, or the raw target — not model features.
excess_columns = final_table1_encoded[['nct_id', 'dropout_percentage_all', 'Clinical Title', 'Start Date', 'Completion Date',
                               'Zipcode', 'New Start Date', 'New Completion Date']]

final_table1_encoded = final_table1_encoded.drop(excess_columns.columns, axis=1) # table has been finalized.
# Split the data and choose the column you want to train vs test.
X_df = final_table1_encoded.drop('Dropout', axis=1)
y_df = final_table1_encoded['Dropout']
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.25, random_state=2)
# Show the class balance of each split. The original bare expression
# `Counter(y_train), Counter(y_test)` only displays in a notebook; print() so
# it also works when run as a script.
print(Counter(y_train), Counter(y_test))
# ---- Logistic regression ----
# Fit the model
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Generate predictions: hard labels and class probabilities
predictions = model.predict(X_test)
predictions_proba = model.predict_proba(X_test)

# Distribution of predicted P(dropout)
plt.hist(predictions_proba[:, 1])

# Confusion matrix
cm = confusion_matrix(y_test, predictions)
plot_confusion_matrix(conf_mat=cm, show_absolute=True)
tn, fp, fn, tp = cm.ravel()

# Each metric is computed once via sklearn. (The original computed accuracy,
# recall, precision and F1 twice — once by hand from the confusion matrix and
# once with sklearn — printing identical values each time.)

# Accuracy
accuracy = accuracy_score(y_test, predictions)
print('Accuracy: %.3f' % accuracy)

# Recall / sensitivity / true-positive rate
recall = recall_score(y_test, predictions)
print('Recall: %.3f' % recall)

# Precision
precision = precision_score(y_test, predictions)
print('Precision: %.3f' % precision)

# Specificity / negative recall / true-negative rate (= 1 - false-positive
# rate); no direct sklearn helper, so derived from the confusion matrix.
specificity = tn / (tn + fp)
print('Specificity: %.3f' % specificity)

# The f1-score is the harmonic mean of precision and recall
f1 = f1_score(y_test, predictions)
print('F1: %.3f' % f1)

# AUROC = Area Under the Receiver Operating Characteristic curve.
# Uses the positive-class probability, not the hard labels.
roc_auc = roc_auc_score(y_test, predictions_proba[:, 1])
print('AUCROC: %.3f' % roc_auc)

print(classification_report(y_test, predictions))
# ---- Random forest ----
# Fit the model
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

# Predictions: class probabilities first, then hard labels
predictions_proba = model.predict_proba(X_test)
predictions = model.predict(X_test)

# Confusion matrix for the random-forest predictions
cm = confusion_matrix(y_test, predictions)
plot_confusion_matrix(conf_mat=cm, show_absolute=True)
plt.show()

# Per-class precision / recall / F1 summary
print(classification_report(y_test, predictions))

# Scalar metrics, each printed in the same "<Name>: <value>" form.
accuracy = accuracy_score(y_test, predictions)
recall = recall_score(y_test, predictions)
precision = precision_score(y_test, predictions)
# The f1-score is the harmonic mean of precision and recall.
f1 = f1_score(y_test, predictions)
for metric_name, metric_value in (('Accuracy', accuracy), ('Recall', recall),
                                  ('Precision', precision), ('F1', f1)):
    print('%s: %.3f' % (metric_name, metric_value))

# AUC-ROC from the positive-class probabilities
roc_auc = roc_auc_score(y_test, predictions_proba[:, 1])
print('AUCROC: %.3f' % roc_auc)
# ---- Feature importance using a tree-based classifier (XGBoost) ----
# Distribution of the (scaled) RUCA rurality score in the training set.
# The original bare expression `X_train['RUCA2.0'].value_counts()` was a no-op
# in a script (result discarded); print() makes it visible.
print(X_train['RUCA2.0'].value_counts())

# NOTE(review): enable_categorical=True is inert here — after one-hot encoding
# every feature is numeric, so no pandas categorical dtypes remain.
model = XGBClassifier(enable_categorical=True)
model.fit(X_train, y_train)

# Plot per-feature importance scores from the fitted booster.
fig, ax = plt.subplots(figsize=(10, 10))
plot_importance(model, ax=ax)
plt.show()

# Diagnostic: remaining NaNs in the model inputs (XGBoost tolerates NaN, but
# the sklearn models above do not).
print("NaN values in X_train:", X_train.isna().sum())
print("NaN values in X_test:", X_test.isna().sum())