[0bb4e8]: / code / 2_Model_Code.R

Download this file

161 lines (121 with data), 6.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Clinical Trial Patient Attrition Modeling Using Machine Learning
# Full Authors List
# Nampally, Sreenath; Zhang, Youyi; Khan A.N, Imran; Hutchison, Emmette;
# Khader, Shameer; Khan, Faisal
# Code Authors
# Nampally, Sreenath; Zhang, Youyi; Khan A.N, Imran;
# Contact
# shameer.khader@astrazeneca.com.
# Description of Project
# This work assembled the first large-scale sponsor independent clinical
# trial dataset for the purpose of enabling a data-driven approach to
# elucidate the drivers of patient attrition in clinical trials.
# Description of Code
# This code splits the data into training and testing, multiple machine learning
# models were trained to predict patient attrition
library("SuperLearner")
library("MASS")
library("ranger")
library("ipred")
library("kernlab")
library("arm")
library("dplyr")
library("caret")
library("glmnet")
library("parallel")
library("randomForest")
library("party")
# Set the data directories to the place where analysis ready data is stored
ds1 <- read.csv("./Indexed_Analysis_Ready.csv")
# training set by nct_id
ds2 <- unique(ds1[,c("nct_id","Disease")])
#Split training and testing into 80:20 split
set.seed(234)
NCTIndex <-createDataPartition(ds2$Disease, p = .8, list = F, times = 1)
# TRAINING set
ds3<- ds1[ds1$nct_id %in% ds2$nct_id[NCTIndex] ,]
# TEST set
ds4<- ds1[!(ds1$nct_id %in% ds2$nct_id[NCTIndex]) ,]
# Convert Character columns to numeric in the training data set ds3
# ***remove numerics ****
ds3$Disease_n <- as.numeric(as.factor(ds3$Disease))
ds3$Phase_n <- as.numeric(as.factor(ds3$Phase))
ds3$Industry_n <- as.numeric(as.factor(ds3$Industry))
# Remove the character columns and other un necessary columns
ds3 <- ds3[, !(names(ds3) %in% c("nct_id","dwp_wo_ae","dwp_sub", "Disease", "Phase", "Industry") )]
# Convert Character columns to numeric in the test data set ds4
ds4$Disease_n <- as.numeric(as.factor(ds4$Disease))
ds4$Phase_n <- as.numeric(as.factor(ds4$Phase))
ds4$Industry_n <- as.numeric(as.factor(ds4$Industry))
# Remove the character columns and other target columns
ds4 <- ds4[, !(names(ds4) %in% c("nct_id","dwp_wo_ae","dwp_sub", "dwp_all_res", "dwp_all_pred", "Disease", "Phase", "Industry") )]
# Create a test and training data set from ds3 and ds4
superlearner_train <- ds3
superlearner_test <- ds4
# Separate the predictors in to its own data frame for training data set
x.train <- subset(superlearner_train, select = -dwp_all)
y.train <- superlearner_train$dwp_all
# Separate the predictors in to its own data frame for test data set
x.test <- subset(superlearner_test, select = -dwp_all)
y.test <- superlearner_test$dwp_all
cf1 <- cforest(y.train ~ . , data= x.train, control=cforest_unbiased(mtry=2,ntree=50))
test <- varimp(cf1) # get variable importance, based on mean decrease in accuracy
# List Wrapper algorithms available in SuperLearner
listWrappers()
# Create a new function that changes just the ntree argument for RandomForest
# "..." means "all other arguments that were sent to the function"
SL.rf.better <- function(...){
SL.randomForest(..., num.trees=5000)
}
# Fit the CV.SuperLearner with the tuned randomforest along with others.
system.time({
# Set the seed
set.seed(150)
# Create a library of algorithms
SL.library <- c("SL.ranger", "SL.rf.better", "SL.randomForest", "SL.ksvm", "SL.ipredbagg", "SL.bayesglm", "SL.glm", "SL.lm", "SL.glmnet", "SL.ridge", "SL.mean")
# Tune the model
# The outercross-validation is the sample spliting for evaluating the SuperLearner.
# The inner cross-validation are the valuesnpassed to each of the V SuperLearner calls.
# V is the number of folds for CV.SuperLearner
cv.model.tune <- CV.SuperLearner(Y = y.train, X = x.train, family=gaussian(), SL.library = SL.library,
verbose = TRUE, method = "method.NNLS", cvControl = list(V = 10, shuffle = FALSE),
innerCvControl = list(list(V = 5)))
})
# Get summary statistics
summary(cv.model.tune)
df <- summary(cv.model.tune)
newdf <- df$Table
mse <- newdf[,1:2]
mse$rmse ='^'(mse$Ave,1/2) # square root of mse to calculate rmse
finaltable <- mse[c(1,2,5,6,7,8,9,10),c(1,3)] # subset only few algorithms
standardnames <- c('Super Learner', 'Discrete Super Learner', 'Random Forest', 'Support Vector Machines', 'Bagging Classification', 'Bayesian Generalized Linear Models
', 'Generalized Linear Models', 'Ordinary least squares' ) # standardize the names
final_stats_with_rmse <- data.frame("Algorithms" = standardnames, "RMSE" = finaltable[,2]) # subset the data
final_stats_with_rmse %>% mutate_at(vars(RMSE), funs(round(., 2))) # round columns values to two decimal places
## Selecting top 8 features based on variable importance
x.train_selected_vars <- x.train %>% dplyr::select(Duration.Trial, Duration.Treatment, Disease_n, AE_total_serious, N_total, GDP_weighted, AA_fraction, Age_mean)
# Fit the CV.SuperLearner with the tuned randomforest along with lm on final selected variables
system.time({
# Set the seed
set.seed(250)
# Create a library of algorithms
SL.library <- c("SL.randomForest", "SL.lm")
# Tune parameters for SL : stratifyCV, shuffle and validRows
# The outercross-validation is the sample spliting for evaluating the SuperLearner.
# The inner cross-validation are the valuesnpassed to each of the V SuperLearner calls.
# V is the number of folds for CV.SuperLearner
# A list of parameters to control the outer cross-validation process. cvControl set to 3
cv.model.tune.8predictors <- CV.SuperLearner(Y = y.train, X = x.train_selected_vars, family=gaussian(), SL.library = SL.library,
verbose = TRUE, method = "method.NNLS", cvControl = list(V = 3, shuffle = FALSE),
innerCvControl = list(list(V = 5)))
})
# Get summary statistics
summary(cv.model.tune.8predictors)
df <- summary(cv.model.tune.8predictors)
newdf <- df$Table
mse <- newdf[,1:2]
mse$rmse ='^'(mse$Ave,1/2) # square root of mse to calculate rmse
finaltable <- mse[c(3,4),c(1,3)] # subset only few algorithms
standardnames <- c('Random Forest','Ordinary least squares' ) # standardize the names
final_stats_with_rmse <- data.frame("Algorithms" = standardnames, "RMSE" = finaltable[,2]) # subset the data
final_stats_with_rmse %>% mutate_at(vars(RMSE), funs(round(., 2))) # round columns values to two decimal places