Diff of /code/2_Model_Code.R [000000] .. [0bb4e8]

Switch to side-by-side view

--- a
+++ b/code/2_Model_Code.R
@@ -0,0 +1,160 @@
+# Clinical Trial Patient Attrition Modeling Using Machine Learning
+
+# Full Authors List
+# Nampally, Sreenath; Zhang, Youyi; Khan A.N, Imran; Hutchison, Emmette; 
+# Khader, Shameer; Khan, Faisal
+
+# Code Authors
+# Nampally, Sreenath; Zhang, Youyi; Khan A.N, Imran;
+
+# Contact
+# shameer.khader@astrazeneca.com.
+
+# Description of Project
+# This work assembled the first large-scale sponsor independent clinical 
+# trial dataset for the purpose of enabling a data-driven approach to 
+# elucidate the drivers of patient attrition in clinical trials.
+
+# Description of Code
+# This code splits the data into training and testing, multiple machine learning 
+# models were trained to predict patient attrition
+
+library("SuperLearner")
+library("MASS")
+library("ranger")
+library("ipred")
+library("kernlab")
+library("arm")
+library("dplyr")
+library("caret")
+library("glmnet")
+library("parallel")
+library("randomForest")
+library("party")
+
+# Set the data directories to the place where analysis ready data is stored
+
+ds1 <- read.csv("./Indexed_Analysis_Ready.csv")
+
+# training set by nct_id
+ds2 <- unique(ds1[,c("nct_id","Disease")])
+#Split training and testing into 80:20 split
+set.seed(234)
+NCTIndex <-createDataPartition(ds2$Disease, p = .8,  list = F, times = 1)
+# TRAINING set
+ds3<- ds1[ds1$nct_id %in% ds2$nct_id[NCTIndex]  ,]
+# TEST set
+ds4<- ds1[!(ds1$nct_id %in% ds2$nct_id[NCTIndex])  ,]
+
+# Convert Character columns to numeric in the training data set ds3
+# ***remove numerics ****
+ds3$Disease_n <- as.numeric(as.factor(ds3$Disease))
+ds3$Phase_n <- as.numeric(as.factor(ds3$Phase))
+ds3$Industry_n <- as.numeric(as.factor(ds3$Industry))
+
+# Remove the character columns and other un necessary columns 
+ds3 <- ds3[, !(names(ds3) %in% c("nct_id","dwp_wo_ae","dwp_sub", "Disease", "Phase", "Industry") )] 
+
+# Convert Character columns to numeric in the test data set ds4
+ds4$Disease_n <- as.numeric(as.factor(ds4$Disease))
+ds4$Phase_n <- as.numeric(as.factor(ds4$Phase))
+ds4$Industry_n <- as.numeric(as.factor(ds4$Industry))
+
+# Remove the character columns and other target columns 
+ds4 <- ds4[, !(names(ds4) %in% c("nct_id","dwp_wo_ae","dwp_sub", "dwp_all_res", "dwp_all_pred", "Disease", "Phase", "Industry") )] 
+
+# Create a test and training data set from ds3 and ds4
+superlearner_train <- ds3
+superlearner_test <- ds4
+
+# Separate the predictors in to its own data frame for training data set
+x.train <- subset(superlearner_train, select = -dwp_all)
+y.train <- superlearner_train$dwp_all
+
+
+# Separate the predictors in to its own data frame for test data set
+x.test <- subset(superlearner_test, select = -dwp_all)
+y.test <- superlearner_test$dwp_all
+
+
+cf1 <- cforest(y.train ~ . , data= x.train, control=cforest_unbiased(mtry=2,ntree=50))
+
+test <- varimp(cf1) # get variable importance, based on mean decrease in accuracy
+
+# List Wrapper algorithms available in SuperLearner
+listWrappers()
+
+# Create a new function that changes just the ntree argument for RandomForest
+# "..." means "all other arguments that were sent to the function"
+
+SL.rf.better <- function(...){
+  SL.randomForest(..., num.trees=5000)
+}
+
+# Fit the CV.SuperLearner with the tuned randomforest along with others.  
+
+system.time({
+  # Set the seed
+  set.seed(150)
+  
+  # Create a library of algorithms
+  SL.library <- c("SL.ranger", "SL.rf.better", "SL.randomForest", "SL.ksvm", "SL.ipredbagg", "SL.bayesglm", "SL.glm", "SL.lm", "SL.glmnet", "SL.ridge", "SL.mean")
+  
+  
+  # Tune the model
+  # The outercross-validation is the sample spliting for evaluating the SuperLearner.
+  # The inner cross-validation are the valuesnpassed to each of the V SuperLearner calls.
+  # V is the number of folds for CV.SuperLearner
+  cv.model.tune <- CV.SuperLearner(Y = y.train, X = x.train, family=gaussian(), SL.library = SL.library, 
+                                   verbose = TRUE, method = "method.NNLS", cvControl = list(V = 10, shuffle = FALSE),
+                                   innerCvControl = list(list(V = 5)))
+})
+
+# Get summary statistics
+summary(cv.model.tune)
+
+df <- summary(cv.model.tune)
+newdf <- df$Table
+mse <- newdf[,1:2]
+mse$rmse ='^'(mse$Ave,1/2) # square root of mse to calculate rmse
+finaltable <- mse[c(1,2,5,6,7,8,9,10),c(1,3)] # subset only few algorithms
+standardnames <- c('Super Learner', 'Discrete Super Learner', 'Random Forest', 'Support Vector Machines', 'Bagging Classification', 'Bayesian Generalized Linear Models
+', 'Generalized Linear Models', 'Ordinary least squares' ) # standardize the names
+final_stats_with_rmse <- data.frame("Algorithms" = standardnames, "RMSE" = finaltable[,2]) # subset the data
+final_stats_with_rmse %>% mutate_at(vars(RMSE), funs(round(., 2))) # round columns values to two decimal places
+
+## Selecting top 8 features based on variable importance
+
+x.train_selected_vars <- x.train %>% dplyr::select(Duration.Trial, Duration.Treatment, Disease_n, AE_total_serious, N_total, GDP_weighted, AA_fraction, Age_mean)
+
+# Fit the CV.SuperLearner with the tuned randomforest along with lm on final selected variables  
+
+system.time({
+  # Set the seed
+  set.seed(250)
+  
+  # Create a library of algorithms
+  SL.library <- c("SL.randomForest", "SL.lm")
+  
+  
+  # Tune parameters for SL : stratifyCV, shuffle and validRows
+  # The outercross-validation is the sample spliting for evaluating the SuperLearner.
+  # The inner cross-validation are the valuesnpassed to each of the V SuperLearner calls.
+  # V is the number of folds for CV.SuperLearner
+  # A list of parameters to control the outer cross-validation process. cvControl set to 3
+  cv.model.tune.8predictors <- CV.SuperLearner(Y = y.train, X = x.train_selected_vars, family=gaussian(), SL.library = SL.library, 
+                                               verbose = TRUE, method = "method.NNLS", cvControl = list(V = 3, shuffle = FALSE),
+                                               innerCvControl = list(list(V = 5)))
+})
+
+# Get summary statistics
+summary(cv.model.tune.8predictors)
+
+df <- summary(cv.model.tune.8predictors)
+newdf <- df$Table
+mse <- newdf[,1:2]
+mse$rmse ='^'(mse$Ave,1/2) # square root of mse to calculate rmse
+finaltable <- mse[c(3,4),c(1,3)] # subset only few algorithms
+standardnames <- c('Random Forest','Ordinary least squares' ) # standardize the names
+final_stats_with_rmse <- data.frame("Algorithms" = standardnames, "RMSE" = finaltable[,2]) # subset the data
+final_stats_with_rmse %>% mutate_at(vars(RMSE), funs(round(., 2))) # round columns values to two decimal places