Diff of /R/pRRophetic.R [000000] .. [494cbf]

Switch to unified view

a b/R/pRRophetic.R
1
#' Dichotimize a training expression set and fit a logistic ridge regression model which is applied to the test expression matirx.
2
#'
3
#' Dichotimize a training expression set and fit a logistic ridge regression model which is applied to the test expression matirx.
4
#' This function will return a set of probabilities.
5
#'
6
#' @importFrom genefilter rowttests
7
#' @import ridge
8
#' @param trainingExprData Gene expression matrix for samples for which we the phenotype is already known.
9
#' @param trainingPtype The known phenotype, a vector in the same order as the columns of "trainingExprData" or with the same names as colnames of "trainingExprData".
10
#' @param testExprData Gene expression matrix for samples on which we wish to predict a phenotype. Gene names as rows, samples names as columns.
11
#' @param batchCorrect The type of batch correction to be used. Options are "eb", "none", .....
12
#' @param selection How should duplicate gene ids be handled. Default is -1 which asks the user. 1 to summarize by their or 2 to disguard all duplicates.
13
#' @param printOutput Set to FALSE to supress output
14
#' @param numGenesSelected Specifies how genes are selected for "variableSelectionMethod". Options are "tTests", "pearson" and "spearman".
15
#' @param numSens The number of sensitive cell lines to be fit in the logistic regression model.
16
#' @param numRes The number of resistant cell lines fit in the logistic regression model.
17
#' @param minNumSamples The minimum number of test samples, print an error if the number of columns of "testExprData" is below this threshold. A large number of test samples may be necessary to correct for batch effects.
18
#' @keywords internal
19
#' @return classifySamples
20
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
21
classifySamples <- function(trainingExprData, trainingPtype, testExprData, batchCorrect="eb", minNumSamples=10, selection=-1, printOutput=TRUE, numGenesSelected=1000, numSens=15, numRes=55)
22
{
23
  sensInd <- order(trainingPtype)[1:numSens]
24
  resInd <- order(trainingPtype)[(length(trainingPtype)-numRes):length(trainingPtype)]
25
26
  homDataErlot <- homogenizeData(testExprData, trainingExprData, batchCorrect=batchCorrect, selection=selection, printOutput=printOutput)
27
28
  tTests <- genefilter::rowttests(data.matrix(cbind(homDataErlot$train[, sensInd], homDataErlot$train[, resInd])), as.factor(c(rep("sens", length(sensInd)), rep("res", length(resInd)))))
29
  topCorVarGenes <- rownames(tTests[order(tTests[, "p.value"]),])[1:numGenesSelected] # this makes more sense than spearman of pearson....
30
31
  trainExpr <- t(homDataErlot$train[ topCorVarGenes, c(sensInd, resInd)])
32
  trainPtyle <- as.numeric(as.factor(c(rep("sens", length(sensInd)), rep("res", length(resInd))))) - 1
33
34
  trainDat <- data.frame(trainPtyle, trainExpr)
35
  cat("Fitting model, may take some time....\n")
36
  ridgeLogisticModel_all <- logisticRidge(trainPtyle~., data=trainDat)
37
38
  preDataLRR <- data.frame(t(homDataErlot$test[topCorVarGenes, ]))
39
  predsLRR <- predict(ridgeLogisticModel_all, preDataLRR)
40
41
  return(predsLRR)
42
}
43
44
## Functions to predict a phenotype from microarray expression data of different platforms.
45
##
46
47
#' Calculates phenotype from microarray data.
48
#'
49
#' This function uses ridge regression to calculate a phenotype from an gene expression,
50
#' given a gene expression matrix where the phenotype is already known. The function
51
#' also integrates the two datasets using a user-defined procedure, power transforms
52
#' the known phenotype and provides several other options for flexible and powerful prediction
53
#' from a gene expression matrix.
54
#'
55
#' @param trainingExprData The training data. A matrix of expression levels, rows contain genes and columns contain samples, "rownames()" must be specified and must contain the same type of gene ids as "testExprData"
56
#' @param trainingPtype The known phenotype for "trainingExprData". A numeric vector which MUST be the same length as the number of columns of "trainingExprData".
57
#' @param testExprData The test data where the phenotype will be estimted. It is a matrix of expression levels, rows contain genes and columns contain samples, "rownames()" must be specified and must contain the same type of gene ids as "trainingExprData".
58
#' @param batchCorrect How should training and test data matrices be homogenized. Choices are "eb" (default) for ComBat, "qn" for quantiles normalization or "none" for no homogenization.
59
#' @param powerTransformPhenotype Should the phenotype be power transformed before we fit the regression model? Default to TRUE, set to FALSE if the phenotype is already known to be highly normal.
60
#' @param removeLowVaryingGenes What proportion of low varying genes should be removed? 20 percent be default
61
#' @param minNumSamples How many training and test samples are requried. Print an error if below this threshold
62
#' @param selection How should duplicate gene ids be handled. Default is -1 which asks the user. 1 to summarize by their or 2 to disguard all duplicates.
63
#' @param printOutput Set to FALSE to supress output
64
#' @param removeLowVaringGenesFrom what kind of genes should be removed
65
#' @return A vector of the estimated phenotype, in the same order as the columns of "testExprData".
66
#' @import sva
67
#' @import ridge
68
#' @importFrom car powerTransform
69
#' @keywords internal
70
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
71
calcPhenotype <- function(trainingExprData, trainingPtype, testExprData, batchCorrect="eb", powerTransformPhenotype=TRUE, removeLowVaryingGenes=.2, minNumSamples=10, selection=-1, printOutput=TRUE, removeLowVaringGenesFrom="homogenizeData")
72
{
73
74
  # check if the supplied data are of the correct classes
75
  testExprData <- as.matrix(testExprData)
76
  if(class(testExprData)[1] != "matrix") stop("ERROR: \"testExprData\" must be a matrix.");
77
  if(class(trainingExprData)[1] != "matrix") stop("ERROR: \"trainingExprData\" must be a matrix.");
78
  if(class(trainingPtype)[1] != "numeric") stop("ERROR: \"trainingPtype\" must be a numeric vector.");
79
  if(ncol(trainingExprData) != length(trainingPtype)) stop("The training phenotype must be of the same length as the number of columns of the training expressin matrix.");
80
81
  # check if an adequate number of training and test samples have been supplied.
82
  if((ncol(trainingExprData) < minNumSamples) || (ncol(testExprData) < minNumSamples))
83
  {
84
    stop(paste("There are less than", minNumSamples, "samples in your test or training set. It is strongly recommended that you use larger numbers of samples in order to (a) correct for batch effects and (b) fit a reliable model. To supress this message, change the \"minNumSamples\" parameter to this function."))
85
  }
86
87
  # Get the homogenized data
88
  homData <- homogenizeData(testExprData, trainingExprData, batchCorrect=batchCorrect, selection=selection, printOutput=printOutput)
89
90
  # Do variable selection if specified. By default we remove 20% of least varying genes from the homogenized dataset.
91
  # We can also remove the intersection of the lowest 20% from both training and test sets (for the gene ids remaining in the homogenized data)
92
  # Otherwise, keep all genes.
93
  # Check batchCorrect paramter
94
  if(!(removeLowVaringGenesFrom %in% c("homogenizeData", "rawData")))
95
  {
96
    stop("\"removeLowVaringGenesFrom\" must be one of \"homogenizeData\", \"rawData\"")
97
  }
98
99
  keepRows <- seq(1:nrow(homData$train)) # by default we will keep everything
100
  if(removeLowVaryingGenes > 0 && removeLowVaryingGenes < 1) # if the proportion of things to keep is between 0 and 1
101
  {
102
    if(removeLowVaringGenesFrom == "homogenizeData") # if we are filtering based on the homoginized data
103
    {
104
      keepRows <- doVariableSelection(cbind(homData$test, homData$train), removeLowVaryingGenes=removeLowVaryingGenes)
105
      numberGenesRemoved <- nrow(homData$test) - length(keepRows)
106
      if(printOutput) cat(paste("\n", numberGenesRemoved, "low variabilty genes filtered."));
107
    }
108
    else if(removeLowVaringGenesFrom == "rawData") # if we are filtering based on the raw data, i.e. the intersection of the things filtered from both datasets.
109
    {
110
      evaluabeGenes <- rownames(homData$test) # pull the gene names from the genes remaining in the homoginized dataset
111
      keepRowsTrain <- doVariableSelection(trainingExprData[evaluabeGenes, ], removeLowVaryingGenes=removeLowVaryingGenes)
112
      keepRowsTest <- doVariableSelection(testExprData[evaluabeGenes, ], removeLowVaryingGenes=removeLowVaryingGenes)
113
      keepRows <- intersect(keepRowsTrain, keepRowsTest) # only keep things that are kept (i.e. variable) in both datasets
114
      numberGenesRemoved <- nrow(homData$test) - length(keepRows)
115
      if(printOutput) cat(paste("\n", numberGenesRemoved, "low variabilty genes filtered."));
116
    }
117
  }
118
119
  # PowerTranform phenotype if specified.
120
  offset = 0
121
  if(powerTransformPhenotype)
122
  {
123
    if(min(trainingPtype) < 0) # all numbers must be postive for a powerTranform to work, so make them positive.
124
    {
125
      offset <- -min(trainingPtype) + 1
126
      trainingPtype <- trainingPtype + offset
127
    }
128
129
    transForm <- car::powerTransform(trainingPtype)[[6]]
130
    trainingPtype <- trainingPtype^transForm
131
  }
132
133
  # create the Ridge Regression model on our training data
134
  if(printOutput) cat("\nFitting Ridge Regression model... ");
135
  trainFrame <- data.frame(Resp=trainingPtype, t(homData$train[keepRows, ]))
136
  rrModel <- ridge::linearRidge(Resp ~ ., data=trainFrame)
137
  if(printOutput) cat("Done\n\nCalculating predicted phenotype...");
138
139
  # calculate the relative contribution of each gene to the prediction
140
  # i might report these, I don't know if there's any point.
141
  totBeta <- sum(abs(coef(rrModel)))
142
  eachBeta <- abs(coef(rrModel))
143
  eachContribution <- eachBeta/totBeta
144
145
  # predict the new phenotype for the test data.
146
  # if there is a single test sample, there may be problems in predicting using the predict() function for the linearRidge package
147
  # This "if" statement provides a workaround
148
  if(class(homData$test)[1] == "numeric")
149
  {
150
    n <- names(homData$test)
151
    homData$test <- matrix(homData$test, ncol=1)
152
    rownames(homData$test) <- n
153
    testFrame <- data.frame(t(homData$test[keepRows, ]))
154
    preds <- predict(rrModel, newdata=rbind(testFrame, testFrame))[1]
155
  }
156
  else #predict for more than one test sample
157
  {
158
    testFrame <- data.frame(t(homData$test[keepRows, ]))
159
    preds <- predict(rrModel, newdata=testFrame)
160
  }
161
162
  # if the response variable was transformed, untransform it now...
163
  if(powerTransformPhenotype)
164
  {
165
    preds <- preds^(1/transForm)
166
    preds <- preds - offset
167
  }
168
  if(printOutput) cat("Done\n\n");
169
170
  return(preds)
171
}
172
173
#' Cross validation on training dataset
174
#'
175
#' This function does cross validation on a training set to estimate prediction accuracy on a training set.
176
#' If the actual test set is provided, the two datasets can be subsetted and homogenized before the
177
#' cross validation analysis is preformed. This may improve the estimate of prediction accuracy.
178
#'
179
#' @param trainingExprData The training data. A matrix of expression levels, rows contain genes and columns contain samples, "rownames()" must be specified and must contain the same type of gene ids as "testExprData"
180
#' @param trainingPtype The known phenotype for "trainingExprData". A numeric vector which MUST be the same length as the number of columns of "trainingExprData".
181
#' @param testExprData The test data where the phenotype will be estimted. It is a matrix of expression levels, rows contain genes and columns contain samples, "rownames()" must be specified and must contain the same type of gene ids as "trainingExprData".
182
#' @param cvFold Specify the "fold" requried for cross validation. "-1" will do leave one out cross validation (LOOCV)
183
#' @param powerTransformPhenotype Should the phenotype be power transformed before we fit the regression model? Default to TRUE, set to FALSE if the phenotype is already known to be highly normal.
184
#' @param batchCorrect How should training and test data matrices be homogenized. Choices are "eb" (default) for ComBat, "qn" for quantiles normalization or "none" for no homogenization.
185
#' @param removeLowVaryingGenes What proportion of low varying genes should be removed? 20 precent be default
186
#' @param minNumSamples How many training and test samples are requried. Print an error if below this threshold
187
#' @param selection How should duplicate gene ids be handled. Default is -1 which asks the user. 1 to summarize by their or 2 to disguard all duplicates.
188
#' @return An object of class "pRRopheticCv", which is a list with two members, "cvPtype" and "realPtype", which correspond to the cross valiation predicted phenotype and the  user provided measured phenotype respectively.
189
#' @import ridge
190
#' @importFrom sva ComBat
191
#' @importFrom car powerTransform
192
#' @keywords internal
193
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
194
predictionAccuracyByCv <- function(trainingExprData, trainingPtype, testExprData=-1, cvFold=-1, powerTransformPhenotype=TRUE, batchCorrect="eb", removeLowVaryingGenes=.2, minNumSamples=10, selection=1)
195
{
196
197
  # check if an adequate number of training and test samples have been supplied.
198
  if((ncol(trainingExprData) < minNumSamples))
199
  {
200
    stop(paste("There are less than", minNumSamples, "samples in your test or training set. It is strongly recommended that you use larger numbers of samples in order to (a) correct for intrinsic difference in your training and test sets and (b) fit a reliable model. To supress this message, change the \"minNumSamples\" parameter to this function."))
201
  }
202
203
  # check if a test matrix was supplied, if not, don't homogenize the data (but store it in a list called homData anyway for convenience)
204
  if(is.null(testExprData))
205
  {
206
    homData <- list()
207
    homData$selection <- selection
208
    homData$train <- trainingExprData
209
  } else if(!is.null(testExprData)) {
210
    homData <- homogenizeData(testExprData, trainingExprData, batchCorrect=batchCorrect, selection=selection) # homogenize the data.
211
  }
212
213
  nTrain <- ncol(trainingExprData)
214
  predPtype <- numeric() # a numeric vector to hold the predicted phenotypes for the CV subgroups
215
216
  # Perform either N fold cross validation or LOOCV, depending on the "cvFold" variable.
217
  if(cvFold == -1) # if we are doing leave-one-out cross-validation (LOOCV).
218
  {
219
    for(i in 1:nTrain)
220
    {
221
      testMatTemp <- matrix(homData$train[,i], ncol=1)
222
      rownames(testMatTemp) <- rownames(homData$train)
223
      #predPtype[i] <- calcPhenotype(testMatTemp, trainCvSet[,-i], trainingPtype[-i], batchCorrect="none", minNumSamples=0, selection=homData$selection, removeLowVaryingGenes=removeLowVaryingGenes, powerTransformPhenotype=powerTransformPhenotype)
224
      predPtype[i] <- calcPhenotype(homData$train[,-i], trainingPtype[-i], testMatTemp, batchCorrect="none", minNumSamples=0, selection=homData$selection, removeLowVaryingGenes=removeLowVaryingGenes, powerTransformPhenotype=powerTransformPhenotype, printOutput=FALSE)
225
226
      # print an update for each 20% of the this, this is slow, so we should give some updates....
227
      if(i %% as.integer(nTrain/5) == 0)
228
        cat(paste(i, "of" , nTrain, "iterations complete. \n"))
229
    }
230
  }
231
  else if(cvFold > 1) # if we are doing N-fold cross validation
232
  {
233
    randTestSamplesIndex <- sample(1:nTrain) # create a random order for samples
234
235
    # create a vector which indicates which samples are in which group... and split into list of groups
236
    sampleGroup <- rep(cvFold, nTrain)
237
    groupSize <- as.integer(nTrain / cvFold)
238
    for(j in 1:(cvFold-1)) { sampleGroup[(((j-1)*groupSize)+1):(j*groupSize)] <- rep(j, groupSize) }
239
    cvGroupIndexList <- split(randTestSamplesIndex, sampleGroup)
240
241
    # predict on each of the groups....
242
    for(j in 1:cvFold)
243
    {
244
245
      # create the ranomdly chosen "training" and "test" sets for cross validation
246
      testCvSet <- homData$train[, cvGroupIndexList[[j]]]
247
      trainCvSet <- homData$train[, unlist(cvGroupIndexList[-j])]
248
      trainPtypeCv <- trainingPtype[unlist(cvGroupIndexList[-j])]
249
250
      predPtype <- c(predPtype, calcPhenotype(trainCvSet, trainPtypeCv, testCvSet, batchCorrect="none", minNumSamples=0, selection=homData$selection, removeLowVaryingGenes=removeLowVaryingGenes, powerTransformPhenotype=powerTransformPhenotype, printOutput=FALSE))
251
252
      cat(paste("\n", j, " of ", cvFold, " iterations complete.", sep=""))
253
    }
254
255
    # re-order the predicted phenotypes correctly, as they were ranomized when this started.
256
    predPtype <- predPtype[order(randTestSamplesIndex)]
257
258
  } else {
259
    stop("Unrecognised value of \"cvFold\"")
260
  }
261
262
  finalData <- list(cvPtype=predPtype, realPtype=trainingPtype)
263
  class(finalData) <- "pRRopheticCv"
264
265
  return(finalData)
266
}
267
268
#' R^2 from "pRRopheticCv" object.
269
#'
270
#' Given an object of class "pRRopheticCv", i.e. the output of cross validation, calculate
271
#' the R^2 value for the prediction (an estimate of prediction accuracy).
272
#'
273
#' @param cvOutData an object of class "pRRopheticCv", i.e. the outpout of the "predictionAccuracyByCv()" function
274
#' @param powerTranform powerTranform phenotype or not.
275
#' @return a numeric vector containing the R squared value from the cross validation.
276
#' @keywords internal
277
#' @importFrom car powerTransform
278
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
279
estimateRsqr.pRRopheticCv <- function(cvOutData, powerTranform=TRUE)
280
{
281
  # calculate the R^2
282
  return(summary(lm(cvOutData$realPtype~cvOutData$cvPtype))$r.squared)
283
}
284
285
286
#' Confidence intervals from "pRRopheticCv" object.
287
#'
288
#' Given an object of class "pRRopheticCv", i.e. the output of cross validation, calculate
289
#' an average confidence interval for the predictions.
290
#'
291
#' @param cvOutData an object of class "pRRopheticCv", i.e. the outpout of the "predictionAccuracyByCv()" function
292
#' @param conf the confidence interval required, by default 95 precent confidence interval.
293
#' @return a numeric vector containing the average upper and lower confidence interval.
294
#' @keywords internal
295
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
296
estimateCI.pRRopheticCv <- function(cvOutData, conf=.95)
297
{
298
299
  # report 95% confidence intervals and 50% confidence intervals (estimated from the untransformed data)
300
  allDeviationsRaw <- cvOutData$cvPtype - cvOutData$realPtype
301
  allDeviations <- abs(allDeviationsRaw)
302
  inCi <- which(allDeviations < quantile(allDeviations, conf))
303
  ci <- c(min(allDeviationsRaw[inCi]), max(allDeviationsRaw[inCi]))
304
  return(ci)
305
}
306
307
308
#' Mean prediction error from "pRRopheticCv" object.
309
#'
310
#' Given an object of class "pRRopheticCv", estiamte the mean prediction error,
311
#' i.e. the mean difference between the predicted and measured phenotype.
312
#'
313
#' @param cvOutData an object of class "pRRopheticCv", i.e. the outpout of the "predictionAccuracyByCv()" function
314
#' @return a numeric vector containing the mean prediction error from the cross validation.
315
#' @keywords internal
316
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
317
estimateMeanPredictionError.pRRopheticCv <- function(cvOutData)
318
{
319
  allDeviationsRaw <- abs(cvOutData$cvPtype - cvOutData$realPtype)
320
  return(mean(allDeviationsRaw))
321
}
322
323
#' Median prediction error from "pRRopheticCv" object.
324
#'
325
#' Given an object of class "pRRopheticCv", estiamte the median prediction error,
326
#' i.e. the median difference between the predicted and measured phenotype.
327
#'
328
#' @param cvOutData an object of class "pRRopheticCv", i.e. the outpout of the "predictionAccuracyByCv()" function
329
#' @return a numeric vector containing the median prediction error from the cross validation.
330
#' @keywords internal
331
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
332
estimateMedianPredictionError.pRRopheticCv <- function(cvOutData)
333
{
334
  allDeviationsRaw <- abs(cvOutData$cvPtype - cvOutData$realPtype)
335
  return(mean(median))
336
}
337
338
339
#' Summary of "pRRopheticCv" object.
340
#'
341
#' Given an object of class "pRRopheticCv", print various metrics that
342
#' summarize the performance of the cross validataion analysis
343
#' @param cvOutData an object of class "pRRopheticCv", i.e. the outpout of the "predictionAccuracyByCv()" function
344
#' @keywords internal
345
#' @return summary.pRRopheticCv
346
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
347
summary.pRRopheticCv <- function(cvOutData)
348
{
349
  cat("\nSummary of cross-validation results:\n\n")
350
351
  corOut <- cor.test(cvOutData[[1]], cvOutData[[2]])
352
  cat(paste("Pearsons correlation:", round(corOut$estimate, digits=2), ", P = ", corOut$p.value, "\n"))
353
  cat(paste("R-squared value: ", round(estimateRsqr.pRRopheticCv(cvOutData), digits=2), "\n", sep=""))
354
  cis <- estimateCI.pRRopheticCv(cvOutData)
355
  cat(paste("Estimated 95% confidence intervals: ", round(cis[1], digits=2), ", ", round(cis[2], digits=2), "\n", sep=""))
356
  cat(paste("Mean prediction error: ", round(estimateMeanPredictionError.pRRopheticCv(cvOutData), digits=2), "\n\n", sep=""))
357
}
358
359
#' Plot "pRRopheticCv" object.
360
#'
361
#' Given an object of class "pRRopheticCv", plot the cross validation
362
#' predicted values against the measured values. Also plots a regression
363
#' line.
364
#'
365
#' @param cvOutData an object of class "pRRopheticCv", i.e. the outpout of the "predictionAccuracyByCv()" function
366
#' @keywords internal
367
#' @return plot.pRRopheticCv
368
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
369
plot.pRRopheticCv <- function(cvOutData)
370
{
371
  coefs <- coef(lm(cvOutData$realPtype~cvOutData$cvPtype))
372
  plot(cvOutData$cvPtype, cvOutData$realPtype, main="Measured phenotype Vs. C.V. predicted phenotype", xlab="Predicted Phenotype", ylab="Measured Phenotype")
373
  abline(coefs[1], coefs[2], col="red")
374
}
375
376
#' a function to do variable selection on expression matrices.
377
#'
378
#' This funtino will I.e. remove genes with low variation
379
#' It returns a vector of row ids to keep. Note, rownames() must be specified.
380
#'
381
#' @param exprMat a matrix of expression levels, rows contain genes and columns contain samples.
382
#' @param removeLowVaryingGenes the proportion of low varying genes to be removed.
383
#' @return a vector of row ids to keep
384
#' @keywords internal
385
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
386
doVariableSelection <- function(exprMat, removeLowVaryingGenes)
387
{
388
  vars <- apply(exprMat, 1, var)
389
  return(order(vars, decreasing=TRUE)[seq(1:as.integer(nrow(exprMat)*(1-removeLowVaryingGenes)))])
390
}
391
392
#' Take two expression matrices and return homogenized versions of the matrices.
393
#'
394
#' This function accepts two expression matrices, with gene ids as rownames() and
395
#' sample ids as colnames(). It will deal with duplicate gene ids.
396
#' subset and order the matrices correctly.
397
#' and perform homogenize the data using whatever method is specified (by default Combat from the sva library).
398
#'
399
#' @param testExprMat Gene expression matrix for samples on which we wish to predict a phenotype. Gene names as rows, samples names as columns.
400
#' @param trainExprMat Gene expression matrix for samples for which we the phenotype is already known.
401
#' @param batchCorrect The type of batch correction to be used. Options are "eb" for Combat, "none", or "qn" for quantile normalization.
402
#' @param selection parameter can be used to specify how duplicates are handled, by default value -1 means ask the user. 1 means summarize duplictes by their mean and 2 means to disguard all duplicate genes.
403
#' @param printOutput Set to FALSE to supress output
404
#' @return a list containing two entries $train and $test, which are the homogenized input matrices.
405
#' @importFrom sva ComBat
406
#' @importFrom preprocessCore normalize.quantiles
407
#' @keywords internal
408
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
409
homogenizeData <- function(testExprMat, trainExprMat, batchCorrect="eb", selection=-1, printOutput=TRUE)
410
{
411
  # Check batchCorrect paramter
412
  if(!(batchCorrect %in% c("eb", "qn", "none", "rank", "rank_then_eb", "standardize")))
413
    stop("\"batchCorrect\" must be one of \"eb\", \"qn\", \"rank\", \"rank_then_eb\", \"standardize\" or \"none\"")
414
415
  # check if both row and column names have been specified
416
  if(is.null(rownames(trainExprMat)) || is.null(rownames(testExprMat)))
417
  {
418
    stop("ERROR: Gene identifiers must be specified as \"rownames()\" on both training and test expression matrices. Both matices must have the same type of gene identifiers.")
419
  }
420
421
  # check that some of the row names overlap between both datasets (print an error if none overlap.
422
  if(sum(rownames(trainExprMat) %in% rownames(testExprMat)) == 0)
423
  {
424
    stop("ERROR: The rownames() of the supplied expression matrices do not match. Note that these are case-sensitive.")
425
  } else {
426
    if(printOutput) cat(paste("\n", sum(rownames(trainExprMat) %in% rownames(testExprMat)), " gene identifiers overlap between the supplied expression matrices... \n", paste=""));
427
  }
428
429
  # if there are duplicate gene names, give the option of removing them or summarizing them by their mean.
430
  if((sum(duplicated(rownames(trainExprMat))) > 0) || sum(sum(duplicated(rownames(testExprMat))) > 0))
431
  {
432
    if(selection == -1) #print the following if we're asking the user how to proceed.
433
    {
434
      cat("\nExpression matrix contain duplicated gene identifiers (i.e. duplicate rownames()), how would you like to proceed:")
435
      cat("\n1. Summarize duplicated gene ids by their mean value (acceptable in most cases)")
436
      cat("\n2. Disguard all duplicated genes (recommended if unsure)")
437
      cat("\n3. Abort (if you want to deal with duplicate genes ids manually)\n")
438
    }
439
440
    while(is.na(selection) | selection <= 0 | selection > 3 )
441
    {
442
      selection <- readline("Selection: ")
443
      selection <- ifelse(grepl("[^1-3.]", selection), -1 , as.numeric(selection))
444
    }
445
446
    cat("\n")
447
448
    if(selection == 1) # summarize duplicates by their mean
449
    {
450
      if((sum(duplicated(rownames(trainExprMat))) > 0))
451
      {
452
        trainExprMat <- summarizeGenesByMean(trainExprMat)
453
      }
454
      if((sum(duplicated(rownames(testExprMat))) > 0))
455
      {
456
        testExprMat <- summarizeGenesByMean(testExprMat)
457
      }
458
    }
459
    else if(selection == 2) # disguard all duplicated genes
460
    {
461
      if((sum(duplicated(rownames(trainExprMat))) > 0))
462
      {
463
        keepGenes <- names(which(table(rownames(trainExprMat)) == 1))
464
        trainExprMat <- trainExprMat[keepGenes, ]
465
      }
466
467
      if((sum(duplicated(rownames(testExprMat))) > 0))
468
      {
469
        keepGenes <- names(which(table(rownames(testExprMat)) == 1))
470
        testExprMat <- testExprMat[keepGenes, ]
471
      }
472
    } else {
473
      stop("Exectution Aborted!")
474
    }
475
  }
476
477
  # subset and order gene ids on the expression matrices
478
  commonGenesIds <- rownames(trainExprMat)[rownames(trainExprMat) %in% rownames(testExprMat)]
479
  trainExprMat <- trainExprMat[commonGenesIds, ]
480
  testExprMat <- testExprMat[commonGenesIds, ]
481
482
  # subset and order the two expresison matrices
483
  if(batchCorrect == "eb")
484
  {
485
    # subset to common genes andbatch correct using ComBat
486
    dataMat <- cbind(trainExprMat, testExprMat)
487
    mod <- data.frame("(Intercept)"=rep(1, ncol(dataMat)))
488
    rownames(mod) <- colnames(dataMat)
489
    whichbatch <- as.factor(c(rep("train", ncol(trainExprMat)), rep("test", ncol(testExprMat))))
490
    combatout <- sva::ComBat(dataMat, whichbatch, mod=mod)
491
    return(list(train=combatout[, whichbatch=="train"], test=combatout[, whichbatch=="test"], selection=selection))
492
  }
493
  else if(batchCorrect == "standardize") # standardize to mean 0 and variance 1 in each dataset, using a non EB based approach
494
  {
495
    for(i in 1:nrow(trainExprMat))
496
    {
497
      row <- trainExprMat[i, ]
498
      trainExprMat[i, ] <- ((row - mean(row)) / sd(row))
499
    }
500
    for(i in 1:nrow(testExprMat))
501
    {
502
      row <- testExprMat[i, ]
503
      testExprMat[i, ] <- ((row - mean(row)) / sd(row))
504
    }
505
    return(list(train=trainExprMat, test=testExprMat, selection=selection))
506
  }
507
  else if(batchCorrect == "rank") # the random-rank transform approach, that may be better when applying models to RNA-seq data....
508
  {
509
    for(i in 1:nrow(trainExprMat))
510
    {
511
      trainExprMat[i, ] <- rank(trainExprMat[i, ], ties.method="random")
512
    }
513
    for(i in 1:nrow(testExprMat))
514
    {
515
      testExprMat[i, ] <- rank(testExprMat[i, ], ties.method="random")
516
    }
517
    return(list(train=trainExprMat, test=testExprMat, selection=selection))
518
  }
519
  else if(batchCorrect == "rank_then_eb") # rank-transform the RNA-seq data, then apply ComBat....
520
  {
521
    # first, rank transform the RNA-seq data
522
    for(i in 1:nrow(testExprMat))
523
    {
524
      testExprMat[i, ] <- rank(testExprMat[i, ], ties.method="random")
525
    }
526
527
    # subset to common genes andbatch correct using ComBat
528
    dataMat <- cbind(trainExprMat, testExprMat)
529
    mod <- data.frame("(Intercept)"=rep(1, ncol(dataMat)))
530
    rownames(mod) <- colnames(dataMat)
531
    whichbatch <- as.factor(c(rep("train", ncol(trainExprMat)), rep("test", ncol(testExprMat))))
532
    combatout <- sva::ComBat(dataMat, whichbatch, mod=mod)
533
    return(list(train=combatout[, whichbatch=="train"], test=combatout[, whichbatch=="test"], selection=selection))
534
  }
535
  else if(batchCorrect == "qn")
536
  {
537
    dataMat <- cbind(trainExprMat, testExprMat)
538
    dataMatNorm <- preprocessCore::normalize.quantiles(dataMat)
539
    whichbatch <- as.factor(c(rep("train", ncol(trainExprMat)), rep("test", ncol(testExprMat))))
540
    return(list(train=dataMatNorm[, whichbatch=="train"], test=dataMatNorm[, whichbatch=="test"], selection=selection))
541
  } else {
542
    return(list(train=trainExprMat, test=testExprMat, selection=selection))
543
  }
544
}
545
546
547
## This file contains functions for prediction and classification from the CGP cell line data....
548
549
#' Given a gene expression matrix, predict drug senstivity for a drug in CGP
550
#'
551
#' Given a gene expression matrix, predict drug senstivity for a drug in CGP.
552
#'
553
#' @param testMatrix a gene expression matrix with gene names as row ids and sample names as column ids.
554
#' @param drug the name of the drug for which you would like to predict sensitivity, one of A.443654, A.770041, ABT.263, ABT.888, AG.014699, AICAR, AKT.inhibitor.VIII, AMG.706, AP.24534, AS601245, ATRA, AUY922, Axitinib, AZ628, AZD.0530, AZD.2281, AZD6244, AZD6482, AZD7762, AZD8055, BAY.61.3606, Bexarotene, BI.2536, BIBW2992, Bicalutamide, BI.D1870, BIRB.0796, Bleomycin, BMS.509744, BMS.536924, BMS.708163, BMS.754807, Bortezomib, Bosutinib, Bryostatin.1, BX.795, Camptothecin, CCT007093, CCT018159, CEP.701, CGP.082996, CGP.60474, CHIR.99021, CI.1040, Cisplatin, CMK, Cyclopamine, Cytarabine, Dasatinib, DMOG, Docetaxel, Doxorubicin, EHT.1864, Elesclomol, Embelin, Epothilone.B, Erlotinib, Etoposide, FH535, FTI.277, GDC.0449, GDC0941, Gefitinib, Gemcitabine, GNF.2, GSK269962A, GSK.650394, GW.441756, GW843682X, Imatinib, IPA.3, JNJ.26854165, JNK.9L, JNK.Inhibitor.VIII, JW.7.52.1, KIN001.135, KU.55933, Lapatinib, Lenalidomide, LFM.A13, Metformin, Methotrexate, MG.132, Midostaurin, Mitomycin.C, MK.2206, MS.275, Nilotinib, NSC.87877, NU.7441, Nutlin.3a, NVP.BEZ235, NVP.TAE684, Obatoclax.Mesylate, OSI.906, PAC.1, Paclitaxel, Parthenolide, Pazopanib, PD.0325901, PD.0332991, PD.173074, PF.02341066, PF.4708671, PF.562271, PHA.665752, PLX4720, Pyrimethamine, QS11, Rapamycin, RDEA119, RO.3306, Roscovitine, Salubrinal, SB.216763, SB590885, Shikonin, SL.0101.1, Sorafenib, S.Trityl.L.cysteine, Sunitinib, Temsirolimus, Thapsigargin, Tipifarnib, TW.37, Vinblastine, Vinorelbine, Vorinostat, VX.680, VX.702, WH.4.023, WO2009093972, WZ.1.84, X17.AAG, X681640, XMD8.85, Z.LLNle.CHO, ZM.447439.
555
#' @param tissueType specify if you would like to traing the models on only a subset of the CGP cell lines (based on the tissue type from which the cell lines originated). This be one any of "all" (for everything, default option), "allSolidTumors" (everything except for blood), "blood", "breast", "CNS", "GI tract" ,"lung", "skin", "upper aerodigestive"
556
#' @param batchCorrect How should training and test data matrices be homogenized. Choices are "eb" (default) for ComBat, "qn" for quantiles normalization or "none" for no homogenization.
557
#' @param powerTransformPhenotype Should the phenotype be power transformed before we fit the regression model? Default to TRUE, set to FALSE if the phenotype is already known to be highly normal.
558
#' @param removeLowVaryingGenes What proportion of low varying genes should be removed? 20 percent be default
559
#' @param minNumSamples How many training and test samples are requried. Print an error if below this threshold
560
#' @param selection How should duplicate gene ids be handled. Default is -1 which asks the user. 1 to summarize by their or 2 to disguard all duplicates.
561
#' @param printOutput Set to FALSE to supress output
562
#' @param removeLowVaringGenesFrom what kind of genes should be removed
563
#' @param dataset version of GDSC dataset
564
#' @return a gene expression matrix that does not contain duplicate gene ids
565
#' @keywords internal
566
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
567
pRRopheticPredict <- function(testMatrix, drug, tissueType="all", batchCorrect="eb", powerTransformPhenotype=TRUE, removeLowVaryingGenes=.2, minNumSamples=10, selection=-1, printOutput=TRUE, removeLowVaringGenesFrom="homogenizeData", dataset="cgp2014")
568
{
569
  cgpTrainData <- getCGPinfo(drug, tissueType, dataset) # get the IC50 and expression data for this drug/tissueType
570
571
  if(!all(!duplicated(colnames(cgpTrainData$trainDataOrd)))) { # remove duplicated gene expression and IC50 in the same cell lines
572
    cgpTrainData$trainDataOrd <- cgpTrainData$trainDataOrd[,!duplicated(colnames(cgpTrainData$trainDataOrd))]
573
    cgpTrainData$ic50sOrd <- cgpTrainData$ic50sOrd[!duplicated(names(cgpTrainData$ic50sOrd))]
574
  }
575
576
  predictedPtype <- calcPhenotype(cgpTrainData$trainDataOrd, cgpTrainData$ic50sOrd, testMatrix, batchCorrect=batchCorrect, powerTransformPhenotype=powerTransformPhenotype, removeLowVaryingGenes=removeLowVaryingGenes, minNumSamples=minNumSamples, selection=selection, printOutput=printOutput, removeLowVaringGenesFrom=removeLowVaringGenesFrom)
577
578
  return(predictedPtype)
579
580
}
581
582
583
#' This function uses X fold cross validation on the TrainingSet to estimate the accuracy of the
584
#' phenotype prediction fold: How many fold cross-validation to use.
585
#'
586
#' This function does cross validation on a training set to estimate prediction accuracy on a training set.
587
#' If the actual test set is provided, the two datasets can be subsetted and homogenized before the
588
#' cross validation analysis is preformed. This may improve the estimate of prediction accuracy.
589
#'
590
#' @param testExprData The test data where the phenotype will be estimted. It is a matrix of expression levels, rows contain genes and columns contain samples, "rownames()" must be specified and must contain the same type of gene ids as "trainingExprData".
591
#' @param tissueType specify if you would like to traing the models on only a subset of the CGP cell lines (based on the tissue type from which the cell lines originated). This be one any of "all" (for everything, default option), "allSolidTumors" (everything except for blood), "blood", "breast", "CNS", "GI tract" ,"lung", "skin", "upper aerodigestive"
592
#' @param drug the name of the drug for which you would like to predict sensitivity, one of A.443654, A.770041, ABT.263, ABT.888, AG.014699, AICAR, AKT.inhibitor.VIII, AMG.706, AP.24534, AS601245, ATRA, AUY922, Axitinib, AZ628, AZD.0530, AZD.2281, AZD6244, AZD6482, AZD7762, AZD8055, BAY.61.3606, Bexarotene, BI.2536, BIBW2992, Bicalutamide, BI.D1870, BIRB.0796, Bleomycin, BMS.509744, BMS.536924, BMS.708163, BMS.754807, Bortezomib, Bosutinib, Bryostatin.1, BX.795, Camptothecin, CCT007093, CCT018159, CEP.701, CGP.082996, CGP.60474, CHIR.99021, CI.1040, Cisplatin, CMK, Cyclopamine, Cytarabine, Dasatinib, DMOG, Docetaxel, Doxorubicin, EHT.1864, Elesclomol, Embelin, Epothilone.B, Erlotinib, Etoposide, FH535, FTI.277, GDC.0449, GDC0941, Gefitinib, Gemcitabine, GNF.2, GSK269962A, GSK.650394, GW.441756, GW843682X, Imatinib, IPA.3, JNJ.26854165, JNK.9L, JNK.Inhibitor.VIII, JW.7.52.1, KIN001.135, KU.55933, Lapatinib, Lenalidomide, LFM.A13, Metformin, Methotrexate, MG.132, Midostaurin, Mitomycin.C, MK.2206, MS.275, Nilotinib, NSC.87877, NU.7441, Nutlin.3a, NVP.BEZ235, NVP.TAE684, Obatoclax.Mesylate, OSI.906, PAC.1, Paclitaxel, Parthenolide, Pazopanib, PD.0325901, PD.0332991, PD.173074, PF.02341066, PF.4708671, PF.562271, PHA.665752, PLX4720, Pyrimethamine, QS11, Rapamycin, RDEA119, RO.3306, Roscovitine, Salubrinal, SB.216763, SB590885, Shikonin, SL.0101.1, Sorafenib, S.Trityl.L.cysteine, Sunitinib, Temsirolimus, Thapsigargin, Tipifarnib, TW.37, Vinblastine, Vinorelbine, Vorinostat, VX.680, VX.702, WH.4.023, WO2009093972, WZ.1.84, X17.AAG, X681640, XMD8.85, Z.LLNle.CHO, ZM.447439.
593
#' @param cvFold Specify the "fold" requried for cross validation. "-1" will do leave one out cross validation (LOOCV)
594
#' @param powerTransformPhenotype Should the phenotype be power transformed before we fit the regression model? Default to TRUE, set to FALSE if the phenotype is already known to be highly normal.
595
#' @param batchCorrect How should training and test data matrices be homogenized. Choices are "eb" (default) for ComBat, "qn" for quantiles normalization or "none" for no homogenization.
596
#' @param removeLowVaryingGenes What proportion of low varying genes should be removed? 20 percent by default.
597
#' @param minNumSamples How many training and test samples are requried. Print an error if below this threshold
598
#' @param selection How should duplicate gene ids be handled. Default is -1 which asks the user. 1 to summarize by their or 2 to disguard all duplicates.
599
#' @return An object of class "pRRopheticCv", which is a list with two members, "cvPtype" and "realPtype", which correspond to the cross valiation predicted phenotype and the  user provided measured phenotype respectively.
600
#' @import ridge
601
#' @importFrom sva ComBat
602
#' @importFrom car powerTransform
603
#' @keywords internal
604
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
605
pRRopheticCV <- function(drug, tissueType="all", testExprData=NULL, cvFold=-1, powerTransformPhenotype=TRUE, batchCorrect="eb", removeLowVaryingGenes=.2, minNumSamples=10, selection=1)
606
{
607
  cgpTrainData <- getCGPinfo(drug, tissueType) # get the IC50 and expression data for this drug/tissueType
608
609
  # I may need to alter this function so it can either take the test data or not take the test data...
610
  cvOut <- predictionAccuracyByCv(cgpTrainData$trainDataOrd, cgpTrainData$ic50sOrd, testExprData=testExprData, cvFold=cvFold, powerTransformPhenotype=powerTransformPhenotype, batchCorrect=batchCorrect, removeLowVaryingGenes=removeLowVaryingGenes, minNumSamples=minNumSamples, selection=selection)
611
  return(cvOut)
612
}
613
614
#' Given a drug and tissue type, return CGP expression and drug sensitivity data.
615
#'
616
#' Given a drug and tissue type, return CGP expression and drug sensitivity data.
617
#'
618
#' @param drug The name of the drug for which you would like to predict sensitivity, one of A.443654, A.770041, ABT.263, ABT.888, AG.014699, AICAR, AKT.inhibitor.VIII, AMG.706, AP.24534, AS601245, ATRA, AUY922, Axitinib, AZ628, AZD.0530, AZD.2281, AZD6244, AZD6482, AZD7762, AZD8055, BAY.61.3606, Bexarotene, BI.2536, BIBW2992, Bicalutamide, BI.D1870, BIRB.0796, Bleomycin, BMS.509744, BMS.536924, BMS.708163, BMS.754807, Bortezomib, Bosutinib, Bryostatin.1, BX.795, Camptothecin, CCT007093, CCT018159, CEP.701, CGP.082996, CGP.60474, CHIR.99021, CI.1040, Cisplatin, CMK, Cyclopamine, Cytarabine, Dasatinib, DMOG, Docetaxel, Doxorubicin, EHT.1864, Elesclomol, Embelin, Epothilone.B, Erlotinib, Etoposide, FH535, FTI.277, GDC.0449, GDC0941, Gefitinib, Gemcitabine, GNF.2, GSK269962A, GSK.650394, GW.441756, GW843682X, Imatinib, IPA.3, JNJ.26854165, JNK.9L, JNK.Inhibitor.VIII, JW.7.52.1, KIN001.135, KU.55933, Lapatinib, Lenalidomide, LFM.A13, Metformin, Methotrexate, MG.132, Midostaurin, Mitomycin.C, MK.2206, MS.275, Nilotinib, NSC.87877, NU.7441, Nutlin.3a, NVP.BEZ235, NVP.TAE684, Obatoclax.Mesylate, OSI.906, PAC.1, Paclitaxel, Parthenolide, Pazopanib, PD.0325901, PD.0332991, PD.173074, PF.02341066, PF.4708671, PF.562271, PHA.665752, PLX4720, Pyrimethamine, QS11, Rapamycin, RDEA119, RO.3306, Roscovitine, Salubrinal, SB.216763, SB590885, Shikonin, SL.0101.1, Sorafenib, S.Trityl.L.cysteine, Sunitinib, Temsirolimus, Thapsigargin, Tipifarnib, TW.37, Vinblastine, Vinorelbine, Vorinostat, VX.680, VX.702, WH.4.023, WO2009093972, WZ.1.84, X17.AAG, X681640, XMD8.85, Z.LLNle.CHO, ZM.447439.
619
#' @param tissueType Specify if you would like to traing the models on only a subset of the CGP cell lines (based on the tissue type from which the cell lines originated). This be one any of "all" (for everything, default option), "allSolidTumors" (everything except for blood), "blood", "breast", "CNS", "GI tract" ,"lung", "skin", "upper aerodigestive"
620
#' @param dataset The dataset from which you wish to build the predictive models. Default is "cgp2012", also available "cgp2016", comming soon "ctrp".
621
#' @return a list with two entries, trainDataOrd the ordered expression data and ic50sOrd the drug sensitivity data.
622
#' @keywords internal
623
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
624
getCGPinfo <-  function(drug, tissueType="all", dataset="cgp2014")
625
{
626
  # list of possible datasets that can be used.
627
  possibleDatasets <- c("cgp2014", "cgp2016")
628
  if(!dataset %in% possibleDatasets) stop(paste("ERROR: the dataset specified was not found. Note dataset names are case sensitive. Please select from: ", (paste(possibleDatasets, collapse=", "))));
629
630
  if(dataset == "cgp2014")
631
  {
632
    # was a valid tissue type specified; tissue types represeted by > 40 cell lines
633
    if(!tissueType %in% c("all", "allSolidTumors", "blood", "breast", "CNS", "GI tract" ,"lung", "skin", "upper aerodigestive")) stop("ERROR: the tissue type specified must be one of \"all\", \"allSolidTumors\", \"blood\", \"breast\", \"CNS\", \"GI tract\", \"lung\", \"skin\" or \"upper aerodigestive\". These tissue types are represented by at least 40 cell lines (although not all 40 may have been screened with each drug).");
634
635
    # was a valid drug specified
636
    possibleDrugs <- c("A.443654", "A.770041", "ABT.263", "ABT.888", "AG.014699", "AICAR", "AKT.inhibitor.VIII", "AMG.706", "AP.24534", "AS601245", "ATRA", "AUY922", "Axitinib", "AZ628", "AZD.0530", "AZD.2281", "AZD6244", "AZD6482", "AZD7762", "AZD8055", "BAY.61.3606", "Bexarotene", "BI.2536", "BIBW2992", "Bicalutamide", "BI.D1870", "BIRB.0796", "Bleomycin", "BMS.509744", "BMS.536924", "BMS.708163", "BMS.754807", "Bortezomib", "Bosutinib", "Bryostatin.1", "BX.795", "Camptothecin", "CCT007093", "CCT018159", "CEP.701", "CGP.082996", "CGP.60474", "CHIR.99021", "CI.1040", "Cisplatin", "CMK", "Cyclopamine", "Cytarabine", "Dasatinib", "DMOG", "Docetaxel", "Doxorubicin", "EHT.1864", "Elesclomol", "Embelin", "Epothilone.B", "Erlotinib", "Etoposide", "FH535", "FTI.277", "GDC.0449", "GDC0941", "Gefitinib", "Gemcitabine", "GNF.2", "GSK269962A", "GSK.650394", "GW.441756", "GW843682X", "Imatinib", "IPA.3", "JNJ.26854165", "JNK.9L", "JNK.Inhibitor.VIII", "JW.7.52.1", "KIN001.135", "KU.55933", "Lapatinib", "Lenalidomide", "LFM.A13", "Metformin", "Methotrexate", "MG.132", "Midostaurin", "Mitomycin.C", "MK.2206", "MS.275", "Nilotinib", "NSC.87877", "NU.7441", "Nutlin.3a", "NVP.BEZ235", "NVP.TAE684", "Obatoclax.Mesylate", "OSI.906", "PAC.1", "Paclitaxel", "Parthenolide", "Pazopanib", "PD.0325901", "PD.0332991", "PD.173074", "PF.02341066", "PF.4708671", "PF.562271", "PHA.665752", "PLX4720", "Pyrimethamine", "QS11", "Rapamycin", "RDEA119", "RO.3306", "Roscovitine", "Salubrinal", "SB.216763", "SB590885", "Shikonin", "SL.0101.1", "Sorafenib", "S.Trityl.L.cysteine", "Sunitinib", "Temsirolimus", "Thapsigargin", "Tipifarnib", "TW.37", "Vinblastine", "Vinorelbine", "Vorinostat", "VX.680", "VX.702", "WH.4.023", "WO2009093972", "WZ.1.84", "X17.AAG", "X681640", "XMD8.85", "Z.LLNle.CHO", "ZM.447439")
637
    if(!drug %in% possibleDrugs) stop(paste("ERROR: the drug specified was not found. Note drug names are case sensitive. Please select from: ", (paste(possibleDrugs, collapse=", "))));
638
639
    # load("/home/pgeeleher/postdoc_stuff/HDAC_project/Scripts/r_package_files/data/drugAndPhenoCgp.RData") # contains drugToCellLineDataCgp (to map cell lines to .CEL file names), gdsc_brainarray_syms (the gene expression data), drugSensitivityDataCgp (the drug sensitivity data)
640
    data("drugAndPhenoCgp") # contains drugToCellLineDataCgp (to map cell lines to .CEL file names), gdsc_brainarray_syms (the gene expression data), drugSensitivityDataCgp (the drug sensitivity data)
641
642
    colIc50Name <- paste(drug, "_IC_50", sep="")
643
    ic50s <- as.numeric(drugSensitivityDataCgp[, colIc50Name])
644
    names(ic50s) <- drugSensitivityDataCgp[ ,"Cell.Line"]
645
    whichNas <- which(is.na(ic50s))
646
    ic50s <- ic50s[-whichNas]
647
    tissue <- drugSensitivityDataCgp[ ,"Cancer.Type"]
648
    names(tissue) <- drugSensitivityDataCgp[ ,"Cell.Line"]
649
    tissue <- tissue[-whichNas]
650
651
    # if a tissue type has been specified, use only tissues of that type.
652
    if(tissueType != "all")
653
    {
654
      if(tissueType == "allSolidTumors")
655
      {
656
        tissueType <- ic50s <- ic50s[!(tissue %in% "blood")]
657
      }
658
      else
659
      {
660
        ic50s <- ic50s[tissue %in% tissueType]
661
      }
662
    }
663
664
    # map the drug sensitivity and expression data
665
    pDataUnique <- drugToCellLineDataCgp[drugToCellLineDataCgp$Source.Name %in%
666
                                           names(which(table(drugToCellLineDataCgp$Source.Name) == 1)), ]
667
    rownames(pDataUnique) <- pDataUnique$Source.Name
668
    commonCellLines <- rownames(pDataUnique)[rownames(pDataUnique) %in% names(ic50s)]
669
    pDataUniqueOrd <- pDataUnique[commonCellLines, ]
670
    ic50sOrd <- ic50s[commonCellLines]
671
    trainDataOrd <- gdsc_brainarray_syms[, pDataUniqueOrd$"Array.Data.File"]
672
673
    return(list(ic50sOrd=ic50sOrd, trainDataOrd=trainDataOrd))
674
  }
675
  else if(dataset == "cgp2016")
676
  {
677
    cat("\nUsing updated CGP 2016 datsets for prediction\n\n")
678
679
    # was a valid tissue type specified; tissue types represeted by > 40 cell lines
680
    if(!tissueType %in% c("all", "aero_digestive_tract", "blood", "bone", "breast", "digestive_system", "lung", "nervous_system", "skin", "urogenital_system")) stop("ERROR: the tissue type specified must be one of \"all\", \"aero_digestive_tract\", \"blood\", \"bone\", \"breast\", \"digestive_system\", \"lung\", \"nervous_system\", \"skin\", \"urogenital_system\". These tissue types are represented by at least 40 cell lines (although not all 40 may have been screened with each drug).");
681
682
    # was a valid drug specified
683
    possibleDrugs2016 <- c("Erlotinib", "Rapamycin", "Sunitinib", "PHA-665752", "MG-132", "Paclitaxel", "Cyclopamine", "AZ628", "Sorafenib", "VX-680", "Imatinib", "TAE684", "Crizotinib", "Saracatinib", "S-Trityl-L-cysteine", "Z-LLNle-CHO", "Dasatinib", "GNF-2", "CGP-60474", "CGP-082996", "A-770041", "WH-4-023", "WZ-1-84", "BI-2536", "BMS-536924", "BMS-509744", "CMK", "Pyrimethamine", "JW-7-52-1", "A-443654", "GW843682X", "MS-275", "Parthenolide", "KIN001-135", "TGX221", "Bortezomib", "XMD8-85", "Roscovitine", "Salubrinal", "Lapatinib", "GSK269962A", "Doxorubicin", "Etoposide", "Gemcitabine", "Mitomycin C", "Vinorelbine", "NSC-87877", "Bicalutamide", "QS11", "CP466722", "Midostaurin", "CHIR-99021", "AP-24534", "AZD6482", "JNK-9L", "PF-562271", "HG-6-64-1", "JQ1", "JQ12", "DMOG", "FTI-277", "OSU-03012", "Shikonin", "AKT inhibitor VIII", "Embelin", "FH535", "PAC-1", "IPA-3", "GSK-650394", "BAY 61-3606", "5-Fluorouracil", "Thapsigargin", "Obatoclax Mesylate", "BMS-754807", "Lisitinib", "Bexarotene", "Bleomycin", "LFM-A13", "GW-2580", "AUY922", "Phenformin", "Bryostatin 1", "Pazopanib", "LAQ824", "Epothilone B", "GSK1904529A", "BMS345541", "Tipifarnib", "BMS-708163", "Ruxolitinib", "AS601245", "Ispinesib Mesylate", "TL-2-105", "AT-7519", "TAK-715", "BX-912", "ZSTK474", "AS605240", "Genentech Cpd 10", "GSK1070916", "KIN001-102", "LY317615", "GSK429286A", "FMK", "QL-XII-47", "CAL-101", "UNC0638", "XL-184", "WZ3105", "XMD14-99", "AC220", "CP724714", "JW-7-24-1", "NPK76-II-72-1", "STF-62247", "NG-25", "TL-1-85", "VX-11e", "FR-180204", "Tubastatin A", "Zibotentan", "YM155", "NSC-207895", "VNLG/124", "AR-42", "CUDC-101", "Belinostat", "I-BET-762", "CAY10603", "Linifanib ", "BIX02189", "CH5424802", "EKB-569", "GSK2126458", "KIN001-236", "KIN001-244", "KIN001-055", "KIN001-260", "KIN001-266", "Masitinib", "MP470", "MPS-1-IN-1", "BHG712", "OSI-930", "OSI-027", "CX-5461", "PHA-793887", "PI-103", "PIK-93", "SB52334", "TPCA-1", "TG101348", "Foretinib", "Y-39983", "YM201636", "Tivozanib", "GSK690693", "SNX-2112", "QL-XI-92", "XMD13-2", "QL-X-138", "XMD15-27", "T0901317", "EX-527", "THZ-2-49", "KIN001-270", "THZ-2-102-1", "AICAR", "Camptothecin", "Vinblastine", "Cisplatin", "Cytarabine", "Docetaxel", "Methotrexate", "ATRA", "Gefitinib", "Navitoclax", "Vorinostat", "Nilotinib", "RDEA119", "CI-1040", "Temsirolimus", "Olaparib", "Veliparib", "Bosutinib", "Lenalidomide", "Axitinib", "AZD7762", "GW 441756", "CEP-701", "SB 216763", "17-AAG", "VX-702", "AMG-706", "KU-55933", "Elesclomol", "Afatinib", "GDC0449", "PLX4720", "BX-795", "NU-7441", "SL 0101-1", "BIRB 0796", "JNK Inhibitor VIII", "681640", "Nutlin-3a (-)", "PD-173074", "ZM-447439", "RO-3306", "MK-2206", "PD-0332991", "BEZ235", "GDC0941", "AZD8055", "PD-0325901", "SB590885", "selumetinib", "CCT007093", "EHT 1864", "Cetuximab", "PF-4708671", "JNJ-26854165", "HG-5-113-01", "HG-5-88-01", "TW 37", "XMD11-85h", "ZG-10", "XMD8-92", "QL-VIII-58", "CCT018159", "AG-014699", "SB 505124", "Tamoxifen", "QL-XII-61", "PFI-1", "IOX2", "YK 4-279", "(5Z)-7-Oxozeaenol", "piperlongumine", "FK866", "Talazoparib", "rTRAIL", "UNC1215", "SGC0946", "XAV939", "Trametinib", "Dabrafenib", "Temozolomide", "Bleomycin (50 uM)", "SN-38", "MLN4924")
684
    if(!drug %in% possibleDrugs2016) stop(paste("ERROR: the drug specified was not found. Note drug names are case sensitive. Please select from: ", (paste(possibleDrugs2016, collapse=", "))));
685
686
    # load("/home/pgeeleher/Dropbox/HDAC_project_Scripts/r_package_files/pRRophetic/data/PANCANCER_IC_Tue_Aug_9_15_28_57_2016.RData") #
687
    data("PANCANCER_IC_Tue_Aug_9_15_28_57_2016") # contains "drugData2016" the 2016 drug IC50 data, downloaded from (http://www.cancerrxgene.org/translation/drug/download#ic50)
688
689
    # load("/home/pgeeleher/Dropbox/HDAC_project_Scripts/r_package_files/pRRophetic/data/cgp2016ExprRma.RData") #
690
    data("cgp2016ExprRma") # contains "cgp2016ExprRma" the 2016 gene expression data. Data was obtained from "http://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Data/preprocessed/Cell_line_RMA_proc_basalExp.txt.zip". Cosmic Ids (in the column names) were mapped to cell line names using data from this file: ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-6.0/Cell_Lines_Details.xlsx
691
692
    # get the IC50s and tissue types for cell lines screened with this drug.
693
    ic50s <-  drugData2016[, "IC50"][drugData2016[, "Drug.name"] == drug]
694
    names(ic50s) <- drugData2016[ ,"Cell.line.name"][drugData2016[, "Drug.name"] == drug]
695
    tissue <- drugData2016[, "Tissue"][drugData2016[, "Drug.name"] == drug]
696
    names(tissue) <- drugData2016[ ,"Cell.line.name"][drugData2016[, "Drug.name"] == drug]
697
698
    # if a tissue type has been specified, use only tissues of that type.
699
    if(tissueType != "all")
700
    {
701
      if(tissueType == "allSolidTumors")
702
      {
703
        tissueType <- ic50s <- ic50s[!(tissue %in% "blood")]
704
      }
705
      else
706
      {
707
        ic50s <- ic50s[tissue %in% tissueType]
708
      }
709
    }
710
711
    # get the ordered subsetted expression and IC50 data
712
    commonCellLines <- names(ic50s)[names(ic50s) %in% colnames(cgp2016ExprRma)]
713
    ic50sOrd <- ic50s[commonCellLines]
714
    trainDataOrd <- cgp2016ExprRma[, commonCellLines]
715
716
    return(list(ic50sOrd=ic50sOrd, trainDataOrd=trainDataOrd))
717
  }
718
}
719
720
#' Predict from the CGP data using a logistic model
721
#'
722
#' Predict from the CGP data using a logistic model.
723
#'
724
#' @param testMatrix a gene expression matrix with gene names as row ids and sample names as column ids.
725
#' @param drug the name of the drug for which you would like to predict sensitivity, one of A.443654, A.770041, ABT.263, ABT.888, AG.014699, AICAR, AKT.inhibitor.VIII, AMG.706, AP.24534, AS601245, ATRA, AUY922, Axitinib, AZ628, AZD.0530, AZD.2281, AZD6244, AZD6482, AZD7762, AZD8055, BAY.61.3606, Bexarotene, BI.2536, BIBW2992, Bicalutamide, BI.D1870, BIRB.0796, Bleomycin, BMS.509744, BMS.536924, BMS.708163, BMS.754807, Bortezomib, Bosutinib, Bryostatin.1, BX.795, Camptothecin, CCT007093, CCT018159, CEP.701, CGP.082996, CGP.60474, CHIR.99021, CI.1040, Cisplatin, CMK, Cyclopamine, Cytarabine, Dasatinib, DMOG, Docetaxel, Doxorubicin, EHT.1864, Elesclomol, Embelin, Epothilone.B, Erlotinib, Etoposide, FH535, FTI.277, GDC.0449, GDC0941, Gefitinib, Gemcitabine, GNF.2, GSK269962A, GSK.650394, GW.441756, GW843682X, Imatinib, IPA.3, JNJ.26854165, JNK.9L, JNK.Inhibitor.VIII, JW.7.52.1, KIN001.135, KU.55933, Lapatinib, Lenalidomide, LFM.A13, Metformin, Methotrexate, MG.132, Midostaurin, Mitomycin.C, MK.2206, MS.275, Nilotinib, NSC.87877, NU.7441, Nutlin.3a, NVP.BEZ235, NVP.TAE684, Obatoclax.Mesylate, OSI.906, PAC.1, Paclitaxel, Parthenolide, Pazopanib, PD.0325901, PD.0332991, PD.173074, PF.02341066, PF.4708671, PF.562271, PHA.665752, PLX4720, Pyrimethamine, QS11, Rapamycin, RDEA119, RO.3306, Roscovitine, Salubrinal, SB.216763, SB590885, Shikonin, SL.0101.1, Sorafenib, S.Trityl.L.cysteine, Sunitinib, Temsirolimus, Thapsigargin, Tipifarnib, TW.37, Vinblastine, Vinorelbine, Vorinostat, VX.680, VX.702, WH.4.023, WO2009093972, WZ.1.84, X17.AAG, X681640, XMD8.85, Z.LLNle.CHO, ZM.447439.
726
#' @param tissueType specify if you would like to traing the models on only a subset of the CGP cell lines (based on the tissue type from which the cell lines originated). This be one any of "all" (for everything, default option), "allSolidTumors" (everything except for blood), "blood", "breast", "CNS", "GI tract" ,"lung", "skin", "upper aerodigestive"
727
#' @param batchCorrect The type of batch correction to be used. Options are "eb", "none", .....
728
#' @param selection How should duplicate gene ids be handled. Default is -1 which asks the user. 1 to summarize by their or 2 to disguard all duplicates.
729
#' @param printOutput Set to FALSE to supress output
730
#' @param numGenesSelected Specifies how genes are selected for "variableSelectionMethod". Options are "tTests", "pearson" and "spearman".
731
#' @param numSens The number of sensitive cell lines to be fit in the logistic regression model.
732
#' @param numRes The number of resistant cell lines fit in the logistic regression model.
733
#' @param minNumSamples The minimum number of test samples, print an error if the number of columns of "testExprData" is below this threshold. A large number of test samples may be necessary to correct for batch effects.
734
#' @return A predicted probability of sensitive or resistant from the logistic regression model.
735
#' @keywords internal
736
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
737
pRRopheticLogisticPredict <- function(testMatrix, drug, tissueType="all", batchCorrect="eb", minNumSamples=10, selection=-1, printOutput=TRUE, numGenesSelected=1000, numSens=15, numRes=55)
738
{
739
  cgpTrainData <- getCGPinfo(drug, tissueType) # get the IC50 and expression data for this drug/tissueType
740
741
  predictedPtype <- classifySamples(cgpTrainData$trainDataOrd, cgpTrainData$ic50sOrd, testMatrix, batchCorrect=batchCorrect,minNumSamples=minNumSamples, selection=selection, printOutput=printOutput, numGenesSelected=numGenesSelected, numSens=numSens, numRes=numRes)
742
743
  return(predictedPtype[,1])
744
}
745
746
747
#' Check the distribution of the drug response (IC50) data using a QQ-plot.
748
#'
749
#' Visualize the distribution of the transformed IC50 data for a drug of interest using a QQ plot. If the distribution of the IC50 values deviates wildly from a normal distribtion, it is likely not suitalbe for prediction using a linear model (like linear ridge regression). This drug may be more suitable to constructing a model using a logistic or other type of model.
750
#
751
#' @param drug the name of the drug for which you would like to predict sensitivity, one of A.443654, A.770041, ABT.263, ABT.888, AG.014699, AICAR, AKT.inhibitor.VIII, AMG.706, AP.24534, AS601245, ATRA, AUY922, Axitinib, AZ628, AZD.0530, AZD.2281, AZD6244, AZD6482, AZD7762, AZD8055, BAY.61.3606, Bexarotene, BI.2536, BIBW2992, Bicalutamide, BI.D1870, BIRB.0796, Bleomycin, BMS.509744, BMS.536924, BMS.708163, BMS.754807, Bortezomib, Bosutinib, Bryostatin.1, BX.795, Camptothecin, CCT007093, CCT018159, CEP.701, CGP.082996, CGP.60474, CHIR.99021, CI.1040, Cisplatin, CMK, Cyclopamine, Cytarabine, Dasatinib, DMOG, Docetaxel, Doxorubicin, EHT.1864, Elesclomol, Embelin, Epothilone.B, Erlotinib, Etoposide, FH535, FTI.277, GDC.0449, GDC0941, Gefitinib, Gemcitabine, GNF.2, GSK269962A, GSK.650394, GW.441756, GW843682X, Imatinib, IPA.3, JNJ.26854165, JNK.9L, JNK.Inhibitor.VIII, JW.7.52.1, KIN001.135, KU.55933, Lapatinib, Lenalidomide, LFM.A13, Metformin, Methotrexate, MG.132, Midostaurin, Mitomycin.C, MK.2206, MS.275, Nilotinib, NSC.87877, NU.7441, Nutlin.3a, NVP.BEZ235, NVP.TAE684, Obatoclax.Mesylate, OSI.906, PAC.1, Paclitaxel, Parthenolide, Pazopanib, PD.0325901, PD.0332991, PD.173074, PF.02341066, PF.4708671, PF.562271, PHA.665752, PLX4720, Pyrimethamine, QS11, Rapamycin, RDEA119, RO.3306, Roscovitine, Salubrinal, SB.216763, SB590885, Shikonin, SL.0101.1, Sorafenib, S.Trityl.L.cysteine, Sunitinib, Temsirolimus, Thapsigargin, Tipifarnib, TW.37, Vinblastine, Vinorelbine, Vorinostat, VX.680, VX.702, WH.4.023, WO2009093972, WZ.1.84, X17.AAG, X681640, XMD8.85, Z.LLNle.CHO, ZM.447439
752
#' @importFrom car powerTransform
753
#' @keywords internal
754
#' @return pRRopheticQQplot
755
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
756
pRRopheticQQplot <- function(drug)
757
{
758
  possibleDrugs <- c("A.443654", "A.770041", "ABT.263", "ABT.888", "AG.014699", "AICAR", "AKT.inhibitor.VIII", "AMG.706", "AP.24534", "AS601245", "ATRA", "AUY922", "Axitinib", "AZ628", "AZD.0530", "AZD.2281", "AZD6244", "AZD6482", "AZD7762", "AZD8055", "BAY.61.3606", "Bexarotene", "BI.2536", "BIBW2992", "Bicalutamide", "BI.D1870", "BIRB.0796", "Bleomycin", "BMS.509744", "BMS.536924", "BMS.708163", "BMS.754807", "Bortezomib", "Bosutinib", "Bryostatin.1", "BX.795", "Camptothecin", "CCT007093", "CCT018159", "CEP.701", "CGP.082996", "CGP.60474", "CHIR.99021", "CI.1040", "Cisplatin", "CMK", "Cyclopamine", "Cytarabine", "Dasatinib", "DMOG", "Docetaxel", "Doxorubicin", "EHT.1864", "Elesclomol", "Embelin", "Epothilone.B", "Erlotinib", "Etoposide", "FH535", "FTI.277", "GDC.0449", "GDC0941", "Gefitinib", "Gemcitabine", "GNF.2", "GSK269962A", "GSK.650394", "GW.441756", "GW843682X", "Imatinib", "IPA.3", "JNJ.26854165", "JNK.9L", "JNK.Inhibitor.VIII", "JW.7.52.1", "KIN001.135", "KU.55933", "Lapatinib", "Lenalidomide", "LFM.A13", "Metformin", "Methotrexate", "MG.132", "Midostaurin", "Mitomycin.C", "MK.2206", "MS.275", "Nilotinib", "NSC.87877", "NU.7441", "Nutlin.3a", "NVP.BEZ235", "NVP.TAE684", "Obatoclax.Mesylate", "OSI.906", "PAC.1", "Paclitaxel", "Parthenolide", "Pazopanib", "PD.0325901", "PD.0332991", "PD.173074", "PF.02341066", "PF.4708671", "PF.562271", "PHA.665752", "PLX4720", "Pyrimethamine", "QS11", "Rapamycin", "RDEA119", "RO.3306", "Roscovitine", "Salubrinal", "SB.216763", "SB590885", "Shikonin", "SL.0101.1", "Sorafenib", "S.Trityl.L.cysteine", "Sunitinib", "Temsirolimus", "Thapsigargin", "Tipifarnib", "TW.37", "Vinblastine", "Vinorelbine", "Vorinostat", "VX.680", "VX.702", "WH.4.023", "WO2009093972", "WZ.1.84", "X17.AAG", "X681640", "XMD8.85", "Z.LLNle.CHO", "ZM.447439")
759
  if(!drug %in% possibleDrugs) stop(paste("ERROR: the drug specified was not found. Note drug names are case sensitive. Please select from: ", (paste(possibleDrugs, collapse=", "))));
760
761
  # load("/home/pgeeleher/postdoc_stuff/HDAC_project/Scripts/r_package_files/data/drugAndPhenoCgp.RData") # contains drugToCellLineDataCgp (to map cell lines to .CEL file names), gdsc_brainarray_syms (the gene expression data), drugSensitivityDataCgp (the drug sensitivity data)
762
  data("drugAndPhenoCgp") # contains drugToCellLineDataCgp (to map cell lines to .CEL file names), gdsc_brainarray_syms (the gene expression data), drugSensitivityDataCgp (the drug sensitivity data)
763
764
  colIc50Name <- paste(drug, "_IC_50", sep="")
765
  ic50s <- as.numeric(drugSensitivityDataCgp[, colIc50Name])
766
  names(ic50s) <- drugSensitivityDataCgp[ ,"Cell.Line"]
767
  whichNas <- which(is.na(ic50s))
768
  ic50s <- ic50s[-whichNas]
769
770
  offset = 0
771
  if(min(ic50s) < 0) # all numbers must be postive for a powerTranform to work, so make them positive.
772
  {
773
    offset <- -min(ic50s) + 1
774
    ic50s <- ic50s + offset
775
  }
776
777
  transForm <- powerTransform(ic50s)[[6]]
778
  ic50s <- ic50s^transForm
779
780
  qqnorm(ic50s, main=paste("QQplot on power-transformed IC50 values for", drug))
781
  qqline(ic50s, col="red")
782
}
783
784
785
#' Calculate PPV and NPV and a cutpoint from the training data.
786
#'
787
#' Calculate PPV and NPV and a cutpoint from the training data.
788
#'
789
#' @param predResponders a numeric vector of the predictions that were obtained for the known drug responders
790
#' @param predNonResponders a numeric vector of the predictions for the known drug non-responders
791
#' @param drug the name of the drug for which you would like to predict sensitivity, one of A.443654, A.770041, ABT.263, ABT.888, AG.014699, AICAR, AKT.inhibitor.VIII, AMG.706, AP.24534, AS601245, ATRA, AUY922, Axitinib, AZ628, AZD.0530, AZD.2281, AZD6244, AZD6482, AZD7762, AZD8055, BAY.61.3606, Bexarotene, BI.2536, BIBW2992, Bicalutamide, BI.D1870, BIRB.0796, Bleomycin, BMS.509744, BMS.536924, BMS.708163, BMS.754807, Bortezomib, Bosutinib, Bryostatin.1, BX.795, Camptothecin, CCT007093, CCT018159, CEP.701, CGP.082996, CGP.60474, CHIR.99021, CI.1040, Cisplatin, CMK, Cyclopamine, Cytarabine, Dasatinib, DMOG, Docetaxel, Doxorubicin, EHT.1864, Elesclomol, Embelin, Epothilone.B, Erlotinib, Etoposide, FH535, FTI.277, GDC.0449, GDC0941, Gefitinib, Gemcitabine, GNF.2, GSK269962A, GSK.650394, GW.441756, GW843682X, Imatinib, IPA.3, JNJ.26854165, JNK.9L, JNK.Inhibitor.VIII, JW.7.52.1, KIN001.135, KU.55933, Lapatinib, Lenalidomide, LFM.A13, Metformin, Methotrexate, MG.132, Midostaurin, Mitomycin.C, MK.2206, MS.275, Nilotinib, NSC.87877, NU.7441, Nutlin.3a, NVP.BEZ235, NVP.TAE684, Obatoclax.Mesylate, OSI.906, PAC.1, Paclitaxel, Parthenolide, Pazopanib, PD.0325901, PD.0332991, PD.173074, PF.02341066, PF.4708671, PF.562271, PHA.665752, PLX4720, Pyrimethamine, QS11, Rapamycin, RDEA119, RO.3306, Roscovitine, Salubrinal, SB.216763, SB590885, Shikonin, SL.0101.1, Sorafenib, S.Trityl.L.cysteine, Sunitinib, Temsirolimus, Thapsigargin, Tipifarnib, TW.37, Vinblastine, Vinorelbine, Vorinostat, VX.680, VX.702, WH.4.023, WO2009093972, WZ.1.84, X17.AAG, X681640, XMD8.85, Z.LLNle.CHO, ZM.447439.
792
#' @param tissueType specify if you would like to traing the models on only a subset of the CGP cell lines (based on the tissue type from which the cell lines originated). This be one any of "all" (for everything, default option), "allSolidTumors" (everything except for blood), "blood", "breast", "CNS", "GI tract" ,"lung", "skin", "upper aerodigestive"#'
793
#' @return a list with two entries, trainDataOrd the ordered expression data and ic50sOrd the drug sensitivity data.
794
#' @keywords internal
795
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
796
getPPV <- function(predResponders, predNonResponders, drug, tissueType="all")
797
{
798
  possibleDrugs <- c("A.443654", "A.770041", "ABT.263", "ABT.888", "AG.014699", "AICAR", "AKT.inhibitor.VIII", "AMG.706", "AP.24534", "AS601245", "ATRA", "AUY922", "Axitinib", "AZ628", "AZD.0530", "AZD.2281", "AZD6244", "AZD6482", "AZD7762", "AZD8055", "BAY.61.3606", "Bexarotene", "BI.2536", "BIBW2992", "Bicalutamide", "BI.D1870", "BIRB.0796", "Bleomycin", "BMS.509744", "BMS.536924", "BMS.708163", "BMS.754807", "Bortezomib", "Bosutinib", "Bryostatin.1", "BX.795", "Camptothecin", "CCT007093", "CCT018159", "CEP.701", "CGP.082996", "CGP.60474", "CHIR.99021", "CI.1040", "Cisplatin", "CMK", "Cyclopamine", "Cytarabine", "Dasatinib", "DMOG", "Docetaxel", "Doxorubicin", "EHT.1864", "Elesclomol", "Embelin", "Epothilone.B", "Erlotinib", "Etoposide", "FH535", "FTI.277", "GDC.0449", "GDC0941", "Gefitinib", "Gemcitabine", "GNF.2", "GSK269962A", "GSK.650394", "GW.441756", "GW843682X", "Imatinib", "IPA.3", "JNJ.26854165", "JNK.9L", "JNK.Inhibitor.VIII", "JW.7.52.1", "KIN001.135", "KU.55933", "Lapatinib", "Lenalidomide", "LFM.A13", "Metformin", "Methotrexate", "MG.132", "Midostaurin", "Mitomycin.C", "MK.2206", "MS.275", "Nilotinib", "NSC.87877", "NU.7441", "Nutlin.3a", "NVP.BEZ235", "NVP.TAE684", "Obatoclax.Mesylate", "OSI.906", "PAC.1", "Paclitaxel", "Parthenolide", "Pazopanib", "PD.0325901", "PD.0332991", "PD.173074", "PF.02341066", "PF.4708671", "PF.562271", "PHA.665752", "PLX4720", "Pyrimethamine", "QS11", "Rapamycin", "RDEA119", "RO.3306", "Roscovitine", "Salubrinal", "SB.216763", "SB590885", "Shikonin", "SL.0101.1", "Sorafenib", "S.Trityl.L.cysteine", "Sunitinib", "Temsirolimus", "Thapsigargin", "Tipifarnib", "TW.37", "Vinblastine", "Vinorelbine", "Vorinostat", "VX.680", "VX.702", "WH.4.023", "WO2009093972", "WZ.1.84", "X17.AAG", "X681640", "XMD8.85", "Z.LLNle.CHO", "ZM.447439")
799
  if(!drug %in% possibleDrugs) stop(paste("ERROR: the drug specified was not found. Note drug names are case sensitive. Please select from: ", (paste(possibleDrugs, collapse=", "))));
800
801
  # load("/home/pgeeleher/postdoc_stuff/HDAC_project/Scripts/r_package_files/data/drugAndPhenoCgp.RData") # contains drugToCellLineDataCgp (to map cell lines to .CEL file names), gdsc_brainarray_syms (the gene expression data), drugSensitivityDataCgp (the drug sensitivity data)
802
  cutpoint <- mean(getCGPinfo(drug, tissueType)$ic50sOrd)
803
804
  # positive predictive values is the number of true positive / the total number of positive calls.
805
  ppv <- sum(predResponders < cutpoint) / (sum(predResponders < cutpoint) + sum(predNonResponders < cutpoint))
806
807
  # negative predictive values is the number of true negatives / the total number of negative calls.
808
  npv <- sum(predNonResponders > cutpoint) / (sum(predResponders > cutpoint) + sum(predNonResponders > cutpoint))
809
810
  cat(paste("\nPPV: ", round(ppv, 2), "\nNPV: ", round(npv, 2), "\nCutpoint: ", round(cutpoint, 2), "\n\n"))
811
812
  return(list(ppv=ppv, npv=npv, cutpoint=cutpoint))
813
}
814
815
#' Take an expression matrix and if duplicate genes are measured, summarize them by their means
816
#'
817
#' This function accepts two expression matrices, with gene ids as rownames() and
818
#' sample ids as colnames(). It will find all duplicate genes and summarize their
819
#' expression by their mean.
820
#'
821
#' @param exprMat a gene expression matrix with gene names as row ids and sample names as column ids.
822
#' @return a gene expression matrix that does not contain duplicate gene ids
823
#' @keywords internal
824
#' @author Paul Geeleher, Nancy Cox, R. Stephanie Huang
825
summarizeGenesByMean <- function(exprMat)
826
{
827
  geneIds <- rownames(exprMat)
828
  t <- table(geneIds) # how many times is each gene name duplicated
829
  allNumDups <- unique(t)
830
  allNumDups <- allNumDups[-which(allNumDups == 1)]
831
832
  # create a *new* gene expression matrix with everything in the correct order....
833
  # start by just adding stuff that isn't duplicated
834
  exprMatUnique <- exprMat[which(geneIds %in% names(t[t == 1])), ]
835
  gnamesUnique <- geneIds[which(geneIds %in% names(t[t == 1]))]
836
837
  # add all the duplicated genes to the bottom of "exprMatUniqueHuman", summarizing as you go
838
  for(numDups in allNumDups)
839
  {
840
    geneList <- names(which(t == numDups))
841
842
    for(i in 1:length(geneList))
843
    {
844
      exprMatUnique <- rbind(exprMatUnique, colMeans(exprMat[which(geneIds == geneList[i]), ]))
845
      gnamesUnique <- c(gnamesUnique, geneList[i])
846
      # print(i)
847
    }
848
  }
849
850
  if(class(exprMatUnique) == "numeric")
851
  {
852
    exprMatUnique <- matrix(exprMatUnique, ncol=1)
853
  }
854
855
  rownames(exprMatUnique) <- gnamesUnique
856
  return(exprMatUnique)
857
}