Diff of /R/amaretto_functions.R [000000] .. [2ab972]

Switch to unified view

a b/R/amaretto_functions.R
1
#' AMARETTO_Initialize (version: reorder and filter MA_Matrix)
2
#'
3
#' Code used to initialize the seed clusters for an AMARETTO run.
4
#' Requires processed gene expressiosn (rna-seq or microarray), CNV (usually from a GISTIC run), and methylation (from MethylMix, provided in this package) data.
5
#' Uses the function CreateRegulatorData() and results are fed into the function AMARETTO_Run().
6
#'
7
#' @param ProcessedData List of Expression, CNV and MethylMix data matrices, with genes in rows and samples in columns.
8
#' @param Driver_list Custom list of driver genes to be considered in analysis
9
#' @param NrModules How many gene co-expression modules should AMARETTO search for? Usually around 100 is acceptable, given the large number of possible driver-passenger gene combinations.
10
#' @param VarPercentage Minimum percentage by variance for filtering of genes; for example, 75\% would indicate that the CreateRegulatorData() function only analyses genes that have a variance above the 75th percentile across all samples.
11
#' @param PvalueThreshold Threshold used to find relevant driver genes with CNV alterations: maximal p-value.
12
#' @param RsquareThreshold Threshold used to find relevant driver genes with CNV alterations: minimal R-square value between CNV and gene expression data.
13
#' @param pmax 'pmax' variable for glmnet function from glmnet package; the maximum number of variables aver to be nonzero. Should not be changed by user unless she/he fully understands the AMARETTO algorithm and how its parameters choices affect model output.
14
#' @param OneRunStop OneRunStop
15
#' @param NrCores A numeric variable indicating the number of computer/server cores to use for paralellelization. Default is 1, i.e. no parallelization. Please check your computer or server's computing capacities before increasing this number.  Parallelization is done via the RParallel package. Mac vs. Windows environments may behave differently when using parallelization.
16
#' @param random_seeds A numeric vector of length 2, containing two seed numbers for randomization : 1st for kmeans and 2nd for glmnet
17
#' @param method Perform union or intersection of the driver genes evaluated from the input data matrices and custom driver gene list provided.
18
#' @param convergence_cutoff A numeric value (E.g. 0.01) representing the fraction of the total number of genes, in which, The algorithm is considered reaching convergence and will stop, if Nr of Gene-replacements in an iteration falls below this threshold * total number of genes.  
19
#'
20
#' @return result
21
#' @rawNamespace import(callr, except = run)
22
#' @rawNamespace import(Rcpp, except = .DollarNames)
23
#' @rawNamespace import(utils, except = prompt)
24
#' @importFrom matrixStats rowVars rowMads
25
#' @importFrom stats aov coef cor density dist dnorm hclust kmeans lm na.omit p.adjust phyper prcomp qqline qqnorm qqplot rgamma var
26
#' @examples
27
#' data('ProcessedDataLIHC')
28
#' data('Driver_Genes')
29
#' AMARETTOinit <- AMARETTO_Initialize(ProcessedData = ProcessedDataLIHC,
30
#'                                     NrModules = 2, VarPercentage = 50)
31
#' \dontrun{
32
#' AMARETTOinit <- AMARETTO_Initialize(ProcessedData = ProcessedDataLIHC,
33
#'                                     Driver_list = Driver_Genes[['MSigDB']],
34
#'                                     NrModules = 2, VarPercentage = 50)
35
#'}
36
#' @export
37
AMARETTO_Initialize <- function(ProcessedData = ProcessedData, Driver_list = NULL, NrModules, VarPercentage, PvalueThreshold = 0.001, 
38
                                RsquareThreshold = 0.1, pmax = 10, NrCores = 1, OneRunStop = 0, method = "union", random_seeds = NULL, convergence_cutoff = 0.01) {
39
    
40
    MA_matrix <- ProcessedData[[1]]
41
    CNV_matrix <- ProcessedData[[2]]
42
    MET_matrix <- ProcessedData[[3]]
43
    
44
    if (is.null(MET_matrix) & is.null(CNV_matrix) & is.null(Driver_list)) {
45
        stop("Please select the correct input data types")
46
    }
47
    if (is.null(CNV_matrix) & is.null(Driver_list)) {
48
        if (ncol(MA_matrix) < 2 || ncol(MET_matrix) == 1) {
49
            stop("AMARETTO cannot be run with less than two samples.\n")
50
        }
51
    }
52
    if (is.null(MET_matrix) & is.null(Driver_list)) {
53
        if (ncol(MA_matrix) < 2 || ncol(CNV_matrix) == 1) {
54
            stop("AMARETTO cannot be run with less than two samples.\n")
55
        }
56
    }
57
    
58
    if (!is.null(MA_matrix) & !is.null(CNV_matrix) & !is.null(MET_matrix) & !is.null(Driver_list)) {
59
        if (ncol(MA_matrix) < 2 || ncol(CNV_matrix) == 1 || ncol(MET_matrix) == 1) {
60
            stop("AMARETTO cannot be run with less than two samples.\n")
61
        }
62
    }
63
    MA_matrix_Var = geneFiltering("MAD", MA_matrix, VarPercentage)
64
    MA_matrix_Var = t(scale(t(MA_matrix_Var)))
65
    if (NrModules >= nrow(MA_matrix_Var)) {
66
        stop(paste0("The number of modules is too large compared to the number of genes. Choose a number of modules smaller than ", nrow(MA_matrix_Var), ".\n"))
67
    }
68
    if(!is.null(random_seeds)){
69
      set.seed(random_seeds[1])
70
    }
71
    KmeansResults = kmeans(MA_matrix_Var, NrModules, iter.max = 100)
72
    Clusters = as.numeric(KmeansResults$cluster)
73
    names(Clusters) <- rownames(MA_matrix_Var)
74
    AutoRegulation = 2
75
    Lambda2 = 1e-04
76
    alpha = 1 - 1e-06
77
    Parameters <- list(AutoRegulation = AutoRegulation, OneRunStop = OneRunStop, Lambda2 = Lambda2,
78
                       Mode = "larsen", pmax = pmax, alpha = alpha, NrModules = NrModules, VarPercentage = VarPercentage,
79
                       random_seeds = random_seeds, convergence_cutoff = convergence_cutoff)
80
    RegulatorInfo = CreateRegulatorData(MA_matrix = MA_matrix_Var, CNV_matrix = CNV_matrix, MET_matrix = MET_matrix,
81
                                        Driver_list = Driver_list, PvalueThreshold = PvalueThreshold, RsquareThreshold = RsquareThreshold, method = method)
82
    if (length(RegulatorInfo) > 1) {
83
        RegulatorData = RegulatorInfo$RegulatorData
84
        Alterations = RegulatorInfo$Alterations
85
        return(list(MA_matrix_Var = MA_matrix_Var, RegulatorData = RegulatorData, RegulatorAlterations = Alterations,
86
                    ModuleMembership = Clusters, Parameters = Parameters, NrCores = NrCores))
87
    }
88
}
89
90
91
#' AMARETTO_Run
92
#' Function to run AMARETTO, a statistical algorithm to identify cancer drivers by integrating a variety of omics data from cancer and normal tissue.
93
#' @param AMARETTOinit List output from AMARETTO_Initialize().
94
#' @return result
95
#' @importFrom doParallel registerDoParallel
96
#' @importFrom glmnet cv.glmnet
97
#' @examples
98
#' data('ProcessedDataLIHC')
99
#' AMARETTOinit <- AMARETTO_Initialize(ProcessedData = ProcessedDataLIHC,
100
#'                                     NrModules = 2, VarPercentage = 50)
101
#' AMARETTOresults <- AMARETTO_Run(AMARETTOinit)
102
#' @export
103
AMARETTO_Run <- function(AMARETTOinit) {
104
    if (length(AMARETTOinit) == 0) {
105
        cat("For give cancer site no drivers were find during the initialization step of AMARETTO")
106
    } else {
107
        if (nrow(AMARETTOinit$RegulatorData) == 1) {
108
            cat("For give cancer site only one driver is detected. AMARETTO cannot be run with less than two drivers.\n")
109
        } else {
110
            cat("Running AMARETTO on", length(rownames(AMARETTOinit$MA_matrix_Var)), "genes and", length(colnames(AMARETTOinit$MA_matrix_Var)), "samples.\n")
111
            cat("\tStopping if less then", AMARETTOinit$Parameters$convergence_cutoff * length(rownames(AMARETTOinit$MA_matrix_Var)), "genes reassigned.\n")
112
            result = AMARETTO_LarsenBased(AMARETTOinit$MA_matrix_Var, AMARETTOinit$ModuleMembership, AMARETTOinit$RegulatorData,
113
                                          AMARETTOinit$Parameters, AMARETTOinit$NrCores)
114
            result$ModuleData = AMARETTO_CreateModuleData(AMARETTOinit, result)
115
            result$RegulatoryProgramData = AMARETTO_CreateRegulatorPrograms(AMARETTOinit, result)
116
            return(result)
117
        }
118
    }
119
}
120
121
#' AMARETTO_EvaluateTestSet
122
#'
123
#' Code to evaluate AMARETTO on a new gene expression test set. Uses output from AMARETTO_Run() and CreateRegulatorData().
124
#' @param AMARETTOresults AMARETTO output from AMARETTO_Run().
125
#' @param MA_Data_TestSet Gene expression matrix from a test set (that was not used in AMARETTO_Run()).
126
#' @param RegulatorData_TestSet Test regulator data from CreateRegulatorData().
127
#'
128
#' @return result
129
#' @export
130
#'
131
#' @examples
132
#' data('ProcessedDataLIHC')
133
#' AMARETTOinit <- AMARETTO_Initialize(ProcessedData = ProcessedDataLIHC,
134
#'                                     NrModules = 2, VarPercentage = 50)
135
#'
136
#' AMARETTOresults <- AMARETTO_Run(AMARETTOinit)
137
#' AMARETTOtestReport <- AMARETTO_EvaluateTestSet(AMARETTOresults = AMARETTOresults,
138
#'                                                MA_Data_TestSet = AMARETTOinit$MA_matrix_Var,
139
#'                                                RegulatorData_TestSet = AMARETTOinit$RegulatorData)
140
AMARETTO_EvaluateTestSet <- function(AMARETTOresults = AMARETTOresults, MA_Data_TestSet = MA_Data_TestSet, RegulatorData_TestSet = RegulatorData_TestSet) {
141
    nrSamples = ncol(MA_Data_TestSet)
142
    RegulatorNames = rownames(RegulatorData_TestSet)
143
    stats = mat.or.vec(AMARETTOresults$NrModules, 9)
144
    Rsquare = mat.or.vec(AMARETTOresults$NrModules,1)
145
    RsquareAjusted = mat.or.vec(AMARETTOresults$NrModules, 1)
146
    modules <- list()
147
    for (i in seq_len(AMARETTOresults$NrModules)) {
148
        currentRegulators = RegulatorNames[which(AMARETTOresults$RegulatoryPrograms[i,] != 0)]
149
        nrPresentRegulators = sum((rownames(RegulatorData_TestSet) %in% currentRegulators))
150
        currentPresentRegulators = (currentRegulators %in% rownames(RegulatorData_TestSet))
151
        stats[i, 1] = nrPresentRegulators
152
        stats[i, 2] = length(currentRegulators)
153
        currentClusterGenes = AMARETTOresults$AllGenes[which(AMARETTOresults$ModuleMembership[,1] == i)]
154
        nrPresentClusterGenes = sum((rownames(MA_Data_TestSet) %in% currentClusterGenes))
155
        stats[i, 3] = nrPresentClusterGenes
156
        stats[i, 4] = length(currentClusterGenes)
157
        stats[i, 5] = stats[i, 3]/stats[i, 4] * 100
158
        currentWeights = AMARETTOresults$RegulatoryPrograms[i,which(AMARETTOresults$RegulatoryPrograms[i,] != 0)]
159
        totalWeights = sum(abs(currentWeights))
160
        presentWeights = currentWeights[currentPresentRegulators]
161
        presentRegulators = currentRegulators[currentPresentRegulators]
162
        totalWeightsPercentage = sum(abs(presentWeights))/totalWeights * 100
163
        modules[[i]] = intersect(currentClusterGenes,rownames(MA_Data_TestSet))
164
        if (nrPresentRegulators > 0) {
165
            predictions = (t(RegulatorData_TestSet[presentRegulators, , drop = FALSE])) %*% (presentWeights)
166
            # need to make sure that the first argument remains a matrix.
167
            predictions = data.matrix(predictions)
168
            if (length(modules[[i]]) != 0) {
169
                if (length(currentClusterGenes) > 1) {
170
                  outcome = colMeans(MA_Data_TestSet[currentClusterGenes,])
171
                } else {
172
                  outcome = MA_Data_TestSet[currentClusterGenes,]
173
                }
174
                residuals = predictions - outcome
175
                SStot = sum((outcome - mean(outcome))^2)
176
                SSres = sum((predictions - outcome)^2)
177
                Rsquare[i] = 1 - (SSres/SStot)
178
                RsquareAjusted[i] = Rsquare[i] - (nrPresentRegulators/(nrSamples - 1 - nrPresentRegulators)) * (1 - Rsquare[i])
179
                MSE = (1/nrSamples) * sum((predictions - outcome)^2)
180
                stats[i, 6] = totalWeightsPercentage
181
                stats[i, 7] = Rsquare[i]
182
                stats[i, 8] = RsquareAjusted[i]
183
                stats[i, 9] = MSE
184
            } else {
185
                stats[i, 6] = totalWeightsPercentage
186
                stats[i, 7] = 0
187
                stats[i, 8] = 0
188
                stats[i, 9] = 0
189
            }
190
        } else {
191
            stats[i, 6] = 0
192
            stats[i, 7] = 0
193
            stats[i, 8] = 0
194
            stats[i, 9] = 0
195
        }
196
    }
197
    dimnames(stats) <- list(rownames(stats, do.NULL = FALSE, prefix = "Module_"),
198
                            c("nrPresReg", "nrTotalReg", "nrPresGen", "nrTotGen", "percPresGen", "percWeightPresent", "Rsquare", "RsquareAdjusted", "MSE"))
199
    return(stats)
200
}
201
202
203