Diff of /R/getElites.R [000000] .. [494cbf]

Switch to unified view

a b/R/getElites.R
1
#' @name getElites
2
#' @title Get elites for clustering
3
#' @description This function provides several methods to help selecting elites from input features, which aims to reduce data dimension for multi-omics integrative clustering analysis.
4
#' @param dat A data.frame of one omics data, can be continuous or binary data.
5
#' @param surv.info A data.frame with rownames of observations and with at least two columns of `futime` for survival time and `fustat` for survival status (0: censoring; 1: event)
6
#' @param method A string value to indicate the filtering method for selecting elites. Allowed values contain c('mad', 'sd', 'pca', 'cox', 'freq'). 'mad' means median absolute deviation, 'sd' means standard deviation, 'pca' means principal components analysis, 'cox' means univariate Cox proportional hazards regression which needs surv.info also, 'freq' only works for binary data; "mad" by default.
7
#' @param na.action A string value to indicate the action for handling NA missing value. Allowed values contain c('rm', 'impute'). 'rm' means removal of all features containing any missing values, 'impute' means imputation for missing values by k-nearest neighbors; "rm" by default.
8
#' @param doLog2 A logic value to indicate if performing log2 transformation for data before calculating statistics (e.g., sd, mad , pca and cox). FALSE by default.
9
#' @param lowpct A numeric cutoff for removing low expression values. NULL by default; 0.1 is recommended for continuous data which means features that have no expression in more than 10\% samples will be removed. Otherwise default value of NULL should be kept for binary data.
10
#' @param p.cutoff A numeric cutoff for nominal p value derived from univariate Cox proportional hazards regression; 0.05 by default.
11
#' @param elite.pct A numeric cutoff of percentage for selecting elites. NOTE: epite.pct works for all methods except for 'cox', but two scenarios exist. 1) when using method of 'mad' or 'sd', features will be descending sorted by mad or sd, and top elites.pct \* feature size of elites (features) will be selected; 2) when using method of 'freq' for binary data, frequency for value of 1 will be calculated for each feature, and features that have value of 1 in greater than elites.pct \* sample size will be considered elites. This argument will be discarded if elite.num is provided simultaneously. Set this argument with 1 and leave elite.num NULL will return all the features as elites after dealing with NA values.
12
#' @param elite.num A integer cutoff of exact number for selecting elites. NOTE: elite.num works for all methods except for 'cox', but two scenarios exist. 1) when using method of 'mad' or 'sd', features will be descending sorted by mad or sd, and top elite.num of elites (features) will be selected; 2) when using method of 'freq' for binary data, frequency for value of 1 will be calculated for each feature, and features that have value of 1 in greater than elite.num of sample size will be considered elites.
13
#' @param pca.ratio A numeric value ranging from 0 to 1 which represents the ratio of principal components is selected; 0.9 by default.
14
#' @param scaleFlag A logic value to indicate if scaling the data after filtering. FALSE by default.
15
#' @param centerFlag A logic value to indicate if centering the data after filtering. FALSE by default.
16
#' @import survival
17
#' @importFrom impute impute.knn
18
#' @return A list containing the following components:
19
#'
20
#'         \code{elite.dat} a data.frame containing data for selected elites (features).
21
#'
22
#'         \code{pca.res} a data.frame containing results for principal components analysis if \code{method == 'pca'}
23
#'
24
#'         \code{unicox.res} a data.frame containing results for univariate Cox proportional hazards regression if \code{method == 'cox'}
25
#' @export
26
#' @examples # There is no example and please refer to vignette.
27
getElites <- function(dat        = NULL,
28
                      surv.info  = NULL,
29
                      method     = "mad",
30
                      na.action  = "rm",
31
                      doLog2     = FALSE,
32
                      lowpct     = NULL,
33
                      p.cutoff   = 0.05,
34
                      elite.pct  = NULL,
35
                      elite.num  = NULL,
36
                      pca.ratio  = 0.9,
37
                      scaleFlag  = FALSE,
38
                      centerFlag = FALSE) {
39
  # check argument
40
  if(!is.element(method, c("mad","sd","pca","cox","freq"))) {
41
    stop("unsupportive method to filter elite features, allowed values contain c('mad','sd','pca','cox','freq').")}
42
  if(!is.element(na.action, c("rm","impute"))) {
43
    stop("unsupportive method to handle NA missing value, allowed values contain c('rm','impute').")}
44
45
  # deal with missing value
46
  if(na.action == "rm") {
47
    df <- as.data.frame(na.omit(dat))
48
    if(nrow(df) != nrow(dat)) {message(paste0("--",(nrow(dat) - nrow(df)), " features with NA values are removed."))}
49
  } else if(na.action == "impute") {
50
    df <- as.data.frame(impute::impute.knn(as.matrix(dat))$data)
51
  } else {
52
    df <- as.data.frame(dat)
53
  }
54
55
  if(doLog2) {df <- as.data.frame(log2(df + 1))}
56
57
  # remove low expression
58
  if(!is.null(lowpct)) {
59
    n.raw <- nrow(df)
60
    PASSFlag <- rowSums(df == 0) < ceiling(lowpct * ncol(df))
61
    names(PASSFlag) <- rownames(df)
62
    df <- df[PASSFlag,]
63
    if(nrow(df) != n.raw) {
64
      message(paste0("--remove ",n.raw-nrow(df)," features with 0 values in more than ", lowpct*100, "% samples."))
65
    }
66
  }
67
68
  # select elite features
69
  if(method == "freq") {
70
    message("--method of 'freq' only supports binary omics data (e.g., somatic mutation matrix), and in this manner, elite.pct and elite.num are used to cut frequency.")
71
    if(sum(unique(as.vector(as.matrix(df)))) == 1) {
72
      #message("--it seems the input data correctely contains binary values of 0 and 1 only.")
73
      statistic <- rowSums(df); names(statistic) <- rownames(df)
74
    } else {
75
      stop("it seems the input data contains more values than 0 and 1, please check and choose another proper method!")
76
    }
77
  }
78
  if(method != "freq" & method != "cox" & sum(unique(as.vector(as.matrix(df)))) == 1) {
79
    stop("it seems the input data is a binary matrix with entries of 0 and 1 only, please check and choose 'freq' or 'cox' methods if appropriate!")
80
  }
81
82
  if(method == "mad") {
83
    statistic <- apply(df, 1, mad)
84
    names(statistic) <- rownames(df)
85
    statistic <- sort(statistic, decreasing = TRUE)
86
  }
87
88
  if(method == "sd") {
89
    statistic <- apply(df, 1, sd)
90
    names(statistic) <- rownames(df)
91
    statistic <- sort(statistic, decreasing = TRUE)
92
  }
93
94
  if(method == "cox" & !is.null(surv.info)) {
95
    if(all(is.element(c("futime", "fustat"), colnames(surv.info)))) {
96
97
      display.progress = function (index, totalN, breakN=20) {
98
        if ( index %% ceiling(totalN/breakN)  == 0  ) {
99
          cat(paste(round(index*100/totalN), "% ", sep = ""))
100
        }
101
      }
102
103
      comsam <- intersect(colnames(df), rownames(surv.info))
104
      df <- df[,comsam]; surv.info <- surv.info[comsam,,drop = FALSE]
105
      if(length(comsam) == ncol(df)) {
106
        message("--all sample matched between omics matrix and survival data.")
107
      } else {
108
        message(paste0("--removed ",(ncol(df) - length(comsam))," samples that mismatched between omics matrix and survival data."))
109
      }
110
111
      unicox <- data.frame()
112
      for(i in 1:nrow(df)){
113
114
        display.progress(index = i, totalN = nrow(df))
115
        gene <- rownames(df)[i]
116
        tmp <- data.frame(expr = as.numeric(df[i,]),
117
                          futime = surv.info$futime,
118
                          fustat = surv.info$fustat,
119
                          stringsAsFactors = FALSE)
120
        cox <- survival::coxph(survival::Surv(futime, fustat) ~ expr, data = tmp)
121
        coxSummary <- summary(cox)
122
        unicox <- rbind.data.frame(unicox,
123
                                   data.frame(gene             = gene,
124
                                              HR               = as.numeric(coxSummary$coefficients[,"exp(coef)"])[1],
125
                                              z                = as.numeric(coxSummary$coefficients[,"z"])[1],
126
                                              pvalue           = as.numeric(coxSummary$coefficients[,"Pr(>|z|)"])[1],
127
                                              lower            = as.numeric(coxSummary$conf.int[,3][1]),
128
                                              upper            = as.numeric(coxSummary$conf.int[,4][1]),
129
                                              stringsAsFactors = FALSE),
130
                                   stringsAsFactors            = FALSE)
131
      }
132
      statistic <- unicox$pvalue
133
      names(statistic) <- unicox$gene
134
    } else {stop("survival information must contain columns of 'futime' for survival time and 'fustat' for survival status.")}
135
  }
136
137
  if(method == "freq") {
138
    if(!is.null(elite.num) & !is.null(elite.pct)) {
139
      message("elite.num has been provided then discards elite.pct.")
140
      elite <- names(statistic[statistic > elite.num])
141
    }
142
    if(!is.null(elite.num) & is.null(elite.pct)) {
143
      message("missing elite.pct then use elite.num.")
144
      elite <- names(statistic[statistic > elite.num])
145
    }
146
    if(is.null(elite.num) & !is.null(elite.pct)) {
147
      message("missing elite.num then use elite.pct")
148
      elite <- names(statistic[statistic > elite.pct * ncol(df)])
149
    }
150
  }
151
152
  if(method == "cox") {
153
    elite <- names(statistic[statistic < p.cutoff])
154
  }
155
  if(method %in% c("sd", "mad")) {
156
    if(!is.null(elite.num) & !is.null(elite.pct)) {
157
      message("elite.num has been provided then discards elite.pct.")
158
      if(elite.num > length(statistic)) {
159
        stop(paste0("number of remaining features (n=", length(statistic), ") less than elite.num of ",elite.num,", please consider to use elite.pct."))
160
      } else {
161
        elite <- names(statistic)[1:elite.num]
162
      }
163
    }
164
    if(!is.null(elite.num) & is.null(elite.pct)) {
165
      message("missing elite.pct then use elite.num.")
166
      if(elite.num > length(statistic)) {
167
        stop(paste0("number of remaining features (n=", length(statistic), ") less than elite.num of ",elite.num,", please consider to use elite.pct."))
168
      } else {
169
        elite <- names(statistic)[1:elite.num]
170
      }
171
    }
172
    if(is.null(elite.num) & !is.null(elite.pct)) {
173
      message("missing elite.num then use elite.pct")
174
      elite <- names(statistic)[1:(elite.pct * nrow(df))]
175
    }
176
  }
177
178
  if(method == "pca") {
179
    message(paste0("--the ratio used to select principal component is set as ", pca.ratio))
180
    tdf <- t(df)
181
    pca.res <- prcomp(tdf, scale = T, center = T)
182
    vars <- apply(pca.res$x, 2, var)
183
    props <- vars / sum(vars)
184
    cprops <- as.vector(cumsum(props))
185
    outdat <- (tdf %*% pca.res$ro)[,1:which(cprops > pca.ratio)[1]]
186
    outdat <- as.data.frame(t(scale(outdat, center = centerFlag, scale = scaleFlag)))
187
  } else {
188
    outdat <- as.data.frame(t(scale(t(df[elite, , drop = FALSE]), center = centerFlag, scale = scaleFlag)))
189
  }
190
191
  if(method == "cox") {
192
    return(list(elite.dat = outdat, unicox.res = unicox))
193
  } else if(method == "pca") {
194
    return(list(elite.dat = outdat, pca.res = pca.res))
195
  } else {return(list(elite.dat = outdat))}
196
}