a b/R/runPAM.R
1
#' @name runPAM
2
#' @title Run partition around medoids classifier
3
#' @description Using partition around medoids (PAM) classifier to predict potential subtype label on external cohort and calculate in-group proportions (IGP) statistics.
4
#' @param train.expr A matrix of normalized expression training data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.
5
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
6
#' @param test.expr A matrix of normalized expression testing data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.
7
#' @param gene.subset A string vector to indicate a subset of genes to be used.
8
#' @return A list with the following components:
9
#'
10
#'         \code{IGP}        a named numeric vector storing the in-group proportion (see \link[clusterRepro]{IGP.clusterRepro}).
11
#'
12
#'         \code{clust.res}  similar to `clust.res` returned by `getMOIC()` or `get%algorithm_name%` or `getConsensusMOIC()`.
13
#'
14
#'         \code{mo.method}  a string value indicating the method used for prediction.
15
#' @details This function first trains a partition around medoids (PAM) classifier in the discovery (training) cohort
16
#'  to predict the subtype for patients in the external validation (testing) cohort,
17
#'  and each sample in the validation cohort was assigned to a subtype label whose centroid had the highest Pearson correlation with the sample.
18
#'  Finally, the in-group proportion (IGP) statistic will be performed to evaluate the similarity and reproducibility of the acquired subtypes between discovery and validation cohorts.
19
#' @export
20
#' @importFrom pamr pamr.train
21
#' @importFrom clusterRepro IGP.clusterRepro
22
#' @importFrom grDevices colorRamp
23
#' @references Tibshirani R, Hastie T, Narasimhan B and Chu G (2002). Diagnosis of multiple cancer types by shrunken centroids of gene expression. Proc Natl Acad Sci, 99,6567–6572.
24
#'
25
#' Kapp A V, Tibshirani R. (2007). Are clusters found in one dataset present in another dataset?. Biostatistics, 8(1):9-31.
26
#' @examples # There is no example and please refer to vignette.
27
runPAM <- function(train.expr  = NULL,
28
                   moic.res    = NULL,
29
                   test.expr   = NULL,
30
                   gene.subset = NULL) {
31
32
  # check sample
33
  comsam <- intersect(moic.res$clust.res$samID, colnames(train.expr))
34
  if(length(comsam) == nrow(moic.res$clust.res)) {
35
    message("--all samples matched.")
36
  } else {
37
    message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
38
  }
39
40
  moic.res$clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
41
  train.expr <- train.expr[,comsam]
42
43
  # check if using subset of genes
44
  if(is.null(gene.subset)) {
45
    comgene <- intersect(rownames(train.expr), rownames(test.expr))
46
    message(paste0("--a total of ",length(comgene)," genes shared and used."))
47
48
  } else {
49
    comgene <- intersect(intersect(rownames(train.expr), rownames(test.expr)), gene.subset)
50
    message(paste0("--a total of ",length(comgene)," genes shared in the gene subset and used."))
51
  }
52
53
  train.expr <- train.expr[comgene,]
54
  test.expr <- test.expr[comgene,]
55
  train.subt <- paste0("CS",moic.res$clust.res$clust)
56
57
  # check data
58
  if(max(train.expr) < 25 | (max(train.expr) >= 25 & min(train.expr) < 0)) {
59
    message("--training expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.")
60
    train.expr <- train.expr
61
  }
62
  if(max(train.expr) >= 25 & min(train.expr) >= 0){
63
    message("--log2 transformation done for training expression data.")
64
    train.expr <- log2(train.expr + 1)
65
  }
66
  train.expr <- as.data.frame(t(scale(t(train.expr))))
67
68
  if(max(test.expr) < 25 | (max(test.expr) >= 25 & min(test.expr) < 0)) {
69
    message("--testing expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.")
70
    test.expr <- test.expr
71
  }
72
  if(max(test.expr) >= 25 & min(test.expr) >= 0){
73
    message("--log2 transformation done for testing expression data.")
74
    test.expr <- log2(test.expr + 1)
75
  }
76
  test.expr <- as.data.frame(t(scale(t(test.expr))))
77
78
  # train a partition around medoids (PAM) classifier in training dataset
79
  mylist <- list(x = as.matrix(train.expr), # x should be the expression matrix
80
                 y = as.vector(train.subt)) # y should be a vector of classification
81
  pamr.classifier <- quiet(pamr.train(mylist))
82
83
  # classify samples using centroids and calculates the in-group proportions
84
  # pamr.pred.train <- IGP.clusterRepro(Centroids = pamr.classifier$centroids,
85
  #                                     Data      = as.matrix(train.expr))
86
  pamr.pred.test  <- IGP.clusterRepro(Centroids = pamr.classifier$centroids,
87
                                      Data      = as.matrix(test.expr))
88
89
  IGP <- pamr.pred.test$IGP; names(IGP) <- paste0("CS", 1:length(IGP))
90
91
  ex.moic.res <- data.frame(samID = names(pamr.pred.test$Class),
92
                            clust = as.character(pamr.pred.test$Class),
93
                            row.names = names(pamr.pred.test$Class),
94
                            stringsAsFactors = FALSE)
95
96
  return(list(IGP = IGP, clust.res = ex.moic.res, mo.method = "PAM"))
97
}