|
a |
|
b/R/runPAM.R |
|
|
1 |
#' @name runPAM |
|
|
2 |
#' @title Run partition around medoids classifier |
|
|
3 |
#' @description Using partition around medoids (PAM) classifier to predict potential subtype label on external cohort and calculate in-group proportions (IGP) statistics. |
|
|
4 |
#' @param train.expr A matrix of normalized expression training data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended. |
|
|
5 |
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms. |
|
|
6 |
#' @param test.expr A matrix of normalized expression testing data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended. |
|
|
7 |
#' @param gene.subset A string vector to indicate a subset of genes to be used. |
|
|
8 |
#' @return A list with the following components: |
|
|
9 |
#' |
|
|
10 |
#' \code{IGP} a named numeric vector storing the in-group proportion (see \link[clusterRepro]{IGP.clusterRepro}). |
|
|
11 |
#' |
|
|
12 |
#' \code{clust.res} similar to `clust.res` returned by `getMOIC()` or `get%algorithm_name%` or `getConsensusMOIC()`. |
|
|
13 |
#' |
|
|
14 |
#' \code{mo.method} a string value indicating the method used for prediction. |
|
|
15 |
#' @details This function first trains a partition around medoids (PAM) classifier in the discovery (training) cohort |
|
|
16 |
#' to predict the subtype for patients in the external validation (testing) cohort, |
|
|
17 |
#' and each sample in the validation cohort was assigned to a subtype label whose centroid had the highest Pearson correlation with the sample. |
|
|
18 |
#' Finally, the in-group proportion (IGP) statistic will be performed to evaluate the similarity and reproducibility of the acquired subtypes between discovery and validation cohorts. |
|
|
19 |
#' @export |
|
|
20 |
#' @importFrom pamr pamr.train |
|
|
21 |
#' @importFrom clusterRepro IGP.clusterRepro |
|
|
22 |
#' @importFrom grDevices colorRamp |
|
|
23 |
#' @references Tibshirani R, Hastie T, Narasimhan B and Chu G (2002). Diagnosis of multiple cancer types by shrunken centroids of gene expression. Proc Natl Acad Sci, 99,6567–6572. |
|
|
24 |
#' |
|
|
25 |
#' Kapp A V, Tibshirani R. (2007). Are clusters found in one dataset present in another dataset?. Biostatistics, 8(1):9-31. |
|
|
26 |
#' @examples # There is no example and please refer to vignette. |
|
|
27 |
runPAM <- function(train.expr = NULL, |
|
|
28 |
moic.res = NULL, |
|
|
29 |
test.expr = NULL, |
|
|
30 |
gene.subset = NULL) { |
|
|
31 |
|
|
|
32 |
# check sample |
|
|
33 |
comsam <- intersect(moic.res$clust.res$samID, colnames(train.expr)) |
|
|
34 |
if(length(comsam) == nrow(moic.res$clust.res)) { |
|
|
35 |
message("--all samples matched.") |
|
|
36 |
} else { |
|
|
37 |
message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes.")) |
|
|
38 |
} |
|
|
39 |
|
|
|
40 |
moic.res$clust.res <- moic.res$clust.res[comsam, , drop = FALSE] |
|
|
41 |
train.expr <- train.expr[,comsam] |
|
|
42 |
|
|
|
43 |
# check if using subset of genes |
|
|
44 |
if(is.null(gene.subset)) { |
|
|
45 |
comgene <- intersect(rownames(train.expr), rownames(test.expr)) |
|
|
46 |
message(paste0("--a total of ",length(comgene)," genes shared and used.")) |
|
|
47 |
|
|
|
48 |
} else { |
|
|
49 |
comgene <- intersect(intersect(rownames(train.expr), rownames(test.expr)), gene.subset) |
|
|
50 |
message(paste0("--a total of ",length(comgene)," genes shared in the gene subset and used.")) |
|
|
51 |
} |
|
|
52 |
|
|
|
53 |
train.expr <- train.expr[comgene,] |
|
|
54 |
test.expr <- test.expr[comgene,] |
|
|
55 |
train.subt <- paste0("CS",moic.res$clust.res$clust) |
|
|
56 |
|
|
|
57 |
# check data |
|
|
58 |
if(max(train.expr) < 25 | (max(train.expr) >= 25 & min(train.expr) < 0)) { |
|
|
59 |
message("--training expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.") |
|
|
60 |
train.expr <- train.expr |
|
|
61 |
} |
|
|
62 |
if(max(train.expr) >= 25 & min(train.expr) >= 0){ |
|
|
63 |
message("--log2 transformation done for training expression data.") |
|
|
64 |
train.expr <- log2(train.expr + 1) |
|
|
65 |
} |
|
|
66 |
train.expr <- as.data.frame(t(scale(t(train.expr)))) |
|
|
67 |
|
|
|
68 |
if(max(test.expr) < 25 | (max(test.expr) >= 25 & min(test.expr) < 0)) { |
|
|
69 |
message("--testing expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.") |
|
|
70 |
test.expr <- test.expr |
|
|
71 |
} |
|
|
72 |
if(max(test.expr) >= 25 & min(test.expr) >= 0){ |
|
|
73 |
message("--log2 transformation done for testing expression data.") |
|
|
74 |
test.expr <- log2(test.expr + 1) |
|
|
75 |
} |
|
|
76 |
test.expr <- as.data.frame(t(scale(t(test.expr)))) |
|
|
77 |
|
|
|
78 |
# train a partition around medoids (PAM) classifier in training dataset |
|
|
79 |
mylist <- list(x = as.matrix(train.expr), # x should be the expression matrix |
|
|
80 |
y = as.vector(train.subt)) # y should be a vector of classification |
|
|
81 |
pamr.classifier <- quiet(pamr.train(mylist)) |
|
|
82 |
|
|
|
83 |
# classify samples using centroids and calculates the in-group proportions |
|
|
84 |
# pamr.pred.train <- IGP.clusterRepro(Centroids = pamr.classifier$centroids, |
|
|
85 |
# Data = as.matrix(train.expr)) |
|
|
86 |
pamr.pred.test <- IGP.clusterRepro(Centroids = pamr.classifier$centroids, |
|
|
87 |
Data = as.matrix(test.expr)) |
|
|
88 |
|
|
|
89 |
IGP <- pamr.pred.test$IGP; names(IGP) <- paste0("CS", 1:length(IGP)) |
|
|
90 |
|
|
|
91 |
ex.moic.res <- data.frame(samID = names(pamr.pred.test$Class), |
|
|
92 |
clust = as.character(pamr.pred.test$Class), |
|
|
93 |
row.names = names(pamr.pred.test$Class), |
|
|
94 |
stringsAsFactors = FALSE) |
|
|
95 |
|
|
|
96 |
return(list(IGP = IGP, clust.res = ex.moic.res, mo.method = "PAM")) |
|
|
97 |
} |