a b/R/select_rho_partial.R
1
#' @title Data preprocessing for partial correlation analysis
2
#' @description A method that integrates differential expression (DE) analysis
3
#'     and differential network (DN) analysis to select biomarker candidates for
4
#'     cancer studies. select_rho_partial is the pre-processing step for INDEED
5
#'     partial differential analysis.
6
#' @param data This is a p*n dataframe that contains the expression levels for all biomolecules and samples.
7
#' @param class_label This is a 1*n dataframe that contains the class label with 0 for group 1 and 1 for group 2.
8
#' @param id This is a p*1 dataframe that contains the ID for each biomolecule.
9
#' @param error_curve This is a boolean value indicating whether to plot the error curve (TRUE) or not (FALSE). 
10
#'     The default is TRUE.
11
#' @examples select_rho_partial(data = Met_GU, class_label = Met_Group_GU, id = Met_name_GU, 
12
#'     error_curve = TRUE)
13
#' @return A list of processed data for the next step, and an error curve to select optimal rho value
14
#'     for graphical lasso.
15
#' @import devtools
16
#' @importFrom glasso glasso
17
#' @importFrom stats qnorm cor quantile var sd glm 
18
#' @importFrom graphics abline title plot lines 
19
#' @export
20
21
select_rho_partial <- function(data = NULL, class_label = NULL, id = NULL, error_curve = TRUE){
22
    data_bind <- rbind(data, class_label)
23
    # Group 1: p*n1
24
    raw_group_1 <- data_bind[,data_bind[nrow(data_bind),] == 0][1:(nrow(data_bind) - 1),]
25
    # Group 2: p*n2
26
    raw_group_2 <- data_bind[,data_bind[nrow(data_bind),] == 1][1:(nrow(data_bind) - 1),]  
27
    p <- nrow(raw_group_2)
28
    n_group_1 <- ncol(raw_group_1)
29
    n_group_2 <- ncol(raw_group_2)
30
    
31
32
    # Z-transform the data for group-specific normalization
33
    data_group_1 <- scale(t(raw_group_1)) # Group 1: n1*p
34
    data_group_2 <- scale(t(raw_group_2)) # Group 2: n2*p
35
    cov_group_1 <- var(data_group_1)
36
    cov_group_2 <- var(data_group_2)
37
    
38
    ## Apply grapihcal LASSO given data_group_1 and data_group_2
39
    #  Group 1 first
40
    n_fold <- 5 # number of folds
41
    rho <- exp(seq(log(1), log(0.01), length.out = 20))
42
43
    # group 1
44
    # draw error curve
45
    error_group1 <- choose_rho(data_group_1, n_fold, rho)
46
    if(error_curve == TRUE){
47
        par(mfrow = c(1, 2))
48
        plot(rho, error_group1$log.cv, xlab = expression(rho), ylab = "Error")
49
        lines(rho, error_group1$log.cv)
50
        title(main = paste("Group 1 error curve", "using cross validation", sep = "\n"))
51
        # chosse optimal rho
52
        abline(v = rho[error_group1$log.cv == min(error_group1$log.cv)], col = "red", lty = 3)
53
        # one standard error rule
54
        abline(h = min(error_group1$log.cv) + 
55
                   error_group1$log.rho[error_group1$log.cv == min(error_group1$log.cv)], 
56
               col = "blue")
57
    }
58
    rho[error_group1$log.cv == min(error_group1$log.cv)] # rho based on minimum rule
59
60
    # rhos that are under blue line
61
    rho_under_blue <- rho[error_group1$log.cv < 
62
                              min(error_group1$log.cv) + 
63
                              error_group1$log.rho[error_group1$log.cv == 
64
                                                       min(error_group1$log.cv)]]
65
66
    # rhos that are on the right of the red line
67
    rho_right_red <- rho[rho > rho[error_group1$log.cv == min(error_group1$log.cv)]]
68
69
    #rhos that are under blue line and to the right of the red line
70
    rho_one_std_group1 <- max(intersect(rho_under_blue, rho_right_red ))
71
    rho_min_rule_group1 <- rho[error_group1$log.cv == min(error_group1$log.cv)]
72
73
    # group 2
74
    # draw error curve
75
    error_group2 <- choose_rho(data_group_2, n_fold, rho)
76
    if(error_curve == TRUE){
77
        plot(rho, error_group2$log.cv, xlab = expression(rho), ylab = "Error")
78
        lines(rho, error_group2$log.cv)
79
        title(main = paste("Group 2 error curve", "using cross validation", sep="\n"))
80
        # choOse optimal rho
81
        abline(v = rho[error_group2$log.cv == min(error_group2$log.cv)], col = "red", lty = 3)
82
        # one standard error rule
83
        abline(h = min(error_group2$log.cv) + 
84
                   error_group2$log.rho[error_group2$log.cv == 
85
                                            min(error_group2$log.cv)], col = "blue")
86
    }
87
    rho[error_group2$log.cv == min(error_group2$log.cv)] # rho based on minimum rule
88
89
    # rhos that are under blue line
90
    rho_under_blue <- rho[error_group2$log.cv < min(error_group2$log.cv) + 
91
                              error_group2$log.rho[error_group2$log.cv == 
92
                                                       min(error_group2$log.cv)]]
93
94
    # rhos that are on the right of the red line
95
    rho_right_red <- rho[rho > rho[error_group2$log.cv == min(error_group2$log.cv)]]
96
97
    #rhos that are under blue line and to the right of the red line
98
    rho_one_std_group2 <- max(intersect(rho_under_blue, rho_right_red )) # export as global
99
    rho_min_rule_group2 <- rho[error_group2$log.cv == min(error_group2$log.cv)] # export as global
100
101
    rho_df <- data.frame(c(rho_one_std_group1, rho_min_rule_group1), c(rho_one_std_group2, 
102
                                                                       rho_min_rule_group2), 
103
                         row.names = c("one standard error","minimum"))
104
    colnames(rho_df)<-c('group1', "group2")
105
    
106
    data_list <- list(p = p, rho_table = rho_df, cov_group_1 = cov_group_1, 
107
                      cov_group_2 = cov_group_2, n_group_1 = n_group_1, n_group_2 = n_group_2, 
108
                      data_group_1 = data_group_1, data_group_2 = data_group_2, 
109
                      class_label = class_label, id = id, data = data)
110
    return(data_list)
111
}