Diff of /R/helper_function.R [000000] .. [3bfed4]

Switch to unified view

a b/R/helper_function.R
1
## This file contains all the helper functions needed to 
2
## properly run non_partial_cor(), select_rho_partial(), partial_cor(), and network_display().
3
4
#' @title Compute the correlation
5
#' @description This function computes either the pearson or spearman correlation coefficient.
6
#' @param data_group_1 This is a n*p matrix.
7
#' @param data_group_2 This is a n*p matrix.
8
#' @param type_of_cor If this is NULL, pearson correlation coefficient will be calculated as 
9
#'     default. Otherwise, a character string "spearman" will calculate the spearman correlation
10
#'     coefficient.
11
#' @return A list of correlation matrices for both group 1 and group 2.
12
#' @importFrom stats cor 
13
14
compute_cor <- function(data_group_1, data_group_2, type_of_cor) {
15
    if (is.null(type_of_cor) || type_of_cor == "pearson") {
16
        cor_group_1 <- cor(data_group_1, method = "pearson")
17
        cor_group_2 <- cor(data_group_2, method = "pearson")
18
19
    } else if (type_of_cor == "spearman") {
20
        cor_group_1 <- cor(data_group_1, method = "spearman")
21
        cor_group_2 <- cor(data_group_2, method = "spearman")
22
    }
23
    cor <- list("Group1" = cor_group_1, "Group2" = cor_group_2)
24
}
25
26
27
#' @title Compute the partial correlation
28
#' @description This function computes the partial correlation coefficient.
29
#' @param pre_inv This is an inverse covariance matrix.
30
#' @return A \eqn{p*p} partial correlation matrix.
31
#' @importFrom utils tail
32
33
compute_par <- function(pre_inv) {
34
  p <- nrow(pre_inv)
35
36
  i <- rep(seq_len(p-1), times=(p-1):1)
37
  k <- unlist(lapply(2:p, seq, p))
38
39
  pre_inv_i <- vapply(seq_len(p-1), function(x) pre_inv[x,x], numeric(1))
40
  pre_inv_i <- rep(pre_inv_i, times=(p-1):1)
41
42
  pre_inv_j <- vapply(2:p, function(x) pre_inv[x,x], numeric(1))
43
  pre_inv_j <- unlist(lapply(seq_len(p), function(x) tail(seq_len(p), -(x))))
44
45
  pc_value <- pre_inv[upper.tri(pre_inv)]
46
  pc_calc <- -pc_value / sqrt(pre_inv_i * pre_inv_j)
47
48
  pc <- matrix(0, p, p)
49
  pc[upper.tri(pc)] <- pc_calc
50
  pc[lower.tri(pc)] <- t(pc)[lower.tri(t(pc))]
51
  return(pc)
52
}
53
54
55
#' @title Permutations to build a differential network based on correlation analysis
56
#' @description A permutation test that randomly permutes the sample labels in distinct
57
#'     biological groups for each biomolecule. The difference in each paired biomolecule
58
#'     is considered statistically significant if it falls into the 2.5% tails on either end of the 
59
#'     empirical distribution curve. 
60
#' @param m This is the number of permutations.
61
#' @param p This is the number of biomarker candidates.
62
#' @param n_group_1 This is the number of subjects in group 1.
63
#' @param n_group_2 This is the number of subjects in group 2.
64
#' @param data_group_1 This is a \eqn{n*p} matrix containing group 1 data.
65
#' @param data_group_2 THis is a \eqn{n*p} matrix containing group 2 data.
66
#' @param type_of_cor If this is NULL, pearson correlation coefficient will be calculated as 
67
#'     default. Otherwise, a character string "spearman" will calculate the spearman correlation 
68
#'     coefficient.
69
#' @return A multi-dimensional matrix that contains the permutation result.
70
#' @importFrom utils txtProgressBar setTxtProgressBar
71
#' @importFrom stats cor 
72
73
permutation_cor <- function(m, p, n_group_1, n_group_2, data_group_1, data_group_2, type_of_cor) {
74
    diff_p <- array(0, dim = c(m, p, p))
75
    pb <- txtProgressBar(min = 0, max = m, style = 3)
76
    for (t in 1 : m) {
77
        data_group_1_p <- matrix(0, n_group_1, p)
78
        for (i in 1 : p) {
79
            data_group_1_p[, i] <- data_group_1[sample(n_group_1), i]
80
        }
81
        data_group_2_p <- matrix(0, n_group_2, p)
82
        for (i in 1 : p) {
83
            data_group_2_p[, i] <- data_group_2[sample(n_group_2), i]
84
        }
85
86
    if (is.null(type_of_cor)) {
87
        cor_group_1_p <- cor(data_group_1_p, method = "pearson")
88
        cor_group_2_p <- cor(data_group_2_p, method = "pearson")
89
    } else {
90
        cor_group_1_p <- cor(data_group_1_p, method = "spearman")
91
        cor_group_2_p <- cor(data_group_2_p, method = "spearman")
92
    }
93
        diff_p[t, , ] <- cor_group_2_p - cor_group_1_p
94
95
        # update progress bar
96
        setTxtProgressBar(pb, t)
97
    }
98
    close(pb)
99
    return(diff_p)
100
}
101
102
103
#' @title Permutations to build differential network based on partial correlation analysis
104
#' @description A permutation test that randomly permutes the sample labels in distinct
105
#'     biological groups for each biomolecule. The difference in paired partial correlation
106
#'     is considered statistically significant if it falls into the 2.5% tails on either end of the 
107
#'     empirical distribution curve.  
108
#' @param m This is the number of permutations.
109
#' @param p This is the number of biomarker candidates.
110
#' @param n_group_1 This is the number of subjects in group 1.
111
#' @param n_group_2 This is the number of subjects in group 2.
112
#' @param data_group_1 This is a \eqn{n*p} matrix containing group 1 data.
113
#' @param data_group_2 This is a \eqn{n*p} matrix containing group 2 data.
114
#' @param rho_group_1_opt This is an optimal tuning parameter to obtain a sparse differential 
115
#'     network for group 1.
116
#' @param rho_group_2_opt This is an optimal tuning parameter to obtain a sparse differential
117
#'     network for group 2.
118
#' @return A multi-dimensional matrix that contains the permutation result.
119
#' @importFrom utils txtProgressBar setTxtProgressBar
120
#' @importFrom glasso glasso
121
122
permutation_pc <- function(m, p, n_group_1, n_group_2, data_group_1, data_group_2, rho_group_1_opt, 
123
                           rho_group_2_opt) {
124
    diff_p <- array(0, dim = c(m, p, p))
125
    pb <- txtProgressBar(min = 0, max = m, style = 3)
126
    for(t in 1 : m) {
127
        data_group_1_p <- matrix(0, n_group_1, p)
128
        for(i in 1 : p) {
129
            data_group_1_p[, i] <- data_group_1[sample(n_group_1), i]
130
        }
131
        data_group_2_p <- matrix(0, n_group_2, p)
132
        for(i in 1 : p) {
133
            data_group_2_p[, i] <- data_group_2[sample(n_group_2), i]
134
        }
135
        per_group_1 <- glasso(var(data_group_1_p), rho = rho_group_1_opt)
136
        per_group_2 <- glasso(var(data_group_2_p), rho = rho_group_2_opt)
137
        pc_group_1_p <- compute_par(per_group_1$wi)
138
        pc_group_2_p <- compute_par(per_group_2$wi)
139
        diff_p[t, , ] <- pc_group_2_p - pc_group_1_p
140
        # update progress bar
141
        setTxtProgressBar(pb, t)
142
    }
143
    close(pb)
144
    return(diff_p)
145
}
146
147
148
#' @title Calculate the positive and negative thresholds based on the permutation result
149
#' @description This function calculates the positive and negative thresholds based on the 
150
#'     permutation result.
151
#' @param thres_left This is the threshold representing 2.5 percent of the left tail of the 
152
#'     empirical distributuion curve.
153
#' @param thres_right This is the threshold representing 2.5 percent of the right tail of the 
154
#'     empirical distributuion curve.
155
#' @param p This is the number of biomarker candidates.
156
#' @param diff_p This is the permutation result from either permutation_cor or permutation_pc.
157
#' @return A list of positive and negative thresholds.
158
#' @importFrom stats quantile
159
160
permutation_thres <- function(thres_left, thres_right, p, diff_p) {
161
    significant_thres_p <- matrix(0, p, p)
162
    significant_thres_n <- matrix(0, p, p)
163
    for (i in 1 : (p-1)) {
164
        for (j in (i + 1) : p) {
165
            significant_thres_n[i, j] <- quantile(diff_p[, i, j], probs = thres_left)
166
            significant_thres_n[j, i] <- significant_thres_n[i, j]
167
            significant_thres_p[i, j] <- quantile(diff_p[, i, j], probs = thres_right)
168
            significant_thres_p[j, i] <- significant_thres_p[i, j]
169
        }
170
    }
171
    significant_thres <- list("positive" = significant_thres_p, "negative" = significant_thres_n)
172
    return(significant_thres)
173
}
174
175
176
#' @title Calculate the differential network score
177
#' @description This function calculates differential network score by using the binary link and 
178
#'     z-scores.
179
#' @param binary_link This is the binary correlation matrix with 1 indicating positive correlation 
180
#'     and -1 indicating negative correlation for each biomolecular pair.
181
#' @param z_score This is converted from the given or calculated p-value.
182
#' @return An activity score associated with each biomarker candidate.
183
184
compute_dns <- function(binary_link, z_score) {
185
    # get adjacent matrix
186
    diff_d <- abs(binary_link)
187
    # set diagonal elements to 1
188
    diag(diff_d) <- 1
189
    # compute differential network score for each row
190
    dns <- apply(diff_d, 1, function(x, y = z_score) sum(y[which(x == 1)]))
191
    return(dns)
192
}
193
194
195
#' @title Obtain p-values using logistic regression
196
#' @description This function calculates p-values using logistic regression in cases that p-values 
197
#'     are not provided.
198
#' @param x This is a data frame consists of data from group 1 and group 2.
199
#' @param class_label This is a binary array indicating 0 for group 1 and 1 for group 2.
200
#' @param Met_name This is an array of IDs.
201
#' @return p-values
202
#' @importFrom stats glm
203
204
pvalue_logit <- function(x, class_label, Met_name) {
205
    data_tp <- as.data.frame(t(x))    # n*p
206
    class_label_tp <- as.data.frame(t(class_label))
207
    pvalue <- c()
208
    # attach metabolites ID and class label in the data set
209
    X_df <- cbind(data_tp, class_label_tp)
210
    colnames(X_df)[1:(ncol(X_df)-1)] <- Met_name
211
    colnames(X_df)[ncol(X_df)] <- "Class"
212
    for (i in 1:(ncol(X_df)-1)) {
213
      X_df_tempt <- X_df[,c(i, ncol(X_df))]
214
      glm.fit <- glm(Class ~. , family = "binomial", data = X_df_tempt)
215
      pvalue_tempt <- summary(glm.fit)$coefficients[,4][2]
216
      pvalue <- c(pvalue, pvalue_tempt)
217
    }
218
    pvalue_df <- data.frame("ID" = Met_name, "p.value" = pvalue)
219
    return(pvalue_df)
220
}
221
222
223
#' @title Create log likelihood error
224
#' @description This function calculates the log likelihood error. 
225
#' @param data This is a matrix.
226
#' @param theta This is a precision matrix.
227
#' @return log likelihood error 
228
229
loglik_ave <- function(data, theta){
230
    loglik <- c()
231
    loglik <- log(det(theta)) - sum(diag(var(data) %*% theta))
232
    return(-loglik)
233
}
234
235
236
#' @title Draw error curve
237
#' @description This function draws error curve using cross-validation.
238
#' @param data This is a matrix.
239
#' @param n_fold This parameter specifies the n number in n-fold cross_validation.
240
#' @param rho This is the regularization parameter values to be evalueated in terms their errors.
241
#' @return A list of errors and their corresponding \eqn{log(rho)}.
242
#' @importFrom glasso glasso
243
244
choose_rho <- function(data, n_fold, rho) {
245
  # randomly shuffle the data
246
  Data <- data[sample(nrow(data)), ]
247
  # create n_fold equally size folds
248
  folds <- cut(seq(1, nrow(Data)), breaks = n_fold, labels = FALSE)
249
  # tune parameters
250
  d <- ncol(Data)
251
252
  loglik <- lapply(seq_along(rho), function(i) {
253
    vapply(seq_len(n_fold), function(j) {
254
      # segement your data by fold using the which() function
255
      testIndexes <- which(folds == j, arr.ind = TRUE)
256
      testData <- Data[testIndexes, ]
257
      trainData <- Data[-testIndexes, ]
258
      # use test and train data partitions however you desire...
259
      cov <- var(trainData) # compute the covariance matrix
260
      pre <- glasso(cov, rho = rho[i])
261
      loglik_ave(testData, pre$wi)
262
    }, numeric(1))})
263
264
  loglik_cv <- vapply(loglik, mean, numeric(1))
265
  loglik_rho <- vapply(loglik, function(x) sd(x) / sqrt(n_fold), numeric(1))
266
267
  #plot(rho, loglik_cv, xlab = expression(lambda), ylab = "Error")
268
  #lines(rho, loglik_cv)
269
  error <- list("log.cv" = loglik_cv, "log.rho" = loglik_rho)
270
  return(error)
271
}        
272
273
274
#' @title Compute p-value for edges
275
#' @description This function computes p-value for edges based on permutation result. 
276
#' @param p This is the number of biomarker candidates.
277
#' @param diff This is the delta correlation or partial correlation matrix.
278
#' @param diff_p This is the permutation result from either permutation_cor or permutation_pc.
279
#' @param m This is the number of permutations.
280
#' @return p-value for edges.
281
282
compute_pvalue_edge <- function(p, diff, diff_p, m) {
283
  significant_thres <- matrix(0, p, p)
284
  for (i in 1 : (p-1)) {
285
    for (j in (i + 1) : p) {
286
      significant_thres[i, j] <- min(length(diff_p[,i,j][diff_p[,i,j]>=diff[i,j]]),
287
                                     length(diff_p[,i,j][diff_p[,i,j]<=diff[i,j]])) 
288
      significant_thres[j, i] <- significant_thres[i, j]
289
    }
290
  }
291
  pvalue_edge <- significant_thres/m
292
  # adjust for two sides
293
  pvalue_edge[pvalue_edge <= 0.5] <- 2 * pvalue_edge[pvalue_edge <= 0.5]
294
  
295
  diag(pvalue_edge) <- 1
296
  return(pvalue_edge)
297
}
298
299
300
#' @title Compute fdr p-value for edges
301
#' @description This function computes fdr p-value for edges to adjust for multiple testing.
302
#' @param p This is the number of biomarker candidates.
303
#' @param pvalue_edge This is p-value for edges from compute_pvalue_edge.
304
#' @return Adjusted p-value for edges by fdr.
305
#' @importFrom stats p.adjust
306
307
compute_pvalue_edge_fdr <- function(p, pvalue_edge) {
308
  pvalue_edge_vector <- vector()
309
  for (i in 1:(p-1)){
310
    for (j in (i+1):p){
311
      pvalue_edge_vector = append(pvalue_edge_vector, c(i,j,pvalue_edge[i,j]))
312
    }
313
  }
314
  pvalue_edge_vector <- matrix(pvalue_edge_vector, ncol = 3, byrow = T)
315
  pvalue_edge_vector_fdr <- p.adjust(pvalue_edge_vector[,3], method = "fdr", n = length(pvalue_edge_vector[,3]))
316
  pvalue_edge_fdr <- matrix(0, p, p)
317
  num <- 1
318
  for (i in 1 : (p-1)) {
319
    for (j in (i + 1) : p) {
320
      pvalue_edge_fdr[i, j] <- pvalue_edge_vector_fdr[num]
321
      pvalue_edge_fdr[j, i] <- pvalue_edge_fdr[i, j]
322
      num <- num + 1
323
    }
324
  }
325
  diag(pvalue_edge_fdr) = 1
326
  return(pvalue_edge_fdr)
327
}
328
329
330
#' @title Compute edge weights
331
#' @description This function computes edge weights based on p-value for edges with directions.
332
#' @param pvalue_edge_fdr This is the p-value for edges possibly after multiple testing correction.
333
#' @param binary_link This is the binary edge connection.
334
#' @return Edge weights.
335
#' @importFrom stats qnorm
336
337
compute_edge_weights <- function(pvalue_edge_fdr, binary_link) {
338
  zscore_edge_fdr <- abs(qnorm(1 - pvalue_edge_fdr/2))
339
  # 1.5 is a predefined factor to cap zero-pvalue connection
340
  inf_cap <- 1.5 * max(zscore_edge_fdr[is.finite(zscore_edge_fdr)]) 
341
  zscore_edge_fdr[is.infinite(zscore_edge_fdr)] <- inf_cap
342
  weight_link <- zscore_edge_fdr * binary_link
343
  return(weight_link)
344
}
345
346
347