[3bfed4]: / R / helper_function.R

Download this file

348 lines (297 with data), 14.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
## This file contains all the helper functions needed to
## properly run non_partial_cor(), select_rho_partial(), partial_cor(), and network_display().
#' @title Compute the correlation
#' @description This function computes either the pearson or spearman correlation coefficient.
#' @param data_group_1 This is a n*p matrix.
#' @param data_group_2 This is a n*p matrix.
#' @param type_of_cor If this is NULL, pearson correlation coefficient will be calculated as
#' default. Otherwise, a character string "spearman" will calculate the spearman correlation
#' coefficient.
#' @return A list of correlation matrices for both group 1 and group 2.
#' @importFrom stats cor
compute_cor <- function(data_group_1, data_group_2, type_of_cor) {
if (is.null(type_of_cor) || type_of_cor == "pearson") {
cor_group_1 <- cor(data_group_1, method = "pearson")
cor_group_2 <- cor(data_group_2, method = "pearson")
} else if (type_of_cor == "spearman") {
cor_group_1 <- cor(data_group_1, method = "spearman")
cor_group_2 <- cor(data_group_2, method = "spearman")
}
cor <- list("Group1" = cor_group_1, "Group2" = cor_group_2)
}
#' @title Compute the partial correlation
#' @description This function computes the partial correlation coefficient.
#' @param pre_inv This is an inverse covariance matrix.
#' @return A \eqn{p*p} partial correlation matrix.
#' @importFrom utils tail
compute_par <- function(pre_inv) {
p <- nrow(pre_inv)
i <- rep(seq_len(p-1), times=(p-1):1)
k <- unlist(lapply(2:p, seq, p))
pre_inv_i <- vapply(seq_len(p-1), function(x) pre_inv[x,x], numeric(1))
pre_inv_i <- rep(pre_inv_i, times=(p-1):1)
pre_inv_j <- vapply(2:p, function(x) pre_inv[x,x], numeric(1))
pre_inv_j <- unlist(lapply(seq_len(p), function(x) tail(seq_len(p), -(x))))
pc_value <- pre_inv[upper.tri(pre_inv)]
pc_calc <- -pc_value / sqrt(pre_inv_i * pre_inv_j)
pc <- matrix(0, p, p)
pc[upper.tri(pc)] <- pc_calc
pc[lower.tri(pc)] <- t(pc)[lower.tri(t(pc))]
return(pc)
}
#' @title Permutations to build a differential network based on correlation analysis
#' @description A permutation test that randomly permutes the sample labels in distinct
#' biological groups for each biomolecule. The difference in each paired biomolecule
#' is considered statistically significant if it falls into the 2.5% tails on either end of the
#' empirical distribution curve.
#' @param m This is the number of permutations.
#' @param p This is the number of biomarker candidates.
#' @param n_group_1 This is the number of subjects in group 1.
#' @param n_group_2 This is the number of subjects in group 2.
#' @param data_group_1 This is a \eqn{n*p} matrix containing group 1 data.
#' @param data_group_2 THis is a \eqn{n*p} matrix containing group 2 data.
#' @param type_of_cor If this is NULL, pearson correlation coefficient will be calculated as
#' default. Otherwise, a character string "spearman" will calculate the spearman correlation
#' coefficient.
#' @return A multi-dimensional matrix that contains the permutation result.
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom stats cor
permutation_cor <- function(m, p, n_group_1, n_group_2, data_group_1, data_group_2, type_of_cor) {
diff_p <- array(0, dim = c(m, p, p))
pb <- txtProgressBar(min = 0, max = m, style = 3)
for (t in 1 : m) {
data_group_1_p <- matrix(0, n_group_1, p)
for (i in 1 : p) {
data_group_1_p[, i] <- data_group_1[sample(n_group_1), i]
}
data_group_2_p <- matrix(0, n_group_2, p)
for (i in 1 : p) {
data_group_2_p[, i] <- data_group_2[sample(n_group_2), i]
}
if (is.null(type_of_cor)) {
cor_group_1_p <- cor(data_group_1_p, method = "pearson")
cor_group_2_p <- cor(data_group_2_p, method = "pearson")
} else {
cor_group_1_p <- cor(data_group_1_p, method = "spearman")
cor_group_2_p <- cor(data_group_2_p, method = "spearman")
}
diff_p[t, , ] <- cor_group_2_p - cor_group_1_p
# update progress bar
setTxtProgressBar(pb, t)
}
close(pb)
return(diff_p)
}
#' @title Permutations to build differential network based on partial correlation analysis
#' @description A permutation test that randomly permutes the sample labels in distinct
#' biological groups for each biomolecule. The difference in paired partial correlation
#' is considered statistically significant if it falls into the 2.5% tails on either end of the
#' empirical distribution curve.
#' @param m This is the number of permutations.
#' @param p This is the number of biomarker candidates.
#' @param n_group_1 This is the number of subjects in group 1.
#' @param n_group_2 This is the number of subjects in group 2.
#' @param data_group_1 This is a \eqn{n*p} matrix containing group 1 data.
#' @param data_group_2 This is a \eqn{n*p} matrix containing group 2 data.
#' @param rho_group_1_opt This is an optimal tuning parameter to obtain a sparse differential
#' network for group 1.
#' @param rho_group_2_opt This is an optimal tuning parameter to obtain a sparse differential
#' network for group 2.
#' @return A multi-dimensional matrix that contains the permutation result.
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom glasso glasso
permutation_pc <- function(m, p, n_group_1, n_group_2, data_group_1, data_group_2, rho_group_1_opt,
rho_group_2_opt) {
diff_p <- array(0, dim = c(m, p, p))
pb <- txtProgressBar(min = 0, max = m, style = 3)
for(t in 1 : m) {
data_group_1_p <- matrix(0, n_group_1, p)
for(i in 1 : p) {
data_group_1_p[, i] <- data_group_1[sample(n_group_1), i]
}
data_group_2_p <- matrix(0, n_group_2, p)
for(i in 1 : p) {
data_group_2_p[, i] <- data_group_2[sample(n_group_2), i]
}
per_group_1 <- glasso(var(data_group_1_p), rho = rho_group_1_opt)
per_group_2 <- glasso(var(data_group_2_p), rho = rho_group_2_opt)
pc_group_1_p <- compute_par(per_group_1$wi)
pc_group_2_p <- compute_par(per_group_2$wi)
diff_p[t, , ] <- pc_group_2_p - pc_group_1_p
# update progress bar
setTxtProgressBar(pb, t)
}
close(pb)
return(diff_p)
}
#' @title Calculate the positive and negative thresholds based on the permutation result
#' @description This function calculates the positive and negative thresholds based on the
#' permutation result.
#' @param thres_left This is the threshold representing 2.5 percent of the left tail of the
#' empirical distributuion curve.
#' @param thres_right This is the threshold representing 2.5 percent of the right tail of the
#' empirical distributuion curve.
#' @param p This is the number of biomarker candidates.
#' @param diff_p This is the permutation result from either permutation_cor or permutation_pc.
#' @return A list of positive and negative thresholds.
#' @importFrom stats quantile
permutation_thres <- function(thres_left, thres_right, p, diff_p) {
significant_thres_p <- matrix(0, p, p)
significant_thres_n <- matrix(0, p, p)
for (i in 1 : (p-1)) {
for (j in (i + 1) : p) {
significant_thres_n[i, j] <- quantile(diff_p[, i, j], probs = thres_left)
significant_thres_n[j, i] <- significant_thres_n[i, j]
significant_thres_p[i, j] <- quantile(diff_p[, i, j], probs = thres_right)
significant_thres_p[j, i] <- significant_thres_p[i, j]
}
}
significant_thres <- list("positive" = significant_thres_p, "negative" = significant_thres_n)
return(significant_thres)
}
#' @title Calculate the differential network score
#' @description This function calculates differential network score by using the binary link and
#' z-scores.
#' @param binary_link This is the binary correlation matrix with 1 indicating positive correlation
#' and -1 indicating negative correlation for each biomolecular pair.
#' @param z_score This is converted from the given or calculated p-value.
#' @return An activity score associated with each biomarker candidate.
compute_dns <- function(binary_link, z_score) {
# get adjacent matrix
diff_d <- abs(binary_link)
# set diagonal elements to 1
diag(diff_d) <- 1
# compute differential network score for each row
dns <- apply(diff_d, 1, function(x, y = z_score) sum(y[which(x == 1)]))
return(dns)
}
#' @title Obtain p-values using logistic regression
#' @description This function calculates p-values using logistic regression in cases that p-values
#' are not provided.
#' @param x This is a data frame consists of data from group 1 and group 2.
#' @param class_label This is a binary array indicating 0 for group 1 and 1 for group 2.
#' @param Met_name This is an array of IDs.
#' @return p-values
#' @importFrom stats glm
pvalue_logit <- function(x, class_label, Met_name) {
data_tp <- as.data.frame(t(x)) # n*p
class_label_tp <- as.data.frame(t(class_label))
pvalue <- c()
# attach metabolites ID and class label in the data set
X_df <- cbind(data_tp, class_label_tp)
colnames(X_df)[1:(ncol(X_df)-1)] <- Met_name
colnames(X_df)[ncol(X_df)] <- "Class"
for (i in 1:(ncol(X_df)-1)) {
X_df_tempt <- X_df[,c(i, ncol(X_df))]
glm.fit <- glm(Class ~. , family = "binomial", data = X_df_tempt)
pvalue_tempt <- summary(glm.fit)$coefficients[,4][2]
pvalue <- c(pvalue, pvalue_tempt)
}
pvalue_df <- data.frame("ID" = Met_name, "p.value" = pvalue)
return(pvalue_df)
}
#' @title Create log likelihood error
#' @description This function calculates the log likelihood error.
#' @param data This is a matrix.
#' @param theta This is a precision matrix.
#' @return log likelihood error
loglik_ave <- function(data, theta){
loglik <- c()
loglik <- log(det(theta)) - sum(diag(var(data) %*% theta))
return(-loglik)
}
#' @title Draw error curve
#' @description This function draws error curve using cross-validation.
#' @param data This is a matrix.
#' @param n_fold This parameter specifies the n number in n-fold cross_validation.
#' @param rho This is the regularization parameter values to be evalueated in terms their errors.
#' @return A list of errors and their corresponding \eqn{log(rho)}.
#' @importFrom glasso glasso
choose_rho <- function(data, n_fold, rho) {
# randomly shuffle the data
Data <- data[sample(nrow(data)), ]
# create n_fold equally size folds
folds <- cut(seq(1, nrow(Data)), breaks = n_fold, labels = FALSE)
# tune parameters
d <- ncol(Data)
loglik <- lapply(seq_along(rho), function(i) {
vapply(seq_len(n_fold), function(j) {
# segement your data by fold using the which() function
testIndexes <- which(folds == j, arr.ind = TRUE)
testData <- Data[testIndexes, ]
trainData <- Data[-testIndexes, ]
# use test and train data partitions however you desire...
cov <- var(trainData) # compute the covariance matrix
pre <- glasso(cov, rho = rho[i])
loglik_ave(testData, pre$wi)
}, numeric(1))})
loglik_cv <- vapply(loglik, mean, numeric(1))
loglik_rho <- vapply(loglik, function(x) sd(x) / sqrt(n_fold), numeric(1))
#plot(rho, loglik_cv, xlab = expression(lambda), ylab = "Error")
#lines(rho, loglik_cv)
error <- list("log.cv" = loglik_cv, "log.rho" = loglik_rho)
return(error)
}
#' @title Compute p-value for edges
#' @description This function computes p-value for edges based on permutation result.
#' @param p This is the number of biomarker candidates.
#' @param diff This is the delta correlation or partial correlation matrix.
#' @param diff_p This is the permutation result from either permutation_cor or permutation_pc.
#' @param m This is the number of permutations.
#' @return p-value for edges.
compute_pvalue_edge <- function(p, diff, diff_p, m) {
significant_thres <- matrix(0, p, p)
for (i in 1 : (p-1)) {
for (j in (i + 1) : p) {
significant_thres[i, j] <- min(length(diff_p[,i,j][diff_p[,i,j]>=diff[i,j]]),
length(diff_p[,i,j][diff_p[,i,j]<=diff[i,j]]))
significant_thres[j, i] <- significant_thres[i, j]
}
}
pvalue_edge <- significant_thres/m
# adjust for two sides
pvalue_edge[pvalue_edge <= 0.5] <- 2 * pvalue_edge[pvalue_edge <= 0.5]
diag(pvalue_edge) <- 1
return(pvalue_edge)
}
#' @title Compute fdr p-value for edges
#' @description This function computes fdr p-value for edges to adjust for multiple testing.
#' @param p This is the number of biomarker candidates.
#' @param pvalue_edge This is p-value for edges from compute_pvalue_edge.
#' @return Adjusted p-value for edges by fdr.
#' @importFrom stats p.adjust
compute_pvalue_edge_fdr <- function(p, pvalue_edge) {
pvalue_edge_vector <- vector()
for (i in 1:(p-1)){
for (j in (i+1):p){
pvalue_edge_vector = append(pvalue_edge_vector, c(i,j,pvalue_edge[i,j]))
}
}
pvalue_edge_vector <- matrix(pvalue_edge_vector, ncol = 3, byrow = T)
pvalue_edge_vector_fdr <- p.adjust(pvalue_edge_vector[,3], method = "fdr", n = length(pvalue_edge_vector[,3]))
pvalue_edge_fdr <- matrix(0, p, p)
num <- 1
for (i in 1 : (p-1)) {
for (j in (i + 1) : p) {
pvalue_edge_fdr[i, j] <- pvalue_edge_vector_fdr[num]
pvalue_edge_fdr[j, i] <- pvalue_edge_fdr[i, j]
num <- num + 1
}
}
diag(pvalue_edge_fdr) = 1
return(pvalue_edge_fdr)
}
#' @title Compute edge weights
#' @description This function computes edge weights based on p-value for edges with directions.
#' @param pvalue_edge_fdr This is the p-value for edges possibly after multiple testing correction.
#' @param binary_link This is the binary edge connection.
#' @return Edge weights.
#' @importFrom stats qnorm
compute_edge_weights <- function(pvalue_edge_fdr, binary_link) {
zscore_edge_fdr <- abs(qnorm(1 - pvalue_edge_fdr/2))
# 1.5 is a predefined factor to cap zero-pvalue connection
inf_cap <- 1.5 * max(zscore_edge_fdr[is.finite(zscore_edge_fdr)])
zscore_edge_fdr[is.infinite(zscore_edge_fdr)] <- inf_cap
weight_link <- zscore_edge_fdr * binary_link
return(weight_link)
}