--- a +++ b/R/2-3.RMT.R @@ -0,0 +1,261 @@ +# =========2.2RMT optimize===== + +#' Get RMT threshold for a correlation matrix +#' +#' @param occor.r a corr object or a correlation matrix +#' @param min_threshold min_threshold +#' @param max_threshold max_threshold +#' @param step step +#' @param gif render a .gif file? +#' @param verbose verbose +#' @param out_dir output dir +#' +#' @return a r-threshold +#' @export +#' @references +#' J. Zhou, Y. Deng, FALSE. Luo, Z. He, Q. Tu, X. Zhi, (2010) Functional Molecular Ecological Networks, doi:10.1128/mBio.00169-10. +#' <https://matstat.org/content_en/RMT/RMThreshold_Intro.pdf> +#' @examples +#' \donttest{ +#' data(otutab, package = "pcutils") +#' t(otutab) -> totu +#' c_net_calculate(totu) -> corr +#' rmt(corr) +#' # recommend: 0.69 +#' c_net_build(corr, r_threshold = 0.69) -> co_net_rmt +#' } +RMT_threshold <- function(occor.r, out_dir, min_threshold = 0.5, max_threshold = 0.8, + step = 0.02, gif = FALSE, verbose = FALSE) { + nwd <- getwd() + on.exit(setwd(nwd)) + + setwd(out_dir) + if (inherits(occor.r, "corr")) occor.r <- occor.r$r + if (!dir.exists("./RMT_temp")) dir.create("./RMT_temp") + diag(occor.r) <- 0 + + if (max_threshold >= max(abs(occor.r))) max_threshold <- (max(abs(occor.r)) - step) + if (min_threshold >= max_threshold) min_threshold <- max_threshold - 10 * step + + thres_seq <- seq(min_threshold, max_threshold, step) + + res <- data.frame() + for (i in seq_len(length(thres_seq))) { + threshold <- thres_seq[i] + if (!verbose) pcutils::dabiao(paste0("Calculating", i, ": threshold =", signif(threshold, 3)), print = TRUE) + corr_r1 <- occor.r + corr_r1[abs(corr_r1) < threshold] <- 0 + # calculate eigenvalues + rand.mat <- corr_r1 + eigenvalues <- eigen(rand.mat, only.values = TRUE)$values + eigenvalues <- eigenvalues[order(eigenvalues)] / max(abs(eigenvalues)) + eigenvalues <- pcutils::remove.outliers(unique(eigenvalues)) + + # get the NNDS + { # uf <- rm.unfold.gauss(eigenvalues,pop.up = TRUE) + dens <- density(eigenvalues, kernel = "gaussian") + midpoints <- \(x)(x[-length(x)] + 0.5 * diff(x)) + scale.function <- approx(dens$x, dens$y, xout = midpoints(eigenvalues)) + ev.spacing <- diff(eigenvalues) + ev.spacing <- ev.spacing * scale.function$y + ev.spacing <- ev.spacing / mean(ev.spacing) + } + + ev.spacing <- ev.spacing[ev.spacing <= 3] + # test whether fit possion? + p_ks_test <- ks.test(unique(ev.spacing), "pexp", 1)$p.value + # get sse + # sse = rm.sse(ev.spacing) + sse <- get_sse(ev.spacing) + log_sse <- log(sse) + + # maximum likelihood + evs <- ev.spacing[ev.spacing != 0] + N <- length(evs) + log_LE <- -sum(evs) / N + log_LW <- log(pi / 2) + sum(log(evs)) / N - 0.25 * pi * sum(evs^2) / N + + # save png + { + histo <- hist(ev.spacing, breaks = seq(min(ev.spacing), max(ev.spacing), len = 51), plot = FALSE) + grDevices::png(paste0("RMT_temp/rmt_nnsd", i, ".png"), height = 600, width = 700, res = 130) + nnsd_plot( + histo = histo, title = "Eigenvalue spacing distribution (NNSD)", threshold = threshold, + dis_GOE = log_LW, dis_possion = log_LE, p_ks_test = p_ks_test + ) + grDevices::dev.off() + } + res <- rbind(res, data.frame(threshold, p_ks_test, log_sse, log_LW, log_LE)) + } + message("The Intermediate files saved in ", out_dir, "/RMT_temp/.") + # transfer to gif + if (gif) { + lib_ps("gifski", library = FALSE) + gifski::gifski(paste0("RMT_temp/rmt_nnsd", seq_len(length(thres_seq)), ".png"), + gif_file = "RMT_temp/rmt_nnsd.gif" + ) + } + r_threshold <- (res[which(res$log_LW == min(res$log_LW)), "threshold"] + + res[which(res$log_LE == max(res$log_LE)), "threshold"]) / 2 + res <- list(res = res, r_threshold = r_threshold) + class(res) <- c("rmt_res", class(res)) + return(res) +} + +#' Plot a rmt_res +#' +#' @param x rmt_res +#' @param ... Additional arguments +#' +#' @return ggplot +#' @exportS3Method +#' @method plot rmt_res +plot.rmt_res <- function(x, ...) { + threshold <- value <- variable <- xi <- y <- NULL + res <- x$res + linedf <- data.frame( + variable = c("p_ks_test", "log_sse", "log_LW", "log_LE"), + xi = c( + res[which(res$p_ks_test == max(res$p_ks_test))[1], "threshold"], + res[which(res$log_sse == min(res$log_sse))[1], "threshold"], + res[which(res$log_LW == min(res$log_LW))[1], "threshold"], + res[which(res$log_LE == max(res$log_LE))[1], "threshold"] + ), + x = max(res$threshold) - min(res$threshold), + y = apply(res[, -1], 2, max) + ) + + reshape2::melt(res, "threshold") -> md + + # filter(threshold<0.77)%>% + p <- ggplot(md, aes(threshold, value)) + + geom_point(aes(col = variable)) + + geom_line(aes(col = variable)) + + scale_color_manual(values = get_cols(4, "col1")) + + facet_wrap(. ~ variable, scales = "free_y") + + theme_bw() + + xlab(NULL) + + geom_text(data = linedf, aes(x = xi - 0.02 * x, y = 0.5 * y, label = xi)) + + geom_vline(data = linedf, aes(xintercept = xi), linetype = 2, col = "red") + + theme(legend.position = "none") + + message(paste("recommend r_threshold: ", mean(linedf$xi))) + return(p) +} + +nnsd_plot <- \(histo = histo, title = title, threshold = threshold, + dis_GOE = dis_GOE, dis_possion = dis_possion, p_ks_test = p_ks_test) { + plot(histo, freq = FALSE, col = "#F4FCA1", main = title, font.main = 1, xlab = "eigenvalue spacing", ylab = "PDF of eigenvalue spacing") + { + actual.ymax <- par("yaxp")[2] + x0 <- -log(actual.ymax * 0.98) + possion_dis <- \(x)exp(-x) + graphics::curve(possion_dis, + from = max(x0, min(histo$breaks)), + to = max(histo$breaks), n = 1001, add = TRUE, type = "l", lty = 1, col = "#EB34FF", lwd = 2 + ) + } + { + GOE <- function(x) pi / 2 * x * exp(-pi / 4 * x^2) + graphics::curve(GOE, + from = min(histo$breaks), + to = max(histo$breaks), n = 1001, add = TRUE, type = "l", + lty = 1, col = "blue", lwd = 2 + ) + } + + if ((!is.na(dis_GOE)) && (!is.na(dis_possion))) { + graphics::mtext(side = 3, paste( + "Distance to GOE =", signif(dis_GOE, 3), + "\nDistance to Possion =", signif(dis_possion, 3), "; ks_test p.value for possion =", signif(p_ks_test, 3) + ), col = "#878787", cex = 0.6) + } + + if (!is.na(threshold)) graphics::mtext(side = 4, paste("threshold =", signif(threshold, 4))) + + graphics::legend("topright", inset = 0.05, c("Possion", "GOE"), col = c("#EB34FF", "blue"), lty = 1, lwd = 2, cex = 0.8) +} + +trapez <- \(x, y){ + ind <- 2:length(x) + as.double((x[ind] - x[ind - 1]) %*% (y[ind] + y[ind - 1])) / 2 +} + +get_sse <- \(ev.spacing){ + dens <- density(ev.spacing) + N <- 20 + x <- seq(min(ev.spacing), max(ev.spacing), len = 1000) + A <- exp(-min(ev.spacing)) - exp(-max(ev.spacing)) + xs <- numeric(N + 1) + xs[1] <- min(ev.spacing) + for (i in 1:N) xs[i + 1] <- -log(exp(-xs[i]) - A / N) + area <- numeric(N) + for (i in 1:N) { + xsec <- x[(x > xs[i]) & (x < xs[i + 1])] + xsec <- c(xs[i], xsec, xs[i + 1]) + ysec <- approx(dens$x, dens$y, xout = xsec)$y + area[i] <- trapez(xsec, ysec) + } + sse <- sum((area[i] - A / N)^2) + sse +} + +#' Get RMT threshold for a correlation matrix roughly +#' +#' @export +#' @return recommend threshold +#' @rdname RMT_threshold +rmt <- function(occor.r, min_threshold = 0.5, max_threshold = 0.85, step = 0.01) { + if (inherits(occor.r, "corr")) occor.r <- occor.r$r + NNSD <- \(x)abs(diff(x)) + + s <- seq(0, 3, 0.1) + poisson_d <- exp(-s) + nnsdpois <- density(NNSD(poisson_d)) + + ps <- c() + threshold <- c() + + for (i in seq(min_threshold, max_threshold, step)) { + corr_r1 <- occor.r + corr_r1[abs(corr_r1) < i] <- 0 + { + eigen_res <- sort(eigen(corr_r1)$value) + # spline to eigen_res + check <- tryCatch(ssp <- smooth.spline(eigen_res, control.spar = list(low = 0, high = 3)), + error = \(e) { + TRUE + } + ) + if (rlang::is_true(check)) next + nnsdw <- density(NNSD(ssp$y)) + chival <- sum((nnsdw$y - nnsdpois$y)^2 / nnsdpois$y / 1e3) + } + + ps <- c(ps, chival) + threshold <- c(threshold, i) + if (((i * 100) %% 5 == 0)) { + message(paste0("Calculating: ", i)) + } + } + + res <- data.frame(threshold, ps) + recommend_thres <- res[which.min(res[, 2]), 1] + p <- ggplot(res, aes(threshold, ps)) + + geom_point() + + geom_vline(xintercept = recommend_thres, linetype = 2, col = "red") + + geom_text(x = recommend_thres + 0.01, y = 0.5 * max(res$ps), label = recommend_thres) + + theme_bw(base_size = 15) + print(p) + + res1 <- res[(res$threshold < (recommend_thres + 0.05)) & (res$threshold > (recommend_thres - 0.05)), ] + p <- ggplot(res1, aes(threshold, ps)) + + geom_point() + + geom_vline(xintercept = recommend_thres, linetype = 2, col = "red") + + geom_text(x = recommend_thres + 0.01, y = 0.5 * max(res1$ps), label = recommend_thres) + + theme_bw(base_size = 15) + print(p) + + message("We recommend r-threshold: ", recommend_thres, ", you can calculate again in a smaller region") + recommend_thres +}