|
a |
|
b/R/2-3.RMT.R |
|
|
1 |
# =========2.2RMT optimize===== |
|
|
2 |
|
|
|
3 |
#' Get RMT threshold for a correlation matrix |
|
|
4 |
#' |
|
|
5 |
#' @param occor.r a corr object or a correlation matrix |
|
|
6 |
#' @param min_threshold min_threshold |
|
|
7 |
#' @param max_threshold max_threshold |
|
|
8 |
#' @param step step |
|
|
9 |
#' @param gif render a .gif file? |
|
|
10 |
#' @param verbose verbose |
|
|
11 |
#' @param out_dir output dir |
|
|
12 |
#' |
|
|
13 |
#' @return a r-threshold |
|
|
14 |
#' @export |
|
|
15 |
#' @references |
|
|
16 |
#' J. Zhou, Y. Deng, FALSE. Luo, Z. He, Q. Tu, X. Zhi, (2010) Functional Molecular Ecological Networks, doi:10.1128/mBio.00169-10. |
|
|
17 |
#' <https://matstat.org/content_en/RMT/RMThreshold_Intro.pdf> |
|
|
18 |
#' @examples |
|
|
19 |
#' \donttest{ |
|
|
20 |
#' data(otutab, package = "pcutils") |
|
|
21 |
#' t(otutab) -> totu |
|
|
22 |
#' c_net_calculate(totu) -> corr |
|
|
23 |
#' rmt(corr) |
|
|
24 |
#' # recommend: 0.69 |
|
|
25 |
#' c_net_build(corr, r_threshold = 0.69) -> co_net_rmt |
|
|
26 |
#' } |
|
|
27 |
RMT_threshold <- function(occor.r, out_dir, min_threshold = 0.5, max_threshold = 0.8, |
|
|
28 |
step = 0.02, gif = FALSE, verbose = FALSE) { |
|
|
29 |
nwd <- getwd() |
|
|
30 |
on.exit(setwd(nwd)) |
|
|
31 |
|
|
|
32 |
setwd(out_dir) |
|
|
33 |
if (inherits(occor.r, "corr")) occor.r <- occor.r$r |
|
|
34 |
if (!dir.exists("./RMT_temp")) dir.create("./RMT_temp") |
|
|
35 |
diag(occor.r) <- 0 |
|
|
36 |
|
|
|
37 |
if (max_threshold >= max(abs(occor.r))) max_threshold <- (max(abs(occor.r)) - step) |
|
|
38 |
if (min_threshold >= max_threshold) min_threshold <- max_threshold - 10 * step |
|
|
39 |
|
|
|
40 |
thres_seq <- seq(min_threshold, max_threshold, step) |
|
|
41 |
|
|
|
42 |
res <- data.frame() |
|
|
43 |
for (i in seq_len(length(thres_seq))) { |
|
|
44 |
threshold <- thres_seq[i] |
|
|
45 |
if (!verbose) pcutils::dabiao(paste0("Calculating", i, ": threshold =", signif(threshold, 3)), print = TRUE) |
|
|
46 |
corr_r1 <- occor.r |
|
|
47 |
corr_r1[abs(corr_r1) < threshold] <- 0 |
|
|
48 |
# calculate eigenvalues |
|
|
49 |
rand.mat <- corr_r1 |
|
|
50 |
eigenvalues <- eigen(rand.mat, only.values = TRUE)$values |
|
|
51 |
eigenvalues <- eigenvalues[order(eigenvalues)] / max(abs(eigenvalues)) |
|
|
52 |
eigenvalues <- pcutils::remove.outliers(unique(eigenvalues)) |
|
|
53 |
|
|
|
54 |
# get the NNDS |
|
|
55 |
{ # uf <- rm.unfold.gauss(eigenvalues,pop.up = TRUE) |
|
|
56 |
dens <- density(eigenvalues, kernel = "gaussian") |
|
|
57 |
midpoints <- \(x)(x[-length(x)] + 0.5 * diff(x)) |
|
|
58 |
scale.function <- approx(dens$x, dens$y, xout = midpoints(eigenvalues)) |
|
|
59 |
ev.spacing <- diff(eigenvalues) |
|
|
60 |
ev.spacing <- ev.spacing * scale.function$y |
|
|
61 |
ev.spacing <- ev.spacing / mean(ev.spacing) |
|
|
62 |
} |
|
|
63 |
|
|
|
64 |
ev.spacing <- ev.spacing[ev.spacing <= 3] |
|
|
65 |
# test whether fit possion? |
|
|
66 |
p_ks_test <- ks.test(unique(ev.spacing), "pexp", 1)$p.value |
|
|
67 |
# get sse |
|
|
68 |
# sse = rm.sse(ev.spacing) |
|
|
69 |
sse <- get_sse(ev.spacing) |
|
|
70 |
log_sse <- log(sse) |
|
|
71 |
|
|
|
72 |
# maximum likelihood |
|
|
73 |
evs <- ev.spacing[ev.spacing != 0] |
|
|
74 |
N <- length(evs) |
|
|
75 |
log_LE <- -sum(evs) / N |
|
|
76 |
log_LW <- log(pi / 2) + sum(log(evs)) / N - 0.25 * pi * sum(evs^2) / N |
|
|
77 |
|
|
|
78 |
# save png |
|
|
79 |
{ |
|
|
80 |
histo <- hist(ev.spacing, breaks = seq(min(ev.spacing), max(ev.spacing), len = 51), plot = FALSE) |
|
|
81 |
grDevices::png(paste0("RMT_temp/rmt_nnsd", i, ".png"), height = 600, width = 700, res = 130) |
|
|
82 |
nnsd_plot( |
|
|
83 |
histo = histo, title = "Eigenvalue spacing distribution (NNSD)", threshold = threshold, |
|
|
84 |
dis_GOE = log_LW, dis_possion = log_LE, p_ks_test = p_ks_test |
|
|
85 |
) |
|
|
86 |
grDevices::dev.off() |
|
|
87 |
} |
|
|
88 |
res <- rbind(res, data.frame(threshold, p_ks_test, log_sse, log_LW, log_LE)) |
|
|
89 |
} |
|
|
90 |
message("The Intermediate files saved in ", out_dir, "/RMT_temp/.") |
|
|
91 |
# transfer to gif |
|
|
92 |
if (gif) { |
|
|
93 |
lib_ps("gifski", library = FALSE) |
|
|
94 |
gifski::gifski(paste0("RMT_temp/rmt_nnsd", seq_len(length(thres_seq)), ".png"), |
|
|
95 |
gif_file = "RMT_temp/rmt_nnsd.gif" |
|
|
96 |
) |
|
|
97 |
} |
|
|
98 |
r_threshold <- (res[which(res$log_LW == min(res$log_LW)), "threshold"] + |
|
|
99 |
res[which(res$log_LE == max(res$log_LE)), "threshold"]) / 2 |
|
|
100 |
res <- list(res = res, r_threshold = r_threshold) |
|
|
101 |
class(res) <- c("rmt_res", class(res)) |
|
|
102 |
return(res) |
|
|
103 |
} |
|
|
104 |
|
|
|
105 |
#' Plot a rmt_res |
|
|
106 |
#' |
|
|
107 |
#' @param x rmt_res |
|
|
108 |
#' @param ... Additional arguments |
|
|
109 |
#' |
|
|
110 |
#' @return ggplot |
|
|
111 |
#' @exportS3Method |
|
|
112 |
#' @method plot rmt_res |
|
|
113 |
plot.rmt_res <- function(x, ...) { |
|
|
114 |
threshold <- value <- variable <- xi <- y <- NULL |
|
|
115 |
res <- x$res |
|
|
116 |
linedf <- data.frame( |
|
|
117 |
variable = c("p_ks_test", "log_sse", "log_LW", "log_LE"), |
|
|
118 |
xi = c( |
|
|
119 |
res[which(res$p_ks_test == max(res$p_ks_test))[1], "threshold"], |
|
|
120 |
res[which(res$log_sse == min(res$log_sse))[1], "threshold"], |
|
|
121 |
res[which(res$log_LW == min(res$log_LW))[1], "threshold"], |
|
|
122 |
res[which(res$log_LE == max(res$log_LE))[1], "threshold"] |
|
|
123 |
), |
|
|
124 |
x = max(res$threshold) - min(res$threshold), |
|
|
125 |
y = apply(res[, -1], 2, max) |
|
|
126 |
) |
|
|
127 |
|
|
|
128 |
reshape2::melt(res, "threshold") -> md |
|
|
129 |
|
|
|
130 |
# filter(threshold<0.77)%>% |
|
|
131 |
p <- ggplot(md, aes(threshold, value)) + |
|
|
132 |
geom_point(aes(col = variable)) + |
|
|
133 |
geom_line(aes(col = variable)) + |
|
|
134 |
scale_color_manual(values = get_cols(4, "col1")) + |
|
|
135 |
facet_wrap(. ~ variable, scales = "free_y") + |
|
|
136 |
theme_bw() + |
|
|
137 |
xlab(NULL) + |
|
|
138 |
geom_text(data = linedf, aes(x = xi - 0.02 * x, y = 0.5 * y, label = xi)) + |
|
|
139 |
geom_vline(data = linedf, aes(xintercept = xi), linetype = 2, col = "red") + |
|
|
140 |
theme(legend.position = "none") |
|
|
141 |
|
|
|
142 |
message(paste("recommend r_threshold: ", mean(linedf$xi))) |
|
|
143 |
return(p) |
|
|
144 |
} |
|
|
145 |
|
|
|
146 |
nnsd_plot <- \(histo = histo, title = title, threshold = threshold, |
|
|
147 |
dis_GOE = dis_GOE, dis_possion = dis_possion, p_ks_test = p_ks_test) { |
|
|
148 |
plot(histo, freq = FALSE, col = "#F4FCA1", main = title, font.main = 1, xlab = "eigenvalue spacing", ylab = "PDF of eigenvalue spacing") |
|
|
149 |
{ |
|
|
150 |
actual.ymax <- par("yaxp")[2] |
|
|
151 |
x0 <- -log(actual.ymax * 0.98) |
|
|
152 |
possion_dis <- \(x)exp(-x) |
|
|
153 |
graphics::curve(possion_dis, |
|
|
154 |
from = max(x0, min(histo$breaks)), |
|
|
155 |
to = max(histo$breaks), n = 1001, add = TRUE, type = "l", lty = 1, col = "#EB34FF", lwd = 2 |
|
|
156 |
) |
|
|
157 |
} |
|
|
158 |
{ |
|
|
159 |
GOE <- function(x) pi / 2 * x * exp(-pi / 4 * x^2) |
|
|
160 |
graphics::curve(GOE, |
|
|
161 |
from = min(histo$breaks), |
|
|
162 |
to = max(histo$breaks), n = 1001, add = TRUE, type = "l", |
|
|
163 |
lty = 1, col = "blue", lwd = 2 |
|
|
164 |
) |
|
|
165 |
} |
|
|
166 |
|
|
|
167 |
if ((!is.na(dis_GOE)) && (!is.na(dis_possion))) { |
|
|
168 |
graphics::mtext(side = 3, paste( |
|
|
169 |
"Distance to GOE =", signif(dis_GOE, 3), |
|
|
170 |
"\nDistance to Possion =", signif(dis_possion, 3), "; ks_test p.value for possion =", signif(p_ks_test, 3) |
|
|
171 |
), col = "#878787", cex = 0.6) |
|
|
172 |
} |
|
|
173 |
|
|
|
174 |
if (!is.na(threshold)) graphics::mtext(side = 4, paste("threshold =", signif(threshold, 4))) |
|
|
175 |
|
|
|
176 |
graphics::legend("topright", inset = 0.05, c("Possion", "GOE"), col = c("#EB34FF", "blue"), lty = 1, lwd = 2, cex = 0.8) |
|
|
177 |
} |
|
|
178 |
|
|
|
179 |
trapez <- \(x, y){ |
|
|
180 |
ind <- 2:length(x) |
|
|
181 |
as.double((x[ind] - x[ind - 1]) %*% (y[ind] + y[ind - 1])) / 2 |
|
|
182 |
} |
|
|
183 |
|
|
|
184 |
get_sse <- \(ev.spacing){ |
|
|
185 |
dens <- density(ev.spacing) |
|
|
186 |
N <- 20 |
|
|
187 |
x <- seq(min(ev.spacing), max(ev.spacing), len = 1000) |
|
|
188 |
A <- exp(-min(ev.spacing)) - exp(-max(ev.spacing)) |
|
|
189 |
xs <- numeric(N + 1) |
|
|
190 |
xs[1] <- min(ev.spacing) |
|
|
191 |
for (i in 1:N) xs[i + 1] <- -log(exp(-xs[i]) - A / N) |
|
|
192 |
area <- numeric(N) |
|
|
193 |
for (i in 1:N) { |
|
|
194 |
xsec <- x[(x > xs[i]) & (x < xs[i + 1])] |
|
|
195 |
xsec <- c(xs[i], xsec, xs[i + 1]) |
|
|
196 |
ysec <- approx(dens$x, dens$y, xout = xsec)$y |
|
|
197 |
area[i] <- trapez(xsec, ysec) |
|
|
198 |
} |
|
|
199 |
sse <- sum((area[i] - A / N)^2) |
|
|
200 |
sse |
|
|
201 |
} |
|
|
202 |
|
|
|
203 |
#' Get RMT threshold for a correlation matrix roughly |
|
|
204 |
#' |
|
|
205 |
#' @export |
|
|
206 |
#' @return recommend threshold |
|
|
207 |
#' @rdname RMT_threshold |
|
|
208 |
rmt <- function(occor.r, min_threshold = 0.5, max_threshold = 0.85, step = 0.01) { |
|
|
209 |
if (inherits(occor.r, "corr")) occor.r <- occor.r$r |
|
|
210 |
NNSD <- \(x)abs(diff(x)) |
|
|
211 |
|
|
|
212 |
s <- seq(0, 3, 0.1) |
|
|
213 |
poisson_d <- exp(-s) |
|
|
214 |
nnsdpois <- density(NNSD(poisson_d)) |
|
|
215 |
|
|
|
216 |
ps <- c() |
|
|
217 |
threshold <- c() |
|
|
218 |
|
|
|
219 |
for (i in seq(min_threshold, max_threshold, step)) { |
|
|
220 |
corr_r1 <- occor.r |
|
|
221 |
corr_r1[abs(corr_r1) < i] <- 0 |
|
|
222 |
{ |
|
|
223 |
eigen_res <- sort(eigen(corr_r1)$value) |
|
|
224 |
# spline to eigen_res |
|
|
225 |
check <- tryCatch(ssp <- smooth.spline(eigen_res, control.spar = list(low = 0, high = 3)), |
|
|
226 |
error = \(e) { |
|
|
227 |
TRUE |
|
|
228 |
} |
|
|
229 |
) |
|
|
230 |
if (rlang::is_true(check)) next |
|
|
231 |
nnsdw <- density(NNSD(ssp$y)) |
|
|
232 |
chival <- sum((nnsdw$y - nnsdpois$y)^2 / nnsdpois$y / 1e3) |
|
|
233 |
} |
|
|
234 |
|
|
|
235 |
ps <- c(ps, chival) |
|
|
236 |
threshold <- c(threshold, i) |
|
|
237 |
if (((i * 100) %% 5 == 0)) { |
|
|
238 |
message(paste0("Calculating: ", i)) |
|
|
239 |
} |
|
|
240 |
} |
|
|
241 |
|
|
|
242 |
res <- data.frame(threshold, ps) |
|
|
243 |
recommend_thres <- res[which.min(res[, 2]), 1] |
|
|
244 |
p <- ggplot(res, aes(threshold, ps)) + |
|
|
245 |
geom_point() + |
|
|
246 |
geom_vline(xintercept = recommend_thres, linetype = 2, col = "red") + |
|
|
247 |
geom_text(x = recommend_thres + 0.01, y = 0.5 * max(res$ps), label = recommend_thres) + |
|
|
248 |
theme_bw(base_size = 15) |
|
|
249 |
print(p) |
|
|
250 |
|
|
|
251 |
res1 <- res[(res$threshold < (recommend_thres + 0.05)) & (res$threshold > (recommend_thres - 0.05)), ] |
|
|
252 |
p <- ggplot(res1, aes(threshold, ps)) + |
|
|
253 |
geom_point() + |
|
|
254 |
geom_vline(xintercept = recommend_thres, linetype = 2, col = "red") + |
|
|
255 |
geom_text(x = recommend_thres + 0.01, y = 0.5 * max(res1$ps), label = recommend_thres) + |
|
|
256 |
theme_bw(base_size = 15) |
|
|
257 |
print(p) |
|
|
258 |
|
|
|
259 |
message("We recommend r-threshold: ", recommend_thres, ", you can calculate again in a smaller region") |
|
|
260 |
recommend_thres |
|
|
261 |
} |