|
a |
|
b/R/internal-functions.R |
|
|
1 |
#' @importFrom fpc clusterboot cluster.stats calinhara dudahart2 |
|
|
2 |
#' @importFrom withr with_par with_options local_options |
|
|
3 |
clustfun <- function( |
|
|
4 |
x, |
|
|
5 |
clustnr = 20, |
|
|
6 |
bootnr = 50, |
|
|
7 |
metric = "pearson", |
|
|
8 |
do.gap = TRUE, |
|
|
9 |
SE.method = "Tibs2001SEmax", |
|
|
10 |
SE.factor = .25, |
|
|
11 |
B.gap = 50, |
|
|
12 |
cln = 0, |
|
|
13 |
rseed = rseed, |
|
|
14 |
quiet = FALSE) { |
|
|
15 |
if (clustnr < 2) stop("Choose clustnr > 1") |
|
|
16 |
di <- dist.gen(t(x), method = metric) |
|
|
17 |
if (do.gap | cln > 0) { |
|
|
18 |
gpr <- NULL |
|
|
19 |
if (do.gap) { |
|
|
20 |
set.seed(rseed) |
|
|
21 |
gpr <- clusGap( |
|
|
22 |
as.matrix(di), |
|
|
23 |
FUNcluster = kmeans, |
|
|
24 |
K.max = clustnr, |
|
|
25 |
B = B.gap, |
|
|
26 |
verbose = !quiet |
|
|
27 |
) |
|
|
28 |
if (cln == 0) { |
|
|
29 |
cln <- maxSE( |
|
|
30 |
gpr$Tab[, 3], |
|
|
31 |
gpr$Tab[, 4], |
|
|
32 |
method = SE.method, |
|
|
33 |
SE.factor |
|
|
34 |
) |
|
|
35 |
} |
|
|
36 |
} |
|
|
37 |
if (cln <= 1) { |
|
|
38 |
clb <- list( |
|
|
39 |
result = list(partition = rep(1, dim(x)[2])), |
|
|
40 |
bootmean = 1 |
|
|
41 |
) |
|
|
42 |
names(clb$result$partition) <- names(x) |
|
|
43 |
return(list(x = x, clb = clb, gpr = gpr, di = di)) |
|
|
44 |
} |
|
|
45 |
clb <- clusterboot( |
|
|
46 |
di, |
|
|
47 |
B = bootnr, |
|
|
48 |
distances = FALSE, |
|
|
49 |
bootmethod = "boot", |
|
|
50 |
clustermethod = KmeansCBI, |
|
|
51 |
krange = cln, |
|
|
52 |
scaling = FALSE, |
|
|
53 |
multipleboot = FALSE, |
|
|
54 |
bscompare = TRUE, |
|
|
55 |
seed = rseed, |
|
|
56 |
count = !quiet |
|
|
57 |
) |
|
|
58 |
return(list(x = x, clb = clb, gpr = gpr, di = di)) |
|
|
59 |
} |
|
|
60 |
} |
|
|
61 |
|
|
|
62 |
Kmeansruns <- function( |
|
|
63 |
data, |
|
|
64 |
krange = 2:10, |
|
|
65 |
criterion = "ch", |
|
|
66 |
iter.max = 100, |
|
|
67 |
runs = 100, |
|
|
68 |
scaledata = FALSE, |
|
|
69 |
alpha = 0.001, |
|
|
70 |
critout = FALSE, |
|
|
71 |
plot = FALSE, |
|
|
72 |
method = "euclidean", |
|
|
73 |
...) { |
|
|
74 |
data <- as.matrix(data) |
|
|
75 |
if (criterion == "asw") sdata <- dist(data) |
|
|
76 |
if (scaledata) data <- scale(data) |
|
|
77 |
cluster1 <- 1 %in% krange |
|
|
78 |
crit <- numeric(max(krange)) |
|
|
79 |
km <- list() |
|
|
80 |
for (k in krange) { |
|
|
81 |
if (k > 1) { |
|
|
82 |
minSS <- Inf |
|
|
83 |
kmopt <- NULL |
|
|
84 |
for (i in 1:runs) { |
|
|
85 |
opar <- withr::local_options(show.error.messages = FALSE) |
|
|
86 |
repeat { |
|
|
87 |
kmm <- try(kmeans(data, k)) |
|
|
88 |
if (!is(kmm, "try-error")) break |
|
|
89 |
} |
|
|
90 |
opar <- withr::local_options(show.error.messages = TRUE) |
|
|
91 |
on.exit(withr::local_options(opar)) |
|
|
92 |
swss <- sum(kmm$withinss) |
|
|
93 |
if (swss < minSS) { |
|
|
94 |
kmopt <- kmm |
|
|
95 |
minSS <- swss |
|
|
96 |
} |
|
|
97 |
if (plot) { |
|
|
98 |
opar <- withr::local_par(ask = TRUE) |
|
|
99 |
on.exit(withr::local_par(opar)) |
|
|
100 |
pairs(data, col = kmm$cluster, main = swss) |
|
|
101 |
} |
|
|
102 |
} |
|
|
103 |
km[[k]] <- kmopt |
|
|
104 |
crit[k] <- switch(criterion, |
|
|
105 |
asw = cluster.stats(sdata, km[[k]]$cluster)$avg.silwidth, |
|
|
106 |
ch = calinhara(data, km[[k]]$cluster) |
|
|
107 |
) |
|
|
108 |
if (critout) { |
|
|
109 |
message(k, " clusters ", crit[k], "\n") |
|
|
110 |
} |
|
|
111 |
} |
|
|
112 |
} |
|
|
113 |
if (cluster1) { |
|
|
114 |
cluster1 <- dudahart2(data, km[[2]]$cluster, alpha = alpha)$cluster1 |
|
|
115 |
} |
|
|
116 |
k.best <- which.max(crit) |
|
|
117 |
if (cluster1) k.best <- 1 |
|
|
118 |
km[[k.best]]$crit <- crit |
|
|
119 |
km[[k.best]]$bestk <- k.best |
|
|
120 |
out <- km[[k.best]] |
|
|
121 |
return(out) |
|
|
122 |
} |
|
|
123 |
|
|
|
124 |
KmeansCBI <- function( |
|
|
125 |
data, |
|
|
126 |
krange, |
|
|
127 |
k = NULL, |
|
|
128 |
scaling = FALSE, |
|
|
129 |
runs = 1, |
|
|
130 |
criterion = "ch", |
|
|
131 |
method = "euclidean", |
|
|
132 |
...) { |
|
|
133 |
if (!is.null(k)) krange <- k |
|
|
134 |
if (!identical(scaling, FALSE)) { |
|
|
135 |
sdata <- scale(data, center = TRUE, scale = scaling) |
|
|
136 |
} else { |
|
|
137 |
sdata <- data |
|
|
138 |
} |
|
|
139 |
c1 <- Kmeansruns( |
|
|
140 |
sdata, |
|
|
141 |
krange, |
|
|
142 |
runs = runs, |
|
|
143 |
criterion = criterion, |
|
|
144 |
method = method, |
|
|
145 |
... |
|
|
146 |
) |
|
|
147 |
partition <- c1$cluster |
|
|
148 |
cl <- list() |
|
|
149 |
nc <- krange |
|
|
150 |
for (i in 1:nc) cl[[i]] <- partition == i |
|
|
151 |
out <- list( |
|
|
152 |
result = c1, |
|
|
153 |
nc = nc, |
|
|
154 |
clusterlist = cl, |
|
|
155 |
partition = partition, |
|
|
156 |
clustermethod = "kmeans" |
|
|
157 |
) |
|
|
158 |
return(out) |
|
|
159 |
} |
|
|
160 |
|
|
|
161 |
dist.gen <- function(x, method = "euclidean") { |
|
|
162 |
if (method %in% c("spearman", "pearson", "kendall")) { |
|
|
163 |
as.dist(1 - cor(t(x), method = method)) |
|
|
164 |
} else { |
|
|
165 |
dist(x, method = method) |
|
|
166 |
} |
|
|
167 |
} |
|
|
168 |
|
|
|
169 |
binompval <- function(p, N, n) { |
|
|
170 |
pval <- pbinom(n, round(N, 0), p, lower.tail = TRUE) |
|
|
171 |
filter <- !is.na(pval) & pval > 0.5 |
|
|
172 |
pval[filter] <- 1 - pval[filter] |
|
|
173 |
return(pval) |
|
|
174 |
} |
|
|
175 |
|
|
|
176 |
add_legend <- function(...) { |
|
|
177 |
opar <- withr::local_par( |
|
|
178 |
fig = c(0, 1, 0, 1), |
|
|
179 |
oma = c(0, 0, 0, 0), |
|
|
180 |
mar = c(0, 0, 0, 0), |
|
|
181 |
new = TRUE |
|
|
182 |
) |
|
|
183 |
on.exit(withr::local_par(opar)) |
|
|
184 |
plot( |
|
|
185 |
x = 0, |
|
|
186 |
y = 0, |
|
|
187 |
type = "n", |
|
|
188 |
bty = "n", |
|
|
189 |
xaxt = "n", |
|
|
190 |
yaxt = "n" |
|
|
191 |
) |
|
|
192 |
legend(...) |
|
|
193 |
} |
|
|
194 |
|
|
|
195 |
downsample <- function(x, n, dsn) { |
|
|
196 |
x <- round(x[, apply(x, 2, sum, na.rm = TRUE) >= n], 0) |
|
|
197 |
nn <- min(apply(x, 2, sum)) |
|
|
198 |
for (j in 1:dsn) { |
|
|
199 |
z <- data.frame(GENEID = rownames(x)) |
|
|
200 |
rownames(z) <- rownames(x) |
|
|
201 |
initv <- rep(0, nrow(z)) |
|
|
202 |
for (i in seq_len(ncol(x))) { |
|
|
203 |
y <- aggregate( |
|
|
204 |
rep(1, nn), list(sample( |
|
|
205 |
rep(rownames(x), x[, i]), nn |
|
|
206 |
)), |
|
|
207 |
sum |
|
|
208 |
) |
|
|
209 |
na <- names(x)[i] |
|
|
210 |
names(y) <- c("GENEID", na) |
|
|
211 |
rownames(y) <- y$GENEID |
|
|
212 |
z[, na] <- initv |
|
|
213 |
k <- intersect(rownames(z), y$GENEID) |
|
|
214 |
z[k, na] <- y[k, na] |
|
|
215 |
z[is.na(z[, na]), na] <- 0 |
|
|
216 |
} |
|
|
217 |
rownames(z) <- as.vector(z$GENEID) |
|
|
218 |
ds <- if (j == 1) z[, -1] else ds + z[, -1] |
|
|
219 |
} |
|
|
220 |
ds <- ds / dsn + .1 |
|
|
221 |
return(ds) |
|
|
222 |
} |
|
|
223 |
|
|
|
224 |
eval.pred <- function(pred.class, true.class, class1, performance) { |
|
|
225 |
for (index in seq_len(length(pred.class))) { |
|
|
226 |
pred <- pred.class[index] |
|
|
227 |
true <- true.class[index] |
|
|
228 |
if (pred == true && true == class1) { |
|
|
229 |
performance["TP"] <- performance["TP"] + 1 |
|
|
230 |
} else if (pred != true && true == class1) { |
|
|
231 |
performance["FN"] <- performance["FN"] + 1 |
|
|
232 |
} else if (pred != true && true != class1) { |
|
|
233 |
performance["FP"] <- performance["FP"] + 1 |
|
|
234 |
} else if (pred == true && true != class1) { |
|
|
235 |
performance["TN"] <- performance["TN"] + 1 |
|
|
236 |
} |
|
|
237 |
} |
|
|
238 |
return(performance) |
|
|
239 |
} |
|
|
240 |
|
|
|
241 |
SN <- function(con.mat) { |
|
|
242 |
TP <- con.mat[1, 1] |
|
|
243 |
FN <- con.mat[2, 1] |
|
|
244 |
return(TP / (TP + FN)) |
|
|
245 |
} |
|
|
246 |
|
|
|
247 |
SP <- function(con.mat) { |
|
|
248 |
TN <- con.mat[2, 2] |
|
|
249 |
FP <- con.mat[1, 2] |
|
|
250 |
return(TN / (TN + FP)) |
|
|
251 |
} |
|
|
252 |
|
|
|
253 |
ACC <- function(con.mat) { |
|
|
254 |
TP <- con.mat[1, 1] |
|
|
255 |
FN <- con.mat[2, 1] |
|
|
256 |
TN <- con.mat[2, 2] |
|
|
257 |
FP <- con.mat[1, 2] |
|
|
258 |
return((TP + TN) / (TP + FN + TN + FP)) |
|
|
259 |
} |
|
|
260 |
|
|
|
261 |
MCC <- function(con.mat) { |
|
|
262 |
TP <- con.mat[1, 1] |
|
|
263 |
FN <- con.mat[2, 1] |
|
|
264 |
TN <- con.mat[2, 2] |
|
|
265 |
FP <- con.mat[1, 2] |
|
|
266 |
denom <- sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) |
|
|
267 |
denom <- ifelse(denom == 0, NA, denom) |
|
|
268 |
return((TP * TN - FP * FN) / denom) |
|
|
269 |
} |
|
|
270 |
|
|
|
271 |
#' @title Reformat Siggenes Table |
|
|
272 |
#' @description Reformats the Siggenes table output from the SAMR package |
|
|
273 |
#' @param table output from `samr::samr.compute.siggenes.table` |
|
|
274 |
#' @seealso replaceDecimals |
|
|
275 |
#' @author Waldir Leoncio |
|
|
276 |
reformatSiggenes <- function(table) { |
|
|
277 |
if (is.null(table)) { |
|
|
278 |
return(table) |
|
|
279 |
} |
|
|
280 |
table <- as.data.frame(table) |
|
|
281 |
# ========================================================================== |
|
|
282 |
# Replacing decimal separators |
|
|
283 |
# ========================================================================== |
|
|
284 |
table[, "Score(d)"] <- replaceDecimals(table[, "Score(d)"]) |
|
|
285 |
table[, "Numerator(r)"] <- replaceDecimals(table[, "Numerator(r)"]) |
|
|
286 |
table[, "Denominator(s+s0)"] <- replaceDecimals(table[, "Denominator(s+s0)"]) |
|
|
287 |
table[, "Fold Change"] <- replaceDecimals(table[, "Fold Change"]) |
|
|
288 |
table[, "q-value(%)"] <- replaceDecimals(table[, "q-value(%)"]) |
|
|
289 |
# ========================================================================== |
|
|
290 |
# Changing vector classes |
|
|
291 |
# ========================================================================== |
|
|
292 |
table[, "Row"] <- as.numeric(table[, "Row"]) |
|
|
293 |
table[, "Score(d)"] <- as.numeric(table[, "Score(d)"]) |
|
|
294 |
table[, "Numerator(r)"] <- as.numeric(table[, "Numerator(r)"]) |
|
|
295 |
table[, "Denominator(s+s0)"] <- as.numeric(table[, "Denominator(s+s0)"]) |
|
|
296 |
table[, "Fold Change"] <- as.numeric(table[, "Fold Change"]) |
|
|
297 |
table[, "q-value(%)"] <- as.numeric(table[, "q-value(%)"]) |
|
|
298 |
# ========================================================================== |
|
|
299 |
# Returning output |
|
|
300 |
# ========================================================================== |
|
|
301 |
return(table) |
|
|
302 |
} |
|
|
303 |
|
|
|
304 |
#' @title Replace Decimals |
|
|
305 |
#' @description Replaces decimals separators between comma and periods on a |
|
|
306 |
#' character vector |
|
|
307 |
#' @note This function was especially designed to be used with retormatSiggenes |
|
|
308 |
#' @param x vector of characters |
|
|
309 |
#' @param from decimal separator on input file |
|
|
310 |
#' @param to decimal separator for output file |
|
|
311 |
#' @seealso reformatSiggenes |
|
|
312 |
replaceDecimals <- function(x, from = ",", to = ".") { |
|
|
313 |
x <- gsub(",", ".", x) |
|
|
314 |
return(x) |
|
|
315 |
} |
|
|
316 |
|
|
|
317 |
#' @title Prepare Example Dataset |
|
|
318 |
#' @description Internal function that prepares a pre-treated dataset for use in |
|
|
319 |
#' several examples |
|
|
320 |
#' @param dataset Dataset used for transformation |
|
|
321 |
#' @param save save results? |
|
|
322 |
#' @details This function serves the purpose of treating datasets such as |
|
|
323 |
#' valuesG1msReduced to reduce examples of other functions by bypassing some |
|
|
324 |
#' analysis steps covered in the vignettes. |
|
|
325 |
#' @return Two rda files, ones for K-means clustering and another for |
|
|
326 |
#' Model-based clustering. |
|
|
327 |
#' @author Waldir Leoncio |
|
|
328 |
prepExampleDataset <- function(dataset, save = TRUE) { |
|
|
329 |
# ========================================================================== |
|
|
330 |
# Initial data treatment |
|
|
331 |
# ========================================================================== |
|
|
332 |
message("Treating dataset") |
|
|
333 |
sc <- DISCBIO(dataset) |
|
|
334 |
sc <- NoiseFiltering( |
|
|
335 |
sc, |
|
|
336 |
percentile = 0.9, CV = 0.2, export = FALSE, plot = FALSE, quiet = TRUE |
|
|
337 |
) |
|
|
338 |
sc <- Normalizedata(sc) |
|
|
339 |
sc <- FinalPreprocessing(sc, export = FALSE, quiet = TRUE) |
|
|
340 |
# ========================================================================== |
|
|
341 |
# Clustering |
|
|
342 |
# ========================================================================== |
|
|
343 |
message("K-means clustering") |
|
|
344 |
sc_k <- Clustexp(sc, cln = 3, quiet = TRUE) |
|
|
345 |
sc_k <- comptSNE(sc_k, quiet = TRUE) |
|
|
346 |
valuesG1msReduced_treated_K <- sc_k |
|
|
347 |
message("Model-based clustering") |
|
|
348 |
sc_mb <- Exprmclust(sc, quiet = TRUE) |
|
|
349 |
sc_mb <- comptSNE(sc_mb, rseed = 15555, quiet = TRUE) |
|
|
350 |
valuesG1msReduced_treated_MB <- sc_mb |
|
|
351 |
# ========================================================================== |
|
|
352 |
# Output |
|
|
353 |
# ========================================================================== |
|
|
354 |
message("Saving datasets") |
|
|
355 |
if (save) { |
|
|
356 |
save( |
|
|
357 |
valuesG1msReduced_treated_K, |
|
|
358 |
file = file.path("data", "valuesG1msReduced_treated_K.rda") |
|
|
359 |
) |
|
|
360 |
save( |
|
|
361 |
valuesG1msReduced_treated_MB, |
|
|
362 |
file = file.path("data", "valuesG1msReduced_treated_MB.rda") |
|
|
363 |
) |
|
|
364 |
} else { |
|
|
365 |
message("Not saving dataset because (save == FALSE)") |
|
|
366 |
} |
|
|
367 |
} |
|
|
368 |
|
|
|
369 |
#' @title Retries a URL |
|
|
370 |
#' @description Retries a URL |
|
|
371 |
#' @param data A gene list |
|
|
372 |
#' @param species The taxonomy name/id. Default is "9606" for Homo sapiens |
|
|
373 |
#' @param outputFormat format of the output. Can be "highres_image", "tsv", |
|
|
374 |
#' "json", "tsv-no-header", "xml" |
|
|
375 |
#' @param maxRetries maximum number of attempts to connect to the STRING api. |
|
|
376 |
#' @param successCode Status code number that represents success |
|
|
377 |
#' @return either the output of httr::GET or an error message |
|
|
378 |
#' @importFrom httr GET status_code |
|
|
379 |
#' @author Waldir Leoncio |
|
|
380 |
retrieveURL <- function( |
|
|
381 |
data, species, outputFormat, maxRetries = 3, successCode = 200) { |
|
|
382 |
# ======================================================== # |
|
|
383 |
# Setting up retrieval # |
|
|
384 |
# ======================================================== # |
|
|
385 |
string_api_url <- "https://string-db.org/api/" |
|
|
386 |
method <- "network" |
|
|
387 |
url <- paste0( |
|
|
388 |
string_api_url, outputFormat, "/", method, "?identifiers=", |
|
|
389 |
paste(as.character(data), collapse = "%0d"), "&species=", |
|
|
390 |
species |
|
|
391 |
) |
|
|
392 |
|
|
|
393 |
# ======================================================== # |
|
|
394 |
# Retrieving URL # |
|
|
395 |
# ======================================================== # |
|
|
396 |
message("Retrieving URL. Please wait...") |
|
|
397 |
repos <- GET(url) |
|
|
398 |
failedGET <- status_code(repos) != successCode |
|
|
399 |
r <- 1 |
|
|
400 |
while (failedGET & (r <= maxRetries)) { |
|
|
401 |
message("Failed retrieval. Retry ", r, " out of ", maxRetries, ".") |
|
|
402 |
repos <- GET(url) |
|
|
403 |
failedGET <- status_code(repos) != successCode |
|
|
404 |
r <- r + 1 |
|
|
405 |
} |
|
|
406 |
|
|
|
407 |
# ======================================================== # |
|
|
408 |
# Final output # |
|
|
409 |
# ======================================================== # |
|
|
410 |
if (failedGET) { |
|
|
411 |
stop( |
|
|
412 |
"Unable to retrieve URL. Please check the parameters ", |
|
|
413 |
"passed to the Networking() function, increase the ", |
|
|
414 |
"'maxRetries' parameter or try again later." |
|
|
415 |
) |
|
|
416 |
} else { |
|
|
417 |
message("Successful retrieval.") |
|
|
418 |
return(repos) |
|
|
419 |
} |
|
|
420 |
} |