[e26484]: / OmicsFold / R / feature_loadings.R

Download this file

425 lines (387 with data), 15.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#' Extract sPLS-DA feature selection stability
#'
#' @description
#' Extract feature selection stabilities for a given component from a
#' performance validated mixOmics sPLS-DA model Feature stabilities are an
#' important indicator of the confidence that a selected feature is predictive
#' for the outcome classes in the model, and hence (in combination with the
#' loading weight) is likely to be of biological significance.
#'
#' @param splsda.model Trained mixOmics sPLS-DA model.
#' @param splsda.perf Performance evaluation of the sPLS-DA model generated by
#' mixOmics perf.
#' @param comp Component number of which to retrieve feature selection
#' stabilities.
#'
#' @return A data frame containing the features selected for the specificed
#' component and their relative stability, as a proportion of trained models
#' 0-1.
#' @export
#'
#' @examples
#' \dontrun{
#' feature.selection.stability(splsda.analysis, perf.splsda.analysis, comp=1)
#' }
feature.selection.stability <- function(splsda.model, splsda.perf, comp) {
ind.match = match(selectVar(splsda.model, comp = comp)$name,
names(splsda.perf$features$stable[[comp]]))
freq = as.numeric(splsda.perf$features$stable[[comp]][ind.match])
stability <- data.frame(selectVar(splsda.model, comp = comp)$value, freq)
colnames(stability) <- c("value", "stability")
return(stability)
}
#' Get an sPLS-DA loadings table
#'
#' @description
#' Get a dataframe indicating the loading factors for a trained mixOmics sPLS-DA
#' model. Limited to two components.
#'
#' @param splsda.analysis Trained mixOmics sPLS-DA model
#'
#' @return Data frame including ordered loadings over the first two components
#' of the model. Selected features are ordered by component one followed by
#' component two, and then from highest to lowest (in absolute terms).
#' @export
#'
#' @examples
#' \dontrun{
#' get.loadings.table(splsda.analysis)
#' }
get.loadings.table <- function(splsda.analysis) {
rna.loadings <- splsda.analysis$loadings$X
rna.loadings <- rna.loadings[rowSums(rna.loadings) != 0,]
rna.loadings[rna.loadings == 0] <- NA
rna.loadings <- as.data.frame(rna.loadings)
# Sort by components columns in reverse order
for (column.id in rev(colnames(rna.loadings))) {
rna.loadings <-
rna.loadings[order(abs(rna.loadings[, column.id]), decreasing = TRUE), ,
drop = FALSE]
}
return(rna.loadings)
}
#' Extract DIABLO feature selection stability
#'
#' @description
#' Extract feature selection stabilities for a given component from a
#' performance validated mixOmics multi-omics DIABLO model. Feature stabilities
#' are an important indicator of the confidence that a selected feature is
#' predictive for the outcome classes in the model, and hence (in combination
#' with the loading weight) is likely to be of biological significance. Features
#' are retrieved for a single specified block and component.
#'
#' @param diablo.perf Performance evaluation of the DIABLO model generated by
#' mixOmics perf.
#' @param comp Component number for which to retrieve feature selection
#' stabilities.
#' @param block Block number for which to retrieve feature selection
#' stabilities.
#'
#' @return A data frame containing the the features selected for the specificed
#' component and their relative stability, as a proportion of trained models
#' 0-1.
#' @export
#'
#' @examples
#' \dontrun{
#' diablo.selection.stability(perf.diablo.analysis, comp=1, block=1)
#' }
diablo.selection.stability <- function(diablo.perf, comp, block) {
stable <- diablo.perf$features$stable
stable.wms <- lapply(stable, "[[", block)
stable.wms <- lapply(stable.wms, "[[", comp)
melted <- reshape2::melt(stable.wms)
stability.mean <- melted %>%
group_by(Var1) %>%
summarise(stability = mean(value)) %>%
arrange(desc(stability)) %>%
dplyr::filter(!is.na(Var1)) %>%
as.data.frame()
colnames(stability.mean) <- c('feature', 'stability')
rownames(stability.mean) <- stability.mean$feature
return(stability.mean)
}
#' Get a DIABLO loadings table
#'
#' @description
#' Get a dataframe indicating the top ranking loading factors for a trained
#' mixOmics multi-omics DIABLO model for a provided block from the model.
#'
#' @param diablo.loadings Loadings from the trained model.
#' @param feature.count The number of top loadings from each component to
#' include.
#'
#' @return A data frame containing the top (by absolute magnitude) factor
#' loadings for the first, then second component, ordered by absolute weight.
#' @export
#'
#' @examples
#' \dontrun{
#' get.diablo.top.loadings(diablo.tuned.analysis$loadings$metab)
#' }
get.diablo.top.loadings <- function(diablo.loadings, feature.count = 20) {
rna.loadings <- diablo.loadings
rna.loadings <- rna.loadings[rowSums(rna.loadings) != 0,]
rna.loadings[rna.loadings == 0] <- NA
rna.loadings <- as.data.frame(rna.loadings)
# Construct new data frame with each component's top features included
loadings <- data.frame()
for (col.index in 1:ncol(rna.loadings)) {
rna.loadings <-
rna.loadings[order(abs(rna.loadings[, col.index]), decreasing = TRUE), ,
drop = FALSE]
col.name <- colnames(rna.loadings)[col.index]
rows.to.copy <- min(feature.count, nrow(rna.loadings))
for (row.index in 1:rows.to.copy) {
row.name <- rownames(rna.loadings)[row.index]
loadings[row.name, col.name] <- rna.loadings[row.name, col.name]
}
}
# Sort the columns, going from right to left
for (column.id in rev(colnames(loadings))) {
loadings <-
loadings[order(abs(loadings[, column.id]), decreasing = TRUE), ,
drop = FALSE]
}
return(loadings)
}
#' Get top loadings for a single component DIABLO model
#'
#' @description
#' Get a dataframe indicating the top ranking loading factors for a trained
#' mixOmics multi-omics DIABLO model for a provided block from the model.
#'
#' @param diablo.loadings DIABLO loadings for a single component model.
#' @param feature.count The number of top loadings from each component to
#' include.
#'
#' @return A data frame containing the top (by absolute magnitude) factor
#' loadings for the single component, ordered by absolute weight.
#' @export
#'
#' @examples
#' \dontrun{
#' get.diablo.top.loadings.1comp(diablo.tuned.analysis$loadings$metab)
#' }
get.diablo.top.loadings.1comp <- function(diablo.loadings, feature.count = 20) {
return(get.diablo.top.loadings(diablo.loadings, feature.count))
}
#' Get DIABLO top loadings and stabilities
#'
#' @description
#' Get a dataframe showing the top ranked loading factors for a trained mixOmics
#' multi-omics DIABLO model along with selection stability values. Where a
#' feature appears in more than one component, the selection stability value
#' shown is for the lowest numbered component.
#'
#' @param trained.model A multiomics DIABLO model.
#' @param perf.result The results from a perf test on the above model.
#' @param block The name or index of the block to get the data for.
#' @param feature.count The number of top loadings from each component to
#' include.
#'
#' @return A dataframe containing the top (by absolute magnitude) factor
#' loadings for each component and selection stability values for those
#' features.
#' @export
#'
#' @examples
#' \dontrun{
#' get.diablo.top.loadings.with.stability(diablo.model, diablo.perf.result, 'species', 100)
#' }
get.diablo.top.loadings.with.stability <- function(trained.model, perf.result,
block, feature.count = 20) {
loadings <- get.diablo.top.loadings(trained.model$loadings[[block]],
feature.count = feature.count)
# Bind on stability values in reverse component order
for (comp in rev(colnames(loadings))) {
selection.stability <-
diablo.selection.stability(perf.result, comp = comp, block = block)
component.features <-
rownames(loadings[!is.na(loadings[comp]), comp, drop = FALSE])
loadings[component.features, "stability"] <-
selection.stability[component.features, "stability"]
}
# Remove any features with only NA values
loadings <- loadings[apply(loadings, 1, function(x) { !all(is.na(x)) }) ,]
return(loadings)
}
#' Merge DIABLO stability onto a data frame of features
#'
#' @description
#' Utility function to combine top loading factors extracted from a trained
#' multi-omics mixOmics (DIABLO) model with the feature stability, previously
#' retrieved from the performance evaluation using e.g.
#' diablo.selection.stability and create a nicely formatted data frame.
#'
#' @param features Data frame of top features, e.g. provided by
#' get.diablo.top.loadings().
#' @param stability Data frame of feature stability, e.g. provided by
#' diablo.selection.stability().
#'
#' @return Data frame combining top features with their measured stability.
#' @export merge.feature.stability
#'
#' @examples
#' \dontrun{
#' merge.feature.stability(get.diablo.top.loadings(diablo.tuned.analysis$loadings$metab), diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
#' }
merge.feature.stability <- function(features, stability) {
merged <- merge(x = features, y = stability[, "stability", drop = FALSE],
by.x = "row.names", by.y = "row.names", all.x = TRUE)
for (column.id in rev(colnames(features))) {
merged <- merged[order(abs(merged[,column.id]), decreasing = TRUE),]
}
merged <- as.data.frame(merged)
colnames(merged)[1] <- "feature"
return(merged)
}
#' Merge stability onto a data frame of features in a single component DIABLO
#' model
#'
#' @description
#' Utility function to combine top loading factors extracted from a trained
#' multi-omics mixOmics (DIABLO) model with the feature stability, previously
#' retrieved from the performance evaluation using e.g.
#' diablo.selection.stability and create a nicely formatted data frame.
#' Specialised for DIABLO models with a single component, which otherwise
#' behaves differently in R's data structures.
#'
#' @param features Data frame of top features, e.g. provided by
#' get.diablo.top.loadings.1comp().
#' @param stability Data frame of feature stability, e.g. provided by
#' diablo.selection.stability().
#'
#' @return Data frame combining top features with their measured stability.
#' @export merge.feature.stability.1comp
#'
#' @examples
#' \dontrun{
#' merge.feature.stability.1comp(get.diablo.top.loadings.1comp(diablo.tuned.analysis$loadings$metab), diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
#' }
merge.feature.stability.1comp <- function(features, stability) {
return(
merge.feature.stability(features, stability)
)
}
#' Plot feature stability
#'
#' @description
#' Plot feature stability for a summary overview.
#'
#' @param selection.stability Stability data for features selected for a
#' component.
#'
#' @return Plotted stability data.
#' @export plot.feature.stability
#'
#' @examples
#' \dontrun{
#' plot.feature.stability(diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
#' }
plot.feature.stability <- function(selection.stability) {
return(
plot(selection.stability, type = 'h', ylab = 'Stability', xlab = 'Features',
main = 'Selection stability')
)
}
#' Export a covariance matrix as a network CSV file
#'
#' @description
#' Export a covariance matrix, e.g. one generated using
#' `find.feature.associations` as data describing the association network and
#' suitable for e.g. loading into Cytoscape. The method can optionally take a
#' cutoff value, which is used to threshold (absolute) association scores. The
#' output is written to disk as a CSV file using the filename given.
#'
#' @param m Symmetrical matrix of association scores.
#' @param filename Filename to which to write network output.
#' @param cutoff Optional value of which to filter associations.
#' @param block.association If the matrix does not contain equal number of
#' variables from each block, then a custom list of block associations can be
#' passed in.
#' @param block.feature.count The number of features selected from each block,
#' defaulting to 20.
#'
#' @return The long-form network data that is also written to disk
#' @export
#'
#' @examples
#' \dontrun{
#' export.matrix.as.network(mat, "network.csv")
#' export.matrix.as.network(find.feature.associations(diablo.trained.analysis, 3), filename="network.csv", cutoff=0.7)
#' }
export.matrix.as.network <- function(m, filename, cutoff=NA,
block.association=NULL,
block.feature.count=20) {
block.labels <- as.data.frame(rownames(m))
if (is.null(block.association)) {
num.blocks <- length(rownames(m)) / block.feature.count
block.association <- rep(1:num.blocks, each=block.feature.count)
}
block.labels$block <- block.association
colnames(block.labels) <- c("feature", "block")
melted <- reshape2::melt(replace(m, lower.tri(m, TRUE), NA), na.rm = TRUE)
melted <- merge(x = melted, y = block.labels,
by.x = "Var1", by.y = "feature", all.x = TRUE)
colnames(melted) <- c("feature.1", "feature.2", "value", "block")
if (!is.na(cutoff)) {
melted <- melted[abs(melted$value) >= cutoff,]
}
write.csv(melted, filename, row.names=FALSE, quote=FALSE)
return(melted)
}
#' Get model variance
#'
#' @description
#' Get percent of variance explained by each component for a tuned model. Works
#' with sPLS-DA models or DIABLO models as input, but the block parameter must
#' be specified for DIABLO models.
#'
#' @param tuned.model Either an sPLS-DA model from singleomics or a DIABLO model
#' from multiomics.
#' @param block Where the model type is DIABLO, the block to get model variance
#' for.
#'
#' @return A data frame with the model variance for the specified block.
#' @export
#'
#' @examples
#' \dontrun{
#' get.model.variance(splsda.model)
#' get.model.variance(diablo.model, block = 'transcriptomics')
#' }
get.model.variance <- function(tuned.model, block = "X"){
model.var <- cor(as.numeric(as.factor(tuned.model$Y)),
tuned.model$variates[[block]][, 1:tuned.model$ncomp[block]])
model.var <- apply(model.var^2, 2, sum)
model.var <- cbind(model.var, cumsum(model.var))
colnames(model.var) <- c("Proportion", "Cumulative")
return(model.var)
}
#' Center truncate long strings
#'
#' @description
#' Truncate a string to no longer than 43 characters. Where a string is longer
#' than 43 characters, the first and last 20 characters sandwich three periods.
#' This can be useful for shortening long feature names before plotting.
#'
#' @param x A string to truncate.
#'
#' @return A string where each has been reduced to 43 characters or left as is.
#' @export
#'
#' @examples
#' \dontrun{
#' center.truncate('A string to truncate if long enough')
#' }
center.truncate <- function(x) {
string.length <- nchar(x)
if (string.length < 43) {
return(x)
} else {
return(paste0(substr(x, 1, 20),
"...",
substr(x, string.length - 20, string.length)))
}
}