|
a |
|
b/R/feature_processing.R |
|
|
1 |
#' Wrapper to extract features (nodes) from given pathways. |
|
|
2 |
#' |
|
|
3 |
#' Function to extract the features (nodes) from a given list of pathways. These |
|
|
4 |
#' pathways have to be compiled with the |
|
|
5 |
#' \code{\link[graphite:pathways]{pathways}} function. Features can only be |
|
|
6 |
#' extracted for \'proteins\' or \'metabolites\'. Features will by default be |
|
|
7 |
#' mapped to gene symbols. |
|
|
8 |
#' |
|
|
9 |
#' @param pathway A pathway created with \code{\link[graphite:pathways]{pathways}} command. |
|
|
10 |
#' @param which Mode to extract the features, either \'proteins\' or |
|
|
11 |
#' \'metabolites\'. |
|
|
12 |
#' @param org String specifying the organism, which is necessary for featureID |
|
|
13 |
#' mapping. Default: human |
|
|
14 |
#' @param returntype String that specifies the returning ID type. Default: |
|
|
15 |
#' SYMBOL Options (genes/proteins): SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ |
|
|
16 |
#' Options (metabolites): HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank |
|
|
17 |
#' |
|
|
18 |
#' @return Feature list with gene symbols (genes/proteins) or CHEBI IDs |
|
|
19 |
#' (metabolites) |
|
|
20 |
#' |
|
|
21 |
#' @examples |
|
|
22 |
#' pathways <- graphite::pathways("hsapiens", "kegg")[[1]] |
|
|
23 |
#' getFeatures(pathways) |
|
|
24 |
#' \donttest{ |
|
|
25 |
#' pathways <- graphite::pathways("mmusculus", "kegg")[[1]] |
|
|
26 |
#' getFeatures(pathways, which = "metabolites", org = "mmusculus", returntype = "HMDB") |
|
|
27 |
#' |
|
|
28 |
#' pathways <- graphite::pathways("mmusculus", "kegg")[[1]] |
|
|
29 |
#' getFeatures(pathways, which = "proteins", org = "mmusculus", returntype = "SYMBOL") |
|
|
30 |
#' } |
|
|
31 |
#' |
|
|
32 |
#' @importFrom graphite nodes |
|
|
33 |
#' |
|
|
34 |
#' @export |
|
|
35 |
getFeatures <- function(pathway, which = "proteins", org = "hsapiens", returntype = "SYMBOL") { |
|
|
36 |
if (which != "proteins" && which != "metabolites") { |
|
|
37 |
stop("Only 'proteins' and 'metabolites' are supported for 'which'.", |
|
|
38 |
call. = FALSE |
|
|
39 |
) |
|
|
40 |
} |
|
|
41 |
|
|
|
42 |
org <- tolower(org) |
|
|
43 |
|
|
|
44 |
## check for the correct organism |
|
|
45 |
if (!(org %in% getOrganisms())) { |
|
|
46 |
stop("Please insert a correct organism name! Use getOrganisms() |
|
|
47 |
to see all supported organisms.", |
|
|
48 |
call. = FALSE |
|
|
49 |
) |
|
|
50 |
} |
|
|
51 |
|
|
|
52 |
## extract the features (genes/proteins/metabolites) from the pathway nodes. |
|
|
53 |
features <- graphite::nodes(pathway, which = which) |
|
|
54 |
|
|
|
55 |
if (length(features) == 0) { |
|
|
56 |
return(list()) |
|
|
57 |
} |
|
|
58 |
|
|
|
59 |
if (which == "proteins") { |
|
|
60 |
## extract the keytype of the ID format |
|
|
61 |
kt <- gsub(":.*", "", features[1]) |
|
|
62 |
mapped <- gsub("[A-Z]+:", "", features) |
|
|
63 |
|
|
|
64 |
## get the mapping from the keytype to the user-specific type |
|
|
65 |
mapped <- getGeneMapping( |
|
|
66 |
features = mapped, keytype = kt, |
|
|
67 |
org = org, returntype = returntype |
|
|
68 |
) |
|
|
69 |
} |
|
|
70 |
|
|
|
71 |
## its special for metabolites, because sometimes there are different |
|
|
72 |
## identifiers used in the same pathway |
|
|
73 |
if (which == "metabolites") { |
|
|
74 |
chebi <- mapIDType( |
|
|
75 |
features = features, keytype = "CHEBI", |
|
|
76 |
maptype = "ChEBI", returntype = returntype |
|
|
77 |
) |
|
|
78 |
|
|
|
79 |
kegg <- mapIDType( |
|
|
80 |
features = features, keytype = "KEGGCOMP", |
|
|
81 |
maptype = "KEGG", returntype = returntype |
|
|
82 |
) |
|
|
83 |
|
|
|
84 |
pubchem <- mapIDType( |
|
|
85 |
features = features, keytype = "PUBCHEM", |
|
|
86 |
maptype = "CID", returntype = returntype |
|
|
87 |
) |
|
|
88 |
|
|
|
89 |
cas <- mapIDType( |
|
|
90 |
features = features, keytype = "CAS", |
|
|
91 |
maptype = "CAS", returntype = returntype |
|
|
92 |
) |
|
|
93 |
|
|
|
94 |
hmdb <- mapIDType( |
|
|
95 |
features = features, keytype = "HMDB", |
|
|
96 |
maptype = "HMDB", returntype = returntype |
|
|
97 |
) |
|
|
98 |
|
|
|
99 |
mapped <- c(chebi, kegg, pubchem, cas, hmdb) |
|
|
100 |
} |
|
|
101 |
|
|
|
102 |
return(mapped) |
|
|
103 |
} |
|
|
104 |
|
|
|
105 |
|
|
|
106 |
|
|
|
107 |
#' Mapping between pathway encoded genes/proteins and different ID formats. |
|
|
108 |
#' |
|
|
109 |
#' Function to retrieve the gene/protein identifier mapping. Ongoing from |
|
|
110 |
#' genes/proteins retrieved from pathway definitions, which often include two or |
|
|
111 |
#' more ID formats or a format that is not present in your omics measurement, |
|
|
112 |
#' this function maps those IDs to a given format. Depending on the organism, |
|
|
113 |
#' additional packages have to be installed. |
|
|
114 |
#' |
|
|
115 |
#' @param features List of identifiers to be mapped. |
|
|
116 |
#' @param keytype String specifying the ID type, e.g., "ENTREZID" or "UNIPROT". |
|
|
117 |
#' @param org String that defines the organism. Default: hsapiens |
|
|
118 |
#' Options: see \code{\link[multiGSEA]{getOrganisms}} |
|
|
119 |
#' @param returntype String that specifies the returning ID type. Default: |
|
|
120 |
#' SYMBOL, Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ |
|
|
121 |
#' |
|
|
122 |
#' @return List containing mapped gene/protein IDs. |
|
|
123 |
#' |
|
|
124 |
#' @examples |
|
|
125 |
#' features <- graphite::nodes(graphite::pathways("hsapiens", "kegg")[[1]]) |
|
|
126 |
#' features <- gsub("ENTREZID:", "", features) |
|
|
127 |
#' keytype <- "ENTREZID" |
|
|
128 |
#' getGeneMapping(features, keytype) |
|
|
129 |
#' |
|
|
130 |
#' getGeneMapping(features, keytype, returntype = "UNIPROT") |
|
|
131 |
#' \donttest{ |
|
|
132 |
#' features <- graphite::nodes(graphite::pathways("rnorvegicus", "reactome")[[1]]) |
|
|
133 |
#' features <- gsub("UNIPROT:", "", features) |
|
|
134 |
#' getGeneMapping(features, keytype = "UNIPROT", org = "rnorvegicus") |
|
|
135 |
#' |
|
|
136 |
#' getGeneMapping(features, |
|
|
137 |
#' keytype = "UNIPROT", |
|
|
138 |
#' org = "rnorvegicus", |
|
|
139 |
#' returntype = "ENSEMBL" |
|
|
140 |
#' ) |
|
|
141 |
#' } |
|
|
142 |
#' |
|
|
143 |
#' @importFrom AnnotationDbi select |
|
|
144 |
#' |
|
|
145 |
#' @export |
|
|
146 |
getGeneMapping <- function(features, keytype, org = "hsapiens", returntype = "SYMBOL") { |
|
|
147 |
org <- tolower(org) |
|
|
148 |
|
|
|
149 |
## check for the correct organism |
|
|
150 |
if (!(org %in% getOrganisms())) { |
|
|
151 |
stop("Please insert a correct organism name! Use getOrganisms() |
|
|
152 |
to see all supported organisms.", |
|
|
153 |
call. = FALSE |
|
|
154 |
) |
|
|
155 |
} |
|
|
156 |
|
|
|
157 |
db <- getIDMappingDatabase(org) |
|
|
158 |
|
|
|
159 |
supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ") |
|
|
160 |
if (org != "dmelanogaster" && !returntype %in% supportedIDs) { |
|
|
161 |
stop("Insert one of the following IDs to be returned (returntype): |
|
|
162 |
SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ.", |
|
|
163 |
call. = FALSE |
|
|
164 |
) |
|
|
165 |
} |
|
|
166 |
|
|
|
167 |
supportedIDs <- c(supportedIDs, "FLYBASE", "FLYBASECG") |
|
|
168 |
if (org == "dmelanogaster" && !returntype %in% supportedIDs) { |
|
|
169 |
stop("Insert one of the following IDs to be returned (returntype): |
|
|
170 |
SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ, FLYBASE, FLYBASECG.", |
|
|
171 |
call. = FALSE |
|
|
172 |
) |
|
|
173 |
} |
|
|
174 |
|
|
|
175 |
## design the columns field such that we create a triple ID mapping between |
|
|
176 |
## ENTREZIDs, UNIPROT, and gene symbols |
|
|
177 |
if (keytype == "UNIPROT") { |
|
|
178 |
col <- unique(c("SYMBOL", "ENTREZID", returntype)) |
|
|
179 |
} else { |
|
|
180 |
col <- unique(c("SYMBOL", "UNIPROT", returntype)) |
|
|
181 |
} |
|
|
182 |
|
|
|
183 |
## run the actual mapping of IDS and return a list of the user-given type |
|
|
184 |
map <- tryCatch( |
|
|
185 |
{ |
|
|
186 |
map <- AnnotationDbi::select(db, |
|
|
187 |
keys = features, |
|
|
188 |
columns = col, keytype = keytype |
|
|
189 |
) |
|
|
190 |
m <- match(unique(map[[keytype]]), map[[keytype]]) |
|
|
191 |
map <- map[m, ] |
|
|
192 |
map[[returntype]][!is.na(map[[returntype]])] |
|
|
193 |
}, |
|
|
194 |
error = function(cond) { |
|
|
195 |
return(list()) |
|
|
196 |
} |
|
|
197 |
) |
|
|
198 |
|
|
|
199 |
return(map) |
|
|
200 |
} |
|
|
201 |
|
|
|
202 |
|
|
|
203 |
#' Mapping between pathway encoded metabolites and different metabolite ID |
|
|
204 |
#' formats. |
|
|
205 |
#' |
|
|
206 |
#' Function to retrieve the metabolite identifier mapping. Ongoing from |
|
|
207 |
#' metabolites retrieved from pathway definitions, which often include two or |
|
|
208 |
#' more ID formats, this function maps those IDs to a given format. The complete |
|
|
209 |
#' mapping table based on \href{https://comptox.epa.gov/dashboard}{Comptox |
|
|
210 |
#' Dashboard}, \href{https://pubchem.ncbi.nlm.nih.gov/}{PubChem}, |
|
|
211 |
#' \href{https://hmdb.ca/}{HMDB}, and \href{https://www.ebi.ac.uk/chebi}{ChEBI} |
|
|
212 |
#' is provided in the AnnotationHub package metaboliteIDmapping. |
|
|
213 |
#' |
|
|
214 |
#' @param features List of identifiers to be mapped. |
|
|
215 |
#' @param keytype String specifying the ID type, e.g., "ChEBI" or "KEGGCOMP". |
|
|
216 |
#' |
|
|
217 |
#' @param returntype String that specifies the returning ID type. |
|
|
218 |
#' Default: HMDB |
|
|
219 |
#' Options: HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank |
|
|
220 |
#' |
|
|
221 |
#' @return List containing mapped gene/protein IDs. |
|
|
222 |
#' |
|
|
223 |
#' @examples |
|
|
224 |
#' features <- graphite::nodes(graphite::pathways("hsapiens", "kegg")[[1]], which = "metabolites") |
|
|
225 |
#' features <- gsub("KEGGCOMP:", "", features) |
|
|
226 |
#' keytype <- "KEGG" |
|
|
227 |
#' |
|
|
228 |
#' getMetaboliteMapping(features, keytype) |
|
|
229 |
#' |
|
|
230 |
#' getMetaboliteMapping(features, keytype = "KEGG", returntype = "CID") |
|
|
231 |
#' |
|
|
232 |
#' @importFrom dplyr pull filter distinct |
|
|
233 |
#' @importFrom metaboliteIDmapping metabolitesMapping |
|
|
234 |
#' @importFrom magrittr %>% |
|
|
235 |
#' |
|
|
236 |
#' @export |
|
|
237 |
getMetaboliteMapping <- function(features, keytype, returntype = "HMDB") { |
|
|
238 |
## check for the correct metabolite mapping format |
|
|
239 |
supportedIDs <- c( |
|
|
240 |
"HMDB", "ChEBI", "KEGG", "CAS", "DTXCID", |
|
|
241 |
"DTXSID", "SID", "CID", "Drugbank" |
|
|
242 |
) |
|
|
243 |
if (!returntype %in% supportedIDs) { |
|
|
244 |
stop("Insert one of the following IDs to be returned (returntype): |
|
|
245 |
HMDB, CAS, ChEBI, KEGG, SID, CID, DTXCID, DTXSID, Drugbank, Name", |
|
|
246 |
call. = FALSE |
|
|
247 |
) |
|
|
248 |
} |
|
|
249 |
|
|
|
250 |
## load the mapping table which is deposited in the |
|
|
251 |
## metaboliteIDmapping package. |
|
|
252 |
if (!requireNamespace("metaboliteIDmapping", quietly = TRUE)) { |
|
|
253 |
stop("The necessary package metaboliteIDmapping is not installed.", |
|
|
254 |
call. = FALSE |
|
|
255 |
) |
|
|
256 |
} |
|
|
257 |
|
|
|
258 |
|
|
|
259 |
## run the actual mapping of IDS and return a list of the user-given type |
|
|
260 |
map <- tryCatch( |
|
|
261 |
{ |
|
|
262 |
## to speed up the mapping, we need to subest the whole |
|
|
263 |
## metabolitesIDmapping table in the first place to contain |
|
|
264 |
## only thoses entries that match the given feature list |
|
|
265 |
SUBmappingTable <- metaboliteIDmapping::metabolitesMapping %>% |
|
|
266 |
dplyr::select(!!as.name(keytype), !!as.name(returntype)) %>% |
|
|
267 |
dplyr::filter(!!as.name(keytype) %in% unique(features)) %>% |
|
|
268 |
dplyr::distinct() |
|
|
269 |
colnames(SUBmappingTable) <- c("Original", "Mapped") |
|
|
270 |
|
|
|
271 |
SUBmappingTable %>% dplyr::pull("Mapped") |
|
|
272 |
}, |
|
|
273 |
error = function(cond) { |
|
|
274 |
return( |
|
|
275 |
rep("NA", length(features)) |
|
|
276 |
) |
|
|
277 |
} |
|
|
278 |
) |
|
|
279 |
|
|
|
280 |
map <- map[!is.na(map)] |
|
|
281 |
return(map) |
|
|
282 |
} |
|
|
283 |
|
|
|
284 |
|
|
|
285 |
|
|
|
286 |
#' Collect feature mapping for user given databases and omics layer. |
|
|
287 |
#' |
|
|
288 |
#' The functions makes use of the graphite R package to collect pathways from |
|
|
289 |
#' user specified databases. Depending on the omics layer specified, the |
|
|
290 |
#' function extracts either annotated genes/proteins (for transcriptome, |
|
|
291 |
#' proteome layer) or metabolites (for metabolite layer). The data structure |
|
|
292 |
#' that is returned is mandatory to calculate the multi-omics pathway |
|
|
293 |
#' enrichment. |
|
|
294 |
#' |
|
|
295 |
#' @param dbs List of databases that should be queried for pathways. Default: |
|
|
296 |
#' all available databases |
|
|
297 |
#' @param layer List of omics layer that should be addressed. Default: all three |
|
|
298 |
#' layer (transcriptome, proteome, metabolome) |
|
|
299 |
#' @param returnTranscriptome String specifying the returned gene ID format. |
|
|
300 |
#' Default: SYMBOL Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ |
|
|
301 |
#' @param returnProteome String specifying the returned protein ID format. |
|
|
302 |
#' Default: SYMBOL Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ |
|
|
303 |
#' @param returnMetabolome String specifying the returned metabolite ID format. |
|
|
304 |
#' Default: HMDB Options: HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank |
|
|
305 |
#' @param organism String specifying the organism of interest. This has direct |
|
|
306 |
#' influence on the available pathway databases. Default: "hsapiens" |
|
|
307 |
#' Options: see \code{\link[multiGSEA]{getOrganisms}} |
|
|
308 |
#' @param useLocal Boolean to use local pathway/feature descriptions. In case |
|
|
309 |
#' useLocal is set to FALSE, pathway definitions and feature extraction |
|
|
310 |
#' will be recalculated. This could take several minutes depending on the |
|
|
311 |
#' database used. Pathbank, for example, contains nearly 50000 pathway |
|
|
312 |
#' definition that have to be re-mapped. useLocal has no effect when |
|
|
313 |
#' pathway definitions are retrieved for the first time. Default: TRUE |
|
|
314 |
#' |
|
|
315 |
#' @return Nested list with extracted and mapped pathway features. |
|
|
316 |
#' |
|
|
317 |
#' @examples |
|
|
318 |
#' |
|
|
319 |
#' getMultiOmicsFeatures( |
|
|
320 |
#' dbs = c("kegg"), |
|
|
321 |
#' layer = c("transcriptome", "proteome"), |
|
|
322 |
#' organism = "hsapiens" |
|
|
323 |
#' ) |
|
|
324 |
#' \donttest{ |
|
|
325 |
#' getMultiOmicsFeatures( |
|
|
326 |
#' dbs = c("kegg", "reactome"), |
|
|
327 |
#' layer = c("transcriptome", "metabolome"), |
|
|
328 |
#' organism = "mmusculus" |
|
|
329 |
#' ) |
|
|
330 |
#' |
|
|
331 |
#' getMultiOmicsFeatures( |
|
|
332 |
#' dbs = c("reactome"), |
|
|
333 |
#' layer = c("proteome"), |
|
|
334 |
#' organism = "rnorvegicus", |
|
|
335 |
#' returnProteome = "ENTREZID" |
|
|
336 |
#' ) |
|
|
337 |
#' } |
|
|
338 |
#' @importFrom graphite pathwayDatabases pathways |
|
|
339 |
#' @importFrom magrittr %>% |
|
|
340 |
#' @importFrom dplyr filter pull |
|
|
341 |
#' @importFrom rlang .data |
|
|
342 |
#' |
|
|
343 |
#' @export |
|
|
344 |
getMultiOmicsFeatures <- function(dbs = c("all"), layer = c("all"), |
|
|
345 |
returnTranscriptome = "SYMBOL", |
|
|
346 |
returnProteome = "SYMBOL", |
|
|
347 |
returnMetabolome = "HMDB", |
|
|
348 |
organism = "hsapiens", |
|
|
349 |
useLocal = TRUE) { |
|
|
350 |
layers <- c("all", "metabolome", "proteome", "transcriptome") |
|
|
351 |
organism <- tolower(organism) |
|
|
352 |
|
|
|
353 |
returnTranscriptome <- toupper(returnTranscriptome) |
|
|
354 |
returnProteome <- toupper(returnProteome) |
|
|
355 |
returnMetabolome <- toupper(returnMetabolome) |
|
|
356 |
if(returnMetabolome == "CHEBI") returnMetabolome <- "ChEBI" |
|
|
357 |
if(returnMetabolome == "DRUGBANK") returnMetabolome <- "Drugbank" |
|
|
358 |
|
|
|
359 |
## check for the correct transcriptome mapping format |
|
|
360 |
supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ") |
|
|
361 |
if (!returnTranscriptome %in% supportedIDs) { |
|
|
362 |
stop("Insert one of the following IDs to be returned (returnTranscriptome): |
|
|
363 |
SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ", |
|
|
364 |
call. = FALSE |
|
|
365 |
) |
|
|
366 |
} |
|
|
367 |
|
|
|
368 |
## check for the correct proteome mapping format |
|
|
369 |
if (!returnProteome %in% supportedIDs) { |
|
|
370 |
stop("Insert one of the following IDs to be returned (returnProteome): |
|
|
371 |
SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ", |
|
|
372 |
call. = FALSE |
|
|
373 |
) |
|
|
374 |
} |
|
|
375 |
|
|
|
376 |
## check for the correct metabolite mapping format |
|
|
377 |
supportedIDs <- c( |
|
|
378 |
"HMDB", "ChEBI", "KEGG", "CAS", "DTXCID", |
|
|
379 |
"DTXSID", "SID", "CID", "Drugbank" |
|
|
380 |
) |
|
|
381 |
if (!returnMetabolome %in% supportedIDs) { |
|
|
382 |
stop("Insert one of the following IDs to be returned (returnMetabolome): |
|
|
383 |
HMDB, CAS, ChEBI, KEGG, SID, CID, DTXCID, DTXSID, Drugbank", |
|
|
384 |
call. = FALSE |
|
|
385 |
) |
|
|
386 |
} |
|
|
387 |
|
|
|
388 |
## check if the given organism is supported |
|
|
389 |
if (!(organism %in% getOrganisms())) { |
|
|
390 |
stop("You entered an organism that is not supported! |
|
|
391 |
Use getOrganisms() to get a list of all suported organisms.", |
|
|
392 |
call. = FALSE |
|
|
393 |
) |
|
|
394 |
} |
|
|
395 |
|
|
|
396 |
if (sum(tolower(layer) %in% layers) != length(layer)) { |
|
|
397 |
stop("You entered wrong input for the omics layer specification. |
|
|
398 |
Options are: all, transcriptome, proteome, metabolome, or a combination thereof.", |
|
|
399 |
call. = FALSE |
|
|
400 |
) |
|
|
401 |
} |
|
|
402 |
|
|
|
403 |
pDBs <- graphite::pathwayDatabases() |
|
|
404 |
dbs0 <- pDBs %>% |
|
|
405 |
dplyr::filter(.data$species == organism) %>% |
|
|
406 |
dplyr::pull(.data$database) |
|
|
407 |
databases <- c("all", as.vector(dbs0)) |
|
|
408 |
|
|
|
409 |
if (sum(tolower(dbs) %in% databases) != length(dbs)) { |
|
|
410 |
stop(paste0("You entered wrong input for the omics layer specification. |
|
|
411 |
Options are: ", paste(databases, collapse = " "), " or a combination thereof."), |
|
|
412 |
call. = FALSE |
|
|
413 |
) |
|
|
414 |
} |
|
|
415 |
|
|
|
416 |
if ("all" %in% dbs) dbs <- as.vector(dbs0) |
|
|
417 |
|
|
|
418 |
if ("all" %in% layer) { |
|
|
419 |
layer <- c("transcriptome", "proteome", "metabolome") |
|
|
420 |
} |
|
|
421 |
|
|
|
422 |
pathways <- lapply(dbs, function(x) { |
|
|
423 |
graphite::pathways(organism, x) |
|
|
424 |
}) |
|
|
425 |
names(pathways) <- dbs |
|
|
426 |
|
|
|
427 |
features <- list() |
|
|
428 |
if ("transcriptome" %in% layer) { |
|
|
429 |
features$transcriptome <- getMappedFeatures( |
|
|
430 |
pathways = pathways, |
|
|
431 |
organism = organism, |
|
|
432 |
returnID = returnTranscriptome, |
|
|
433 |
useLocal = useLocal |
|
|
434 |
) |
|
|
435 |
|
|
|
436 |
## adapt for duplicated pathways |
|
|
437 |
names(features$transcriptome) <- rename_duplicates(names(features$transcriptome)) |
|
|
438 |
} |
|
|
439 |
|
|
|
440 |
if ("proteome" %in% layer) { |
|
|
441 |
if ("transcriptome" %in% layer && returnProteome == returnTranscriptome) { |
|
|
442 |
features$proteome <- features$transcriptome |
|
|
443 |
} else { |
|
|
444 |
features$proteome <- getMappedFeatures( |
|
|
445 |
pathways = pathways, |
|
|
446 |
organism = organism, |
|
|
447 |
returnID = returnProteome, |
|
|
448 |
useLocal = useLocal |
|
|
449 |
) |
|
|
450 |
|
|
|
451 |
## adapt for duplicated pathways |
|
|
452 |
names(features$proteome) <- rename_duplicates(names(features$proteome)) |
|
|
453 |
} |
|
|
454 |
} |
|
|
455 |
|
|
|
456 |
|
|
|
457 |
if ("metabolome" %in% layer) { |
|
|
458 |
features$metabolome <- getMappedFeatures( |
|
|
459 |
pathways = pathways, |
|
|
460 |
organism = organism, |
|
|
461 |
returnID = returnMetabolome, |
|
|
462 |
which = "metabolites", |
|
|
463 |
useLocal = useLocal |
|
|
464 |
) |
|
|
465 |
|
|
|
466 |
## adapt for duplicated pathways |
|
|
467 |
names(features$metabolome) <- rename_duplicates(names(features$metabolome)) |
|
|
468 |
} |
|
|
469 |
|
|
|
470 |
return(features) |
|
|
471 |
} |
|
|
472 |
|
|
|
473 |
|
|
|
474 |
|
|
|
475 |
#' Wrapper to get feature mappings. |
|
|
476 |
#' |
|
|
477 |
#' Feature mappings will be used from hard disk in case they have been |
|
|
478 |
#' mapped before and `useLocal` is not set to be FALSE. |
|
|
479 |
#' In other cases, a feature extraction will be done and the results are |
|
|
480 |
#' stored for a following occasion. |
|
|
481 |
#' |
|
|
482 |
#' @param pathways List of pathway definitions. |
|
|
483 |
#' @param returnID String specifying the returned ID format. |
|
|
484 |
#' @param organism String defining the organism of analysis. |
|
|
485 |
#' @param which Mode to extract the features, either \'proteins\' or |
|
|
486 |
#' \'metabolites\'. |
|
|
487 |
#' @param useLocal Boolean specifying whether or not to use the local |
|
|
488 |
#' preprocessed mapping. |
|
|
489 |
#' |
|
|
490 |
#' @return List of mapped features for an omics layer. |
|
|
491 |
getMappedFeatures <- function(pathways, returnID = "SYMBOL", organism = "hsapiens", which = "proteins", useLocal = TRUE) { |
|
|
492 |
feat <- unlist(lapply(names(pathways), function(db) { |
|
|
493 |
ap <- archivePath(paste0(organism, "_", db, "_", returnID)) |
|
|
494 |
|
|
|
495 |
if (file.exists(ap) && useLocal) { |
|
|
496 |
loadLocal(ap) |
|
|
497 |
} else { |
|
|
498 |
tmp <- lapply(pathways[[db]], function(p) { |
|
|
499 |
getFeatures( |
|
|
500 |
pathway = p, org = organism, |
|
|
501 |
which = which, |
|
|
502 |
returntype = returnID |
|
|
503 |
) |
|
|
504 |
}) |
|
|
505 |
|
|
|
506 |
header <- rep(paste0("(", toupper(db), ") "), length(pathways[[db]])) |
|
|
507 |
names(tmp) <- paste0(header, names(tmp)) |
|
|
508 |
saveRDS(tmp, file = ap) |
|
|
509 |
tmp |
|
|
510 |
} |
|
|
511 |
}), recursive = FALSE) |
|
|
512 |
|
|
|
513 |
return(feat) |
|
|
514 |
} |
|
|
515 |
|
|
|
516 |
|
|
|
517 |
|
|
|
518 |
#' Helper function to map only a subset of metabolite IDs |
|
|
519 |
#' |
|
|
520 |
#' This helper function becomes necessary since there are sometimes multiple ID |
|
|
521 |
#' formats used in a single pathway definition. |
|
|
522 |
#' |
|
|
523 |
#' @param features List of metabolite feature IDs of the pathway. |
|
|
524 |
#' @param keytype String specifying the ID format in pathway definition. |
|
|
525 |
#' @param maptype String specifying the corresponding ID format in multiGSEA. |
|
|
526 |
#' @param returntype String identifying the ID type that should be mapped. |
|
|
527 |
#' |
|
|
528 |
#' @return List of mapped metabolite IDs. |
|
|
529 |
mapIDType <- function(features, keytype = "CHEBI", maptype = "ChEBI", returntype = "HMDB") { |
|
|
530 |
mapped <- c() |
|
|
531 |
ids <- gsub(paste0(keytype, ":"), "", features[grep(keytype, features)]) |
|
|
532 |
|
|
|
533 |
if (returntype != maptype) { |
|
|
534 |
mapped <- getMetaboliteMapping( |
|
|
535 |
features = ids, |
|
|
536 |
keytype = maptype, |
|
|
537 |
returntype = returntype |
|
|
538 |
) |
|
|
539 |
} else { |
|
|
540 |
mapped <- ids |
|
|
541 |
} |
|
|
542 |
|
|
|
543 |
return(mapped) |
|
|
544 |
} |
|
|
545 |
|
|
|
546 |
|
|
|
547 |
|
|
|
548 |
#' Get list of supported organisms |
|
|
549 |
#' |
|
|
550 |
#' Get a list of organisms that are covered in our workflow through a supporting |
|
|
551 |
#' `AnnotationDBi` package. Without such a package we would not be able to map |
|
|
552 |
#' transcript and protein identifier between different formats. All the |
|
|
553 |
#' organisms that are listed here have at lest kegg and or reactome pathway |
|
|
554 |
#' annotation that can be queried by `graphite`. |
|
|
555 |
#' |
|
|
556 |
#' @return List of supported organisms |
|
|
557 |
#' |
|
|
558 |
#' @examples |
|
|
559 |
#' getOrganisms() |
|
|
560 |
#' @export |
|
|
561 |
getOrganisms <- function() { |
|
|
562 |
orglist <- c( |
|
|
563 |
"hsapiens", "rnorvegicus", "mmusculus", "sscrofa", |
|
|
564 |
"btaurus", "celegans", "dmelanogaster", "drerio", |
|
|
565 |
"ggallus", "xlaevis", "cfamiliaris" |
|
|
566 |
) |
|
|
567 |
|
|
|
568 |
return(orglist) |
|
|
569 |
} |
|
|
570 |
|
|
|
571 |
|
|
|
572 |
|
|
|
573 |
#' Get the correct ID mapping database |
|
|
574 |
#' |
|
|
575 |
#' Check by means of the given organism name if the required `AnnotationDbi` |
|
|
576 |
#' package is installed. Select the ID mapping table based on the organism name |
|
|
577 |
#' and return it. |
|
|
578 |
#' |
|
|
579 |
#' @param organism String that defines the organism. |
|
|
580 |
#' |
|
|
581 |
#' @return AnnotationDbi database for ID mapping. |
|
|
582 |
getIDMappingDatabase <- function(organism) { |
|
|
583 |
map <- c( |
|
|
584 |
hsapiens = "org.Hs.eg.db", rnorvegicus = "org.Rn.eg.db", |
|
|
585 |
mmusculus = "org.Mm.eg.db", sscrofa = "org.Ss.eg.db", |
|
|
586 |
btaurus = "org.Bt.eg.db", celegans = "org.Ce.eg.db", |
|
|
587 |
dmelanogaster = "org.Dm.eg.db", drerio = "org.Dr.eg.db", |
|
|
588 |
ggallus = "org.Gg.eg.db", xlaevis = "org.Xl.eg.db", |
|
|
589 |
cfamiliaris = "org.Cf.eg.db" |
|
|
590 |
) |
|
|
591 |
|
|
|
592 |
stopifnot(organism %in% names(map)) |
|
|
593 |
pkg <- map[[organism]] |
|
|
594 |
|
|
|
595 |
if (!requireNamespace(pkg, quietly = TRUE)) { |
|
|
596 |
stop(paste0("The necessary package ", pkg, " is not installed."), |
|
|
597 |
call. = FALSE |
|
|
598 |
) |
|
|
599 |
} |
|
|
600 |
|
|
|
601 |
return(get(pkg, envir = getNamespace(pkg))) |
|
|
602 |
} |
|
|
603 |
|
|
|
604 |
|
|
|
605 |
|
|
|
606 |
#' Pre-rank features prior to calculating enrichment scores. |
|
|
607 |
#' |
|
|
608 |
#' Rank features based on the direction of their fold change and their magnitude |
|
|
609 |
#' implicated through their assigned p-value. |
|
|
610 |
#' |
|
|
611 |
#' @param logFC Vector containing the log-transformed fold changes of features. |
|
|
612 |
#' @param pvalues Vector containing the p-values associated with those logFCs. |
|
|
613 |
#' @param base Integer specifying the base of the logarithm. Default: 10 |
|
|
614 |
#' |
|
|
615 |
#' @return Vector of pre-ranked features, still unsorted |
|
|
616 |
#' |
|
|
617 |
#' @examples |
|
|
618 |
#' logFC <- rnorm(10) |
|
|
619 |
#' pvalues <- runif(10) |
|
|
620 |
#' rankFeatures(logFC, pvalues) |
|
|
621 |
#' |
|
|
622 |
#' @export |
|
|
623 |
rankFeatures <- function(logFC, pvalues, base = 10) { |
|
|
624 |
return(sign(logFC) * -log(pvalues, base = base)) |
|
|
625 |
} |
|
|
626 |
|
|
|
627 |
|
|
|
628 |
|
|
|
629 |
#' Create an empty data structure for measured omics features |
|
|
630 |
#' |
|
|
631 |
#' This function creates a data structure of nested but empty lists. One list |
|
|
632 |
#' for each omics layer. By default all three supported omics layer are used to |
|
|
633 |
#' create a data structures with three empty sublists: transcriptome, proteome, |
|
|
634 |
#' and metabolome. |
|
|
635 |
#' |
|
|
636 |
#' @param layer List specifying the omics layer which should be created |
|
|
637 |
#' |
|
|
638 |
#' @return List with length(layer) empty sublists |
|
|
639 |
#' |
|
|
640 |
#' @examples |
|
|
641 |
#' initOmicsDataStructure() |
|
|
642 |
#' initOmicsDataStructure(c("transcriptome", "proteome")) |
|
|
643 |
#' initOmicsDataStructure(c("Transcriptome", "Metabolome")) |
|
|
644 |
#' @export |
|
|
645 |
initOmicsDataStructure <- function(layer = c("transcriptome", "proteome", "metabolome")) { |
|
|
646 |
l <- c() |
|
|
647 |
layer <- tolower(layer) |
|
|
648 |
if (length(grep("trans*", layer)) > 0) l <- c(l, "transcriptome") |
|
|
649 |
if (length(grep("prote*", layer)) > 0) l <- c(l, "proteome") |
|
|
650 |
if (length(grep("metabo*", layer)) > 0) l <- c(l, "metabolome") |
|
|
651 |
|
|
|
652 |
if (length(l) != length(layer)) { |
|
|
653 |
stop("Not all your omics layer could be mapped to |
|
|
654 |
'transcriptome', 'proteome', or 'metabolome'. ", |
|
|
655 |
call. = TRUE |
|
|
656 |
) |
|
|
657 |
} |
|
|
658 |
|
|
|
659 |
empty_structure <- rep(list(list()), length(l)) |
|
|
660 |
names(empty_structure) <- l |
|
|
661 |
|
|
|
662 |
return(empty_structure) |
|
|
663 |
} |
|
|
664 |
|
|
|
665 |
|
|
|
666 |
|
|
|
667 |
#' Helper function to get all different metabolite ID formats |
|
|
668 |
#' |
|
|
669 |
#' This helper function extracts all used ID formats in all pathways |
|
|
670 |
#' and returns a nested list for each pathway database. |
|
|
671 |
#' |
|
|
672 |
#' @param pathways List of pathway databases and their pathway definition. |
|
|
673 |
#' |
|
|
674 |
#' @return List of metabolite ID formats. |
|
|
675 |
#' |
|
|
676 |
#' @importFrom graphite nodes |
|
|
677 |
getMetaboliteIDformats <- function(pathways) { |
|
|
678 |
n1 <- lapply(names(pathways), function(dbs) { |
|
|
679 |
n2 <- lapply(pathways[[dbs]], function(p) { |
|
|
680 |
unique(gsub(":.*", "", graphite::nodes(p, which = "metabolites"))) |
|
|
681 |
}) |
|
|
682 |
|
|
|
683 |
unique(unlist(n2)) |
|
|
684 |
}) |
|
|
685 |
|
|
|
686 |
names(n1) <- names(pathways) |
|
|
687 |
return(n1) |
|
|
688 |
} |