Diff of /Lecture 2/Lecture 2 Lab.r [000000] .. [0be70f]

Switch to unified view

a b/Lecture 2/Lecture 2 Lab.r
1
## to install MOVICS
2
### note: there are many dependencies; you may get an error
3
### for a missing package; download and try again
4
### it may take several times
5
### you can refer to their IMPORTS file from their DESCRIPTION file on Github
6
### for a list of dependencies:
7
### https://github.com/xlucpu/MOVICS/blob/master/DESCRIPTION
8
9
# if (!requireNamespace("BiocManager", quietly = TRUE))
10
#     install.packages("BiocManager")
11
# if (!require("devtools")) 
12
#     install.packages("devtools")
13
# devtools::install_github("xlucpu/MOVICS")
14
15
install.packages("~/Downloads/SNFtool_2.3.0.tar.gz", repos = NULL, type="source") 
16
17
18
19
library(MOVICS)
20
set.seed(4444)
21
22
library("jpeg")
23
jj <- readJPEG("MOVICS_pipeline.jpeg",native=TRUE)
24
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
25
rasterImage(jj,0,0,1,1)
26
27
28
jj <- readJPEG("methods_comparison.jpeg",native=TRUE)
29
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
30
rasterImage(jj,0,0,1,1)
31
32
33
# install.packages("Rfssa") if you want to download through github
34
library(Rfssa)
35
url <- "https://github.com/KechrisLab/ASAShortCourse-MultiOmics/blob/main/Lecture%202/brca_dat.Rdata"
36
load_github_data(url)
37
38
# load("brca_dat.Rdata")
39
40
# let's get a quick look at our data
41
names(brca_dat)
42
paste("dim of clinical data:", dim(brca_dat[["clinical"]]))
43
head(brca_dat[["clinical"]])
44
45
# check sample names all match
46
identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["Expression"]]))
47
identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["Methylation"]]))
48
identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["miRNA"]]))
49
                                                                                
50
identical(colnames(brca_dat[["MO"]][["Expression"]]), colnames(brca_dat[["MO"]][["Methylation"]]))
51
identical(colnames(brca_dat[["MO"]][["Expression"]]), colnames(brca_dat[["MO"]][["miRNA"]]))
52
identical(colnames(brca_dat[["MO"]][["Methylation"]]), colnames(brca_dat[["MO"]][["miRNA"]]))
53
# should all be TRUE (6)
54
55
paste("names of MO data:", names(brca_dat[["MO"]]))
56
paste("dim of mRNA data:", dim(brca_dat[["MO"]][["Expression"]]))
57
paste("dim of methylation data:", dim(brca_dat[["MO"]][["Methylation"]]))
58
paste("dim of miRNA data:", dim(brca_dat[["MO"]][["miRNA"]]))
59
60
# data checking -- are there any missing values?
61
sum(is.na(brca_dat[["MO"]][["Expression"]]))
62
sum(is.na(brca_dat[["MO"]][["Methylation"]]))
63
sum(is.na(brca_dat[["MO"]][["miRNA"]]))
64
65
# exp
66
range(brca_dat[["MO"]][["Expression"]])
67
plot(density(brca_dat[["MO"]][["Expression"]]), main = "Expression")
68
69
# methyl
70
range(brca_dat[["MO"]][["Methylation"]])
71
plot(density(brca_dat[["MO"]][["Methylation"]]), main = "Methylation")
72
73
# miRNA
74
range(brca_dat[["MO"]][["miRNA"]])
75
plot(density(brca_dat[["MO"]][["miRNA"]]), main = "miRNA")
76
77
78
jj <- readJPEG("ex_breastcancer_pic.jpeg")
79
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
80
rasterImage(jj,0,0,1,1)
81
# https://doi.org/10.1016/B978-0-12-800886-7.00021-2
82
83
optk.brca <- getClustNum(data        = brca_dat[["MO"]],
84
                         is.binary   = c(F,F,F), # all omics data is continuous (not binary)
85
                         try.N.clust = 2:8, # try cluster number from 2 to 8
86
                         fig.name    = "CLUSTER NUMBER OF TCGA-BRCA")
87
88
89
90
optk.brca
91
92
# what if we use the suggested k=3 clusters?
93
# you don't need to run this during lab; I'm just presenting it as an example
94
mo_rslts_3 <- getMOIC(data = brca_dat[["MO"]],
95
                         methodslist = list("SNF", "PINSPlus", "NEMO", "LRAcluster", "IntNMF"),
96
                         N.clust     = 3,
97
                         type        = c("gaussian", "gaussian", "gaussian"))
98
99
cmoic.brca_3 <- getConsensusMOIC(moic.res.list = mo_rslts_3,
100
                               fig.name      = "CONSENSUS HEATMAP - 3 Clusters",
101
                               distance      = "euclidean",
102
                               linkage       = "average")
103
104
getSilhouette(sil      = cmoic.brca_3$sil, # a sil object returned by getConsensusMOIC()
105
              fig.path = getwd(),
106
              fig.name = "SILHOUETTE",
107
              height   = 5.5,
108
              width    = 5)
109
110
mo_rslts <- getMOIC(data = brca_dat[["MO"]],
111
                         methodslist = list("SNF", "PINSPlus", "NEMO", "LRAcluster", "IntNMF"),
112
                         N.clust     = 4, # set number of clusters
113
                         type        = c("gaussian", "gaussian", "gaussian")) # what is the distribution of the datasets in MO list (same order)
114
115
116
cmoic.brca <- getConsensusMOIC(moic.res.list = mo_rslts,
117
                               fig.name      = "CONSENSUS HEATMAP - 4 Clusters",
118
                               distance      = "euclidean",
119
                               linkage       = "ward.D")
120
121
getSilhouette(sil      = cmoic.brca$sil, # a sil object returned by getConsensusMOIC()
122
              fig.path = getwd(),
123
              fig.name = "SILHOUETTE",
124
              height   = 5.5,
125
              width    = 5)
126
127
# convert beta value to M value for stronger signal
128
std_dat <- brca_dat[["MO"]]
129
std_dat[["Methylation"]] <- log2(std_dat[["Methylation"]] / (1 - std_dat[["Methylation"]]))
130
131
# data normalization for heatmap
132
plotdata <- getStdiz(data       = std_dat,
133
                     halfwidth  = c(2,2,2), # no truncation for mutation
134
                     centerFlag = c(T,T,T), # no center for mutation
135
                     scaleFlag  = c(T,T,T)) # no scale for mutation
136
137
mRNA.col   <- c("#00FF00", "#008000", "#000000", "#800000", "#FF0000")
138
meth.col   <- c("#0074FE", "#96EBF9", "#FEE900", "#F00003")
139
miRNA.col <- c("#6699CC", "white"  , "#FF3C38")
140
col.list   <- list(mRNA.col, meth.col, miRNA.col)
141
142
# extract PAM50, pathologic stage for sample annotation
143
annCol    <- brca_dat[["clinical"]][,c("BRCA_Subtype_PAM50"), drop = FALSE]
144
145
# generate corresponding colors for sample annotation
146
annColors <- list(
147
                  BRCA_Subtype_PAM50  = c("Basal" = "blue",
148
                            "Her2"   = "red",
149
                            "LumA"   = "yellow",
150
                            "LumB"   = "green",
151
                            "Normal" = "black")
152
                )
153
154
155
156
# comprehensive heatmap
157
getMoHeatmap(data          = plotdata,
158
             row.title     = names(std_dat),
159
             is.binary     = c(F,F,F), # we don't have any binary omics data (ex mutation)
160
             legend.name   = c("mRNA","M value","miRNA"),
161
             clust.res     = mo_rslts$SNF$clust.res, # cluster results for SNF
162
             color         = col.list,
163
             # annCol        = annCol, # annotation for samples (if you want to show PAM50 classes too)
164
             # annColors     = annColors, # annotation color
165
             width         = 10, # width of each subheatmap
166
             height        = 5, # height of each subheatmap
167
             fig.name      = "COMPREHENSIVE HEATMAP OF SNF")
168
169
getMoHeatmap(data          = plotdata,
170
             row.title     = names(std_dat),
171
             is.binary     = c(F,F,F), # all data is continuous
172
             legend.name   = c("mRNA","M value","miRNA"),
173
             clust.res     = mo_rslts$PINSPlus$clust.res, # cluster results for PINSPlus
174
             color         = col.list,
175
             width         = 10, # width of each subheatmap
176
             height        = 5, # height of each subheatmap
177
             fig.name      = "COMPREHENSIVE HEATMAP OF PINSPlus")
178
179
# comprehensive heatmap (may take a while)
180
getMoHeatmap(data          = plotdata,
181
             row.title     = names(std_dat),
182
             is.binary     = c(F,F,F),
183
             legend.name   = c("mRNA","M value","miRNA"),
184
             clust.res     = mo_rslts$NEMO$clust.res, # cluster results for NEMO
185
             color         = col.list,
186
             width         = 10, # width of each subheatmap
187
             height        = 5, # height of each subheatmap
188
             fig.name      = "COMPREHENSIVE HEATMAP OF PINSPlus")
189
190
# comprehensive heatmap (may take a while)
191
getMoHeatmap(data          = plotdata,
192
             row.title     = names(std_dat),
193
             is.binary     = c(F,F,F),
194
             legend.name   = c("mRNA","M value","miRNA"),
195
             clust.res     = mo_rslts$LRAcluster$clust.res, # cluster results for LRAcluster
196
             color         = col.list,
197
             width         = 10, 
198
             height        = 5, 
199
             fig.name      = "COMPREHENSIVE HEATMAP OF PINSPlus")
200
201
# comprehensive heatmap (may take a while)
202
getMoHeatmap(data          = plotdata,
203
             row.title     = names(std_dat),
204
             is.binary     = c(F,F,F),
205
             legend.name   = c("mRNA","M value","miRNA"),
206
             clust.res     = mo_rslts$IntNMF$clust.res, # cluster results for intNMF
207
             color         = col.list,
208
             width         = 10, # width of each subheatmap
209
             height        = 5, # height of each subheatmap
210
             fig.name      = "COMPREHENSIVE HEATMAP OF IntNMF")
211
212
getMoHeatmap(data          = plotdata,
213
             row.title     = names(plotdata),
214
             is.binary     = c(F,F,F), # no binary omics data
215
             legend.name   = c("mRNA","M value","miRNA"),
216
             clust.res     = cmoic.brca$clust.res, # consensusMOIC results
217
             clust.dend    = NULL, # show no dendrogram for samples
218
             show.colnames = FALSE, # show no sample names
219
             show.row.dend = c(T,T,T), # show dendrogram for features
220
             annRow        = NULL, # no selected features
221
             color         = col.list,
222
             annCol        = annCol, # annotation for samples
223
             annColors     = annColors, # annotation color
224
             width         = 10, # width of each subheatmap
225
             height        = 5, # height of each subheatmap
226
             fig.name      = "COMPREHENSIVE HEATMAP OF CONSENSUSMOIC")
227
228
clust_rslts_SNF_df <- data.frame(mo_rslts$SNF$clust.res)
229
colnames(clust_rslts_SNF_df) <- c("samID", "SNF")
230
clust_rslts_CM_df <- data.frame(cmoic.brca$clust.res)
231
colnames(clust_rslts_CM_df) <- c("samID", "Consensus")
232
233
clust_rslts_df <- merge(clust_rslts_SNF_df, clust_rslts_CM_df, by="samID")
234
# head(clust_rslts_df)
235
236
table(clust_rslts_df$SNF, clust_rslts_df$Consensus)
237
238
# survival comparison
239
brca_dat[["clinical"]]$futime = as.numeric(brca_dat[["clinical"]]$futime)
240
head(brca_dat[["clinical"]])
241
surv.brca <- compSurv(moic.res         = cmoic.brca,
242
                      surv.info        = brca_dat[["clinical"]],
243
                      convt.time       = "m", # convert day unit to month
244
                      surv.median.line = "h", # draw horizontal line at median survival
245
                      xyrs.est         = c(5,10), # estimate 5 and 10-year survival
246
                      fig.name         = "KAPLAN-MEIER CURVE OF CONSENSUSMOIC")
247
248
249
print(surv.brca)
250
251
# clinVars_df <- brca_dat[["clinical"]][,c("ajcc_pathologic_stage", "age_at_diagnosis","ajcc_pathologic_t", "ajcc_pathologic_n","ajcc_pathologic_m")]
252
clinVars_df <- brca_dat[["clinical"]]
253
rownames(clinVars_df) <- clinVars_df$bcr_patient_barcode
254
clinVars_df <- clinVars_df[,c( "ajcc_pathologic_stage", "ajcc_pathologic_t", "ajcc_pathologic_n", "ajcc_pathologic_m", "age_at_diagnosis")]
255
head(clinVars_df)
256
257
258
clin.brca <- compClinvar(moic.res      = cmoic.brca,
259
                         var2comp      = clinVars_df, # data.frame needs to summarize (must has row names of samples)
260
                         strata        = "Subtype", # stratifying variable (e.g., Subtype in this example)
261
                         # factorVars    = c("ajcc_pathologic_stage"), # features that are considered categorical variables
262
                         factorVars    = c("ajcc_pathologic_stage", "ajcc_pathologic_t", "ajcc_pathologic_n", "ajcc_pathologic_m"), # features that are considered categorical variables
263
                         nonnormalVars = "age_at_diagnosis", # feature(s) that are considered using nonparametric test
264
                         exactVars     = NULL, # feature(s) that are considered using exact test
265
                         doWord        = FALSE, # generate .docx file in local path
266
                         tab.name      = "SUMMARIZATION OF CLINICAL FEATURES")
267
clin.brca
268
269
# compare agreement with other subtypes
270
sub_df = data.frame(
271
                    BRCA_Subtype_PAM50 = brca_dat[["clinical"]][,c("BRCA_Subtype_PAM50")])
272
rownames(sub_df) = brca_dat[["clinical"]][,c("bcr_patient_barcode")]
273
head(sub_df)
274
275
# agreement comparison (support up to 6 classifications include current subtype)
276
agree.brca <- compAgree(moic.res  = cmoic.brca,
277
                        subt2comp = sub_df,
278
                        doPlot    = TRUE,
279
                        box.width = 0.2,
280
                        fig.name  = "AGREEMENT OF CONSENSUSMOIC WITH PAM50 Subtype")
281
282
# run DEA with limma
283
runDEA(dea.method = "limma",
284
       expr       = brca_dat[["MO"]][["Expression"]], # normalized expression data
285
       moic.res   = cmoic.brca,
286
       overwt = T,
287
       res.path   = getwd(), # path to save marker files
288
       prefix     = "de_TCGA-BRCA")
289
290
291
# choose limma result to identify subtype-specific DOWN-regulated biomarkers
292
marker.dn <- runMarker(moic.res      = cmoic.brca,
293
                       dea.method    = "limma",
294
                       prefix        = "de_TCGA-BRCA",
295
                       dirct         = "down",
296
                       dat.path = getwd(),
297
                       res.path = getwd(),
298
                       p.cutoff      = 0.05, # p cutoff to identify significant DEGs
299
                       p.adj.cutoff  = 0.05, # padj cutoff to identify significant DEGs                       
300
                       n.marker      = 200, # number of biomarkers for each subtype
301
                       doplot        = T,
302
                       annCol        = annCol,
303
                       annColors     = annColors,
304
                       norm.expr     = brca_dat[["MO"]][["Expression"]],
305
                       fig.name      = "UPREGULATED BIOMARKER HEATMAP")
306
307
308
# subtype-specific UP-regulated biomarkers
309
marker.up <- runMarker(moic.res      = cmoic.brca,
310
                       dea.method    = "limma",
311
                       prefix        = "de_TCGA-BRCA",
312
                       dirct         = "up",
313
                       dat.path = getwd(),
314
                       res.path = getwd(),
315
                       p.cutoff      = 0.05, # p cutoff to identify significant DEGs
316
                       p.adj.cutoff  = 0.05, # padj cutoff to identify significant DEGs                       
317
                       n.marker      = 200, # number of biomarkers for each subtype
318
                       doplot        = T,
319
                       annCol        = annCol,
320
                       annColors     = annColors,
321
                       norm.expr     = brca_dat[["MO"]][["Expression"]],
322
                       fig.name      = "UPREGULATED BIOMARKER HEATMAP")
323
324
325
# MUST locate ABSOLUTE path of msigdb file
326
MSIGDB.FILE <- system.file("extdata", "c5.bp.v7.1.symbols.xls", package = "MOVICS", mustWork = TRUE)
327
328
# # run GSEA to identify DOWN-regulated GO pathways using results from edgeR
329
gsea.down <- runGSEA(moic.res     = cmoic.brca,
330
                   dea.method   = "limma", # name of DEA method
331
                   prefix       = "de_TCGA-BRCA", # MUST be the same of argument in runDEA()
332
                   dat.path     = getwd(), # path of DEA files
333
                   res.path     = getwd(), # path to save GSEA files
334
                   msigdb.path  = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
335
                   norm.expr    = brca_dat[["MO"]][["Expression"]], # use normalized expression to calculate enrichment score
336
                   dirct        = "down", # direction of dysregulation in pathway
337
                   p.cutoff     = 0.05, # p cutoff to identify significant pathways
338
                   p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
339
                   gsva.method  = "gsva", # method to calculate single sample enrichment score
340
                   norm.method  = "mean", # normalization method to calculate subtype-specific enrichment score
341
                   fig.name     = "DOWNREGULATED PATHWAY HEATMAP")
342
343
344
data.frame(gsea.down$gsea.list$CS2[1:6,3:6])
345
346
head(round(gsea.down$grouped.es,3))
347
348
# # run GSEA to identify up-regulated GO pathways using results from limma
349
gsea.up <- runGSEA(moic.res     = cmoic.brca,
350
                   dea.method   = "limma", # name of DEA method
351
                   prefix       = "detesting_TCGA-BRCA", # MUST be the same of argument in runDEA()
352
                   dat.path     = getwd(), # path of DEA files
353
                   res.path     = getwd(), # path to save GSEA files
354
                   msigdb.path  = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
355
                   norm.expr    = brca_dat[["MO"]][["Expression"]], # use normalized expression to calculate enrichment score
356
                   dirct        = "up", # direction of dysregulation in pathway
357
                   p.cutoff     = 0.05, # p cutoff to identify significant pathways
358
                   p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
359
                   gsva.method  = "gsva", # method to calculate single sample enrichment score
360
                   norm.method  = "mean", # normalization method to calculate subtype-specific enrichment score
361
                   fig.name     = "UPREGULATED PATHWAY HEATMAP")
362
363
364
# MUST locate ABSOLUTE path of gene set file
365
GSET.FILE <- 
366
  system.file("extdata", "gene sets of interest.gmt", package = "MOVICS", mustWork = TRUE)
367
368
# run GSVA to estimate single sample enrichment score based on given gene set of interest
369
gsva.res <- 
370
  runGSVA(moic.res      = cmoic.brca,
371
          norm.expr     = brca_dat[["MO"]][["Expression"]],
372
          gset.gmt.path = GSET.FILE, # ABSOLUTE path of gene set file
373
          gsva.method   = "gsva", # method to calculate single sample enrichment score
374
          annCol        = annCol,
375
          annColors     = annColors,
376
          fig.path      = getwd(),
377
          fig.name      = "GENE SETS OF INTEREST HEATMAP",
378
          height        = 5,
379
          width         = 10)
380
381
382
message("check raw enrichment score")
383
gsva.res$raw.es[1:3,1:3]
384
385
message("check z-scored and truncated enrichment score")
386
gsva.res$scaled.es[1:3,1:3]
387