[0be70f]: / Lecture 2 / Lecture 2 Lab.r

Download this file

388 lines (312 with data), 18.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
## to install MOVICS
### note: there are many dependencies; you may get an error
### for a missing package; download and try again
### it may take several times
### you can refer to their IMPORTS file from their DESCRIPTION file on Github
### for a list of dependencies:
### https://github.com/xlucpu/MOVICS/blob/master/DESCRIPTION
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# if (!require("devtools"))
# install.packages("devtools")
# devtools::install_github("xlucpu/MOVICS")
install.packages("~/Downloads/SNFtool_2.3.0.tar.gz", repos = NULL, type="source")
library(MOVICS)
set.seed(4444)
library("jpeg")
jj <- readJPEG("MOVICS_pipeline.jpeg",native=TRUE)
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
rasterImage(jj,0,0,1,1)
jj <- readJPEG("methods_comparison.jpeg",native=TRUE)
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
rasterImage(jj,0,0,1,1)
# install.packages("Rfssa") if you want to download through github
library(Rfssa)
url <- "https://github.com/KechrisLab/ASAShortCourse-MultiOmics/blob/main/Lecture%202/brca_dat.Rdata"
load_github_data(url)
# load("brca_dat.Rdata")
# let's get a quick look at our data
names(brca_dat)
paste("dim of clinical data:", dim(brca_dat[["clinical"]]))
head(brca_dat[["clinical"]])
# check sample names all match
identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["Expression"]]))
identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["Methylation"]]))
identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["miRNA"]]))
identical(colnames(brca_dat[["MO"]][["Expression"]]), colnames(brca_dat[["MO"]][["Methylation"]]))
identical(colnames(brca_dat[["MO"]][["Expression"]]), colnames(brca_dat[["MO"]][["miRNA"]]))
identical(colnames(brca_dat[["MO"]][["Methylation"]]), colnames(brca_dat[["MO"]][["miRNA"]]))
# should all be TRUE (6)
paste("names of MO data:", names(brca_dat[["MO"]]))
paste("dim of mRNA data:", dim(brca_dat[["MO"]][["Expression"]]))
paste("dim of methylation data:", dim(brca_dat[["MO"]][["Methylation"]]))
paste("dim of miRNA data:", dim(brca_dat[["MO"]][["miRNA"]]))
# data checking -- are there any missing values?
sum(is.na(brca_dat[["MO"]][["Expression"]]))
sum(is.na(brca_dat[["MO"]][["Methylation"]]))
sum(is.na(brca_dat[["MO"]][["miRNA"]]))
# exp
range(brca_dat[["MO"]][["Expression"]])
plot(density(brca_dat[["MO"]][["Expression"]]), main = "Expression")
# methyl
range(brca_dat[["MO"]][["Methylation"]])
plot(density(brca_dat[["MO"]][["Methylation"]]), main = "Methylation")
# miRNA
range(brca_dat[["MO"]][["miRNA"]])
plot(density(brca_dat[["MO"]][["miRNA"]]), main = "miRNA")
jj <- readJPEG("ex_breastcancer_pic.jpeg")
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
rasterImage(jj,0,0,1,1)
# https://doi.org/10.1016/B978-0-12-800886-7.00021-2
optk.brca <- getClustNum(data = brca_dat[["MO"]],
is.binary = c(F,F,F), # all omics data is continuous (not binary)
try.N.clust = 2:8, # try cluster number from 2 to 8
fig.name = "CLUSTER NUMBER OF TCGA-BRCA")
optk.brca
# what if we use the suggested k=3 clusters?
# you don't need to run this during lab; I'm just presenting it as an example
mo_rslts_3 <- getMOIC(data = brca_dat[["MO"]],
methodslist = list("SNF", "PINSPlus", "NEMO", "LRAcluster", "IntNMF"),
N.clust = 3,
type = c("gaussian", "gaussian", "gaussian"))
cmoic.brca_3 <- getConsensusMOIC(moic.res.list = mo_rslts_3,
fig.name = "CONSENSUS HEATMAP - 3 Clusters",
distance = "euclidean",
linkage = "average")
getSilhouette(sil = cmoic.brca_3$sil, # a sil object returned by getConsensusMOIC()
fig.path = getwd(),
fig.name = "SILHOUETTE",
height = 5.5,
width = 5)
mo_rslts <- getMOIC(data = brca_dat[["MO"]],
methodslist = list("SNF", "PINSPlus", "NEMO", "LRAcluster", "IntNMF"),
N.clust = 4, # set number of clusters
type = c("gaussian", "gaussian", "gaussian")) # what is the distribution of the datasets in MO list (same order)
cmoic.brca <- getConsensusMOIC(moic.res.list = mo_rslts,
fig.name = "CONSENSUS HEATMAP - 4 Clusters",
distance = "euclidean",
linkage = "ward.D")
getSilhouette(sil = cmoic.brca$sil, # a sil object returned by getConsensusMOIC()
fig.path = getwd(),
fig.name = "SILHOUETTE",
height = 5.5,
width = 5)
# convert beta value to M value for stronger signal
std_dat <- brca_dat[["MO"]]
std_dat[["Methylation"]] <- log2(std_dat[["Methylation"]] / (1 - std_dat[["Methylation"]]))
# data normalization for heatmap
plotdata <- getStdiz(data = std_dat,
halfwidth = c(2,2,2), # no truncation for mutation
centerFlag = c(T,T,T), # no center for mutation
scaleFlag = c(T,T,T)) # no scale for mutation
mRNA.col <- c("#00FF00", "#008000", "#000000", "#800000", "#FF0000")
meth.col <- c("#0074FE", "#96EBF9", "#FEE900", "#F00003")
miRNA.col <- c("#6699CC", "white" , "#FF3C38")
col.list <- list(mRNA.col, meth.col, miRNA.col)
# extract PAM50, pathologic stage for sample annotation
annCol <- brca_dat[["clinical"]][,c("BRCA_Subtype_PAM50"), drop = FALSE]
# generate corresponding colors for sample annotation
annColors <- list(
BRCA_Subtype_PAM50 = c("Basal" = "blue",
"Her2" = "red",
"LumA" = "yellow",
"LumB" = "green",
"Normal" = "black")
)
# comprehensive heatmap
getMoHeatmap(data = plotdata,
row.title = names(std_dat),
is.binary = c(F,F,F), # we don't have any binary omics data (ex mutation)
legend.name = c("mRNA","M value","miRNA"),
clust.res = mo_rslts$SNF$clust.res, # cluster results for SNF
color = col.list,
# annCol = annCol, # annotation for samples (if you want to show PAM50 classes too)
# annColors = annColors, # annotation color
width = 10, # width of each subheatmap
height = 5, # height of each subheatmap
fig.name = "COMPREHENSIVE HEATMAP OF SNF")
getMoHeatmap(data = plotdata,
row.title = names(std_dat),
is.binary = c(F,F,F), # all data is continuous
legend.name = c("mRNA","M value","miRNA"),
clust.res = mo_rslts$PINSPlus$clust.res, # cluster results for PINSPlus
color = col.list,
width = 10, # width of each subheatmap
height = 5, # height of each subheatmap
fig.name = "COMPREHENSIVE HEATMAP OF PINSPlus")
# comprehensive heatmap (may take a while)
getMoHeatmap(data = plotdata,
row.title = names(std_dat),
is.binary = c(F,F,F),
legend.name = c("mRNA","M value","miRNA"),
clust.res = mo_rslts$NEMO$clust.res, # cluster results for NEMO
color = col.list,
width = 10, # width of each subheatmap
height = 5, # height of each subheatmap
fig.name = "COMPREHENSIVE HEATMAP OF PINSPlus")
# comprehensive heatmap (may take a while)
getMoHeatmap(data = plotdata,
row.title = names(std_dat),
is.binary = c(F,F,F),
legend.name = c("mRNA","M value","miRNA"),
clust.res = mo_rslts$LRAcluster$clust.res, # cluster results for LRAcluster
color = col.list,
width = 10,
height = 5,
fig.name = "COMPREHENSIVE HEATMAP OF PINSPlus")
# comprehensive heatmap (may take a while)
getMoHeatmap(data = plotdata,
row.title = names(std_dat),
is.binary = c(F,F,F),
legend.name = c("mRNA","M value","miRNA"),
clust.res = mo_rslts$IntNMF$clust.res, # cluster results for intNMF
color = col.list,
width = 10, # width of each subheatmap
height = 5, # height of each subheatmap
fig.name = "COMPREHENSIVE HEATMAP OF IntNMF")
getMoHeatmap(data = plotdata,
row.title = names(plotdata),
is.binary = c(F,F,F), # no binary omics data
legend.name = c("mRNA","M value","miRNA"),
clust.res = cmoic.brca$clust.res, # consensusMOIC results
clust.dend = NULL, # show no dendrogram for samples
show.colnames = FALSE, # show no sample names
show.row.dend = c(T,T,T), # show dendrogram for features
annRow = NULL, # no selected features
color = col.list,
annCol = annCol, # annotation for samples
annColors = annColors, # annotation color
width = 10, # width of each subheatmap
height = 5, # height of each subheatmap
fig.name = "COMPREHENSIVE HEATMAP OF CONSENSUSMOIC")
clust_rslts_SNF_df <- data.frame(mo_rslts$SNF$clust.res)
colnames(clust_rslts_SNF_df) <- c("samID", "SNF")
clust_rslts_CM_df <- data.frame(cmoic.brca$clust.res)
colnames(clust_rslts_CM_df) <- c("samID", "Consensus")
clust_rslts_df <- merge(clust_rslts_SNF_df, clust_rslts_CM_df, by="samID")
# head(clust_rslts_df)
table(clust_rslts_df$SNF, clust_rslts_df$Consensus)
# survival comparison
brca_dat[["clinical"]]$futime = as.numeric(brca_dat[["clinical"]]$futime)
head(brca_dat[["clinical"]])
surv.brca <- compSurv(moic.res = cmoic.brca,
surv.info = brca_dat[["clinical"]],
convt.time = "m", # convert day unit to month
surv.median.line = "h", # draw horizontal line at median survival
xyrs.est = c(5,10), # estimate 5 and 10-year survival
fig.name = "KAPLAN-MEIER CURVE OF CONSENSUSMOIC")
print(surv.brca)
# clinVars_df <- brca_dat[["clinical"]][,c("ajcc_pathologic_stage", "age_at_diagnosis","ajcc_pathologic_t", "ajcc_pathologic_n","ajcc_pathologic_m")]
clinVars_df <- brca_dat[["clinical"]]
rownames(clinVars_df) <- clinVars_df$bcr_patient_barcode
clinVars_df <- clinVars_df[,c( "ajcc_pathologic_stage", "ajcc_pathologic_t", "ajcc_pathologic_n", "ajcc_pathologic_m", "age_at_diagnosis")]
head(clinVars_df)
clin.brca <- compClinvar(moic.res = cmoic.brca,
var2comp = clinVars_df, # data.frame needs to summarize (must has row names of samples)
strata = "Subtype", # stratifying variable (e.g., Subtype in this example)
# factorVars = c("ajcc_pathologic_stage"), # features that are considered categorical variables
factorVars = c("ajcc_pathologic_stage", "ajcc_pathologic_t", "ajcc_pathologic_n", "ajcc_pathologic_m"), # features that are considered categorical variables
nonnormalVars = "age_at_diagnosis", # feature(s) that are considered using nonparametric test
exactVars = NULL, # feature(s) that are considered using exact test
doWord = FALSE, # generate .docx file in local path
tab.name = "SUMMARIZATION OF CLINICAL FEATURES")
clin.brca
# compare agreement with other subtypes
sub_df = data.frame(
BRCA_Subtype_PAM50 = brca_dat[["clinical"]][,c("BRCA_Subtype_PAM50")])
rownames(sub_df) = brca_dat[["clinical"]][,c("bcr_patient_barcode")]
head(sub_df)
# agreement comparison (support up to 6 classifications include current subtype)
agree.brca <- compAgree(moic.res = cmoic.brca,
subt2comp = sub_df,
doPlot = TRUE,
box.width = 0.2,
fig.name = "AGREEMENT OF CONSENSUSMOIC WITH PAM50 Subtype")
# run DEA with limma
runDEA(dea.method = "limma",
expr = brca_dat[["MO"]][["Expression"]], # normalized expression data
moic.res = cmoic.brca,
overwt = T,
res.path = getwd(), # path to save marker files
prefix = "de_TCGA-BRCA")
# choose limma result to identify subtype-specific DOWN-regulated biomarkers
marker.dn <- runMarker(moic.res = cmoic.brca,
dea.method = "limma",
prefix = "de_TCGA-BRCA",
dirct = "down",
dat.path = getwd(),
res.path = getwd(),
p.cutoff = 0.05, # p cutoff to identify significant DEGs
p.adj.cutoff = 0.05, # padj cutoff to identify significant DEGs
n.marker = 200, # number of biomarkers for each subtype
doplot = T,
annCol = annCol,
annColors = annColors,
norm.expr = brca_dat[["MO"]][["Expression"]],
fig.name = "UPREGULATED BIOMARKER HEATMAP")
# subtype-specific UP-regulated biomarkers
marker.up <- runMarker(moic.res = cmoic.brca,
dea.method = "limma",
prefix = "de_TCGA-BRCA",
dirct = "up",
dat.path = getwd(),
res.path = getwd(),
p.cutoff = 0.05, # p cutoff to identify significant DEGs
p.adj.cutoff = 0.05, # padj cutoff to identify significant DEGs
n.marker = 200, # number of biomarkers for each subtype
doplot = T,
annCol = annCol,
annColors = annColors,
norm.expr = brca_dat[["MO"]][["Expression"]],
fig.name = "UPREGULATED BIOMARKER HEATMAP")
# MUST locate ABSOLUTE path of msigdb file
MSIGDB.FILE <- system.file("extdata", "c5.bp.v7.1.symbols.xls", package = "MOVICS", mustWork = TRUE)
# # run GSEA to identify DOWN-regulated GO pathways using results from edgeR
gsea.down <- runGSEA(moic.res = cmoic.brca,
dea.method = "limma", # name of DEA method
prefix = "de_TCGA-BRCA", # MUST be the same of argument in runDEA()
dat.path = getwd(), # path of DEA files
res.path = getwd(), # path to save GSEA files
msigdb.path = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
norm.expr = brca_dat[["MO"]][["Expression"]], # use normalized expression to calculate enrichment score
dirct = "down", # direction of dysregulation in pathway
p.cutoff = 0.05, # p cutoff to identify significant pathways
p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
gsva.method = "gsva", # method to calculate single sample enrichment score
norm.method = "mean", # normalization method to calculate subtype-specific enrichment score
fig.name = "DOWNREGULATED PATHWAY HEATMAP")
data.frame(gsea.down$gsea.list$CS2[1:6,3:6])
head(round(gsea.down$grouped.es,3))
# # run GSEA to identify up-regulated GO pathways using results from limma
gsea.up <- runGSEA(moic.res = cmoic.brca,
dea.method = "limma", # name of DEA method
prefix = "detesting_TCGA-BRCA", # MUST be the same of argument in runDEA()
dat.path = getwd(), # path of DEA files
res.path = getwd(), # path to save GSEA files
msigdb.path = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
norm.expr = brca_dat[["MO"]][["Expression"]], # use normalized expression to calculate enrichment score
dirct = "up", # direction of dysregulation in pathway
p.cutoff = 0.05, # p cutoff to identify significant pathways
p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
gsva.method = "gsva", # method to calculate single sample enrichment score
norm.method = "mean", # normalization method to calculate subtype-specific enrichment score
fig.name = "UPREGULATED PATHWAY HEATMAP")
# MUST locate ABSOLUTE path of gene set file
GSET.FILE <-
system.file("extdata", "gene sets of interest.gmt", package = "MOVICS", mustWork = TRUE)
# run GSVA to estimate single sample enrichment score based on given gene set of interest
gsva.res <-
runGSVA(moic.res = cmoic.brca,
norm.expr = brca_dat[["MO"]][["Expression"]],
gset.gmt.path = GSET.FILE, # ABSOLUTE path of gene set file
gsva.method = "gsva", # method to calculate single sample enrichment score
annCol = annCol,
annColors = annColors,
fig.path = getwd(),
fig.name = "GENE SETS OF INTEREST HEATMAP",
height = 5,
width = 10)
message("check raw enrichment score")
gsva.res$raw.es[1:3,1:3]
message("check z-scored and truncated enrichment score")
gsva.res$scaled.es[1:3,1:3]