--- a
+++ b/Lecture 2/Lecture 2 Lab.r
@@ -0,0 +1,387 @@
+## to install MOVICS
+### note: there are many dependencies; you may get an error
+### for a missing package; download and try again
+### it may take several times
+### you can refer to their IMPORTS file from their DESCRIPTION file on Github
+### for a list of dependencies:
+### https://github.com/xlucpu/MOVICS/blob/master/DESCRIPTION
+
+# if (!requireNamespace("BiocManager", quietly = TRUE))
+#     install.packages("BiocManager")
+# if (!require("devtools")) 
+#     install.packages("devtools")
+# devtools::install_github("xlucpu/MOVICS")
+
+install.packages("~/Downloads/SNFtool_2.3.0.tar.gz", repos = NULL, type="source") 
+
+
+
+library(MOVICS)
+set.seed(4444)
+
+library("jpeg")
+jj <- readJPEG("MOVICS_pipeline.jpeg",native=TRUE)
+plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
+rasterImage(jj,0,0,1,1)
+
+
+jj <- readJPEG("methods_comparison.jpeg",native=TRUE)
+plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
+rasterImage(jj,0,0,1,1)
+
+
+# install.packages("Rfssa") if you want to download through github
+library(Rfssa)
+url <- "https://github.com/KechrisLab/ASAShortCourse-MultiOmics/blob/main/Lecture%202/brca_dat.Rdata"
+load_github_data(url)
+
+# load("brca_dat.Rdata")
+
+# let's get a quick look at our data
+names(brca_dat)
+paste("dim of clinical data:", dim(brca_dat[["clinical"]]))
+head(brca_dat[["clinical"]])
+
+# check sample names all match
+identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["Expression"]]))
+identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["Methylation"]]))
+identical(brca_dat[["clinical"]]$bcr_patient_barcode, colnames(brca_dat[["MO"]][["miRNA"]]))
+                                                                                
+identical(colnames(brca_dat[["MO"]][["Expression"]]), colnames(brca_dat[["MO"]][["Methylation"]]))
+identical(colnames(brca_dat[["MO"]][["Expression"]]), colnames(brca_dat[["MO"]][["miRNA"]]))
+identical(colnames(brca_dat[["MO"]][["Methylation"]]), colnames(brca_dat[["MO"]][["miRNA"]]))
+# should all be TRUE (6)
+
+paste("names of MO data:", names(brca_dat[["MO"]]))
+paste("dim of mRNA data:", dim(brca_dat[["MO"]][["Expression"]]))
+paste("dim of methylation data:", dim(brca_dat[["MO"]][["Methylation"]]))
+paste("dim of miRNA data:", dim(brca_dat[["MO"]][["miRNA"]]))
+
+# data checking -- are there any missing values?
+sum(is.na(brca_dat[["MO"]][["Expression"]]))
+sum(is.na(brca_dat[["MO"]][["Methylation"]]))
+sum(is.na(brca_dat[["MO"]][["miRNA"]]))
+
+# exp
+range(brca_dat[["MO"]][["Expression"]])
+plot(density(brca_dat[["MO"]][["Expression"]]), main = "Expression")
+
+# methyl
+range(brca_dat[["MO"]][["Methylation"]])
+plot(density(brca_dat[["MO"]][["Methylation"]]), main = "Methylation")
+
+# miRNA
+range(brca_dat[["MO"]][["miRNA"]])
+plot(density(brca_dat[["MO"]][["miRNA"]]), main = "miRNA")
+
+
+jj <- readJPEG("ex_breastcancer_pic.jpeg")
+plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
+rasterImage(jj,0,0,1,1)
+# https://doi.org/10.1016/B978-0-12-800886-7.00021-2
+
+optk.brca <- getClustNum(data        = brca_dat[["MO"]],
+                         is.binary   = c(F,F,F), # all omics data is continuous (not binary)
+                         try.N.clust = 2:8, # try cluster number from 2 to 8
+                         fig.name    = "CLUSTER NUMBER OF TCGA-BRCA")
+
+
+
+optk.brca
+
+# what if we use the suggested k=3 clusters?
+# you don't need to run this during lab; I'm just presenting it as an example
+mo_rslts_3 <- getMOIC(data = brca_dat[["MO"]],
+                         methodslist = list("SNF", "PINSPlus", "NEMO", "LRAcluster", "IntNMF"),
+                         N.clust     = 3,
+                         type        = c("gaussian", "gaussian", "gaussian"))
+
+cmoic.brca_3 <- getConsensusMOIC(moic.res.list = mo_rslts_3,
+                               fig.name      = "CONSENSUS HEATMAP - 3 Clusters",
+                               distance      = "euclidean",
+                               linkage       = "average")
+
+getSilhouette(sil      = cmoic.brca_3$sil, # a sil object returned by getConsensusMOIC()
+              fig.path = getwd(),
+              fig.name = "SILHOUETTE",
+              height   = 5.5,
+              width    = 5)
+
+mo_rslts <- getMOIC(data = brca_dat[["MO"]],
+                         methodslist = list("SNF", "PINSPlus", "NEMO", "LRAcluster", "IntNMF"),
+                         N.clust     = 4, # set number of clusters
+                         type        = c("gaussian", "gaussian", "gaussian")) # what is the distribution of the datasets in MO list (same order)
+
+
+cmoic.brca <- getConsensusMOIC(moic.res.list = mo_rslts,
+                               fig.name      = "CONSENSUS HEATMAP - 4 Clusters",
+                               distance      = "euclidean",
+                               linkage       = "ward.D")
+
+getSilhouette(sil      = cmoic.brca$sil, # a sil object returned by getConsensusMOIC()
+              fig.path = getwd(),
+              fig.name = "SILHOUETTE",
+              height   = 5.5,
+              width    = 5)
+
+# convert beta value to M value for stronger signal
+std_dat <- brca_dat[["MO"]]
+std_dat[["Methylation"]] <- log2(std_dat[["Methylation"]] / (1 - std_dat[["Methylation"]]))
+
+# data normalization for heatmap
+plotdata <- getStdiz(data       = std_dat,
+                     halfwidth  = c(2,2,2), # no truncation for mutation
+                     centerFlag = c(T,T,T), # no center for mutation
+                     scaleFlag  = c(T,T,T)) # no scale for mutation
+
+mRNA.col   <- c("#00FF00", "#008000", "#000000", "#800000", "#FF0000")
+meth.col   <- c("#0074FE", "#96EBF9", "#FEE900", "#F00003")
+miRNA.col <- c("#6699CC", "white"  , "#FF3C38")
+col.list   <- list(mRNA.col, meth.col, miRNA.col)
+
+# extract PAM50, pathologic stage for sample annotation
+annCol    <- brca_dat[["clinical"]][,c("BRCA_Subtype_PAM50"), drop = FALSE]
+
+# generate corresponding colors for sample annotation
+annColors <- list(
+                  BRCA_Subtype_PAM50  = c("Basal" = "blue",
+                            "Her2"   = "red",
+                            "LumA"   = "yellow",
+                            "LumB"   = "green",
+                            "Normal" = "black")
+                )
+
+
+
+# comprehensive heatmap
+getMoHeatmap(data          = plotdata,
+             row.title     = names(std_dat),
+             is.binary     = c(F,F,F), # we don't have any binary omics data (ex mutation)
+             legend.name   = c("mRNA","M value","miRNA"),
+             clust.res     = mo_rslts$SNF$clust.res, # cluster results for SNF
+             color         = col.list,
+             # annCol        = annCol, # annotation for samples (if you want to show PAM50 classes too)
+             # annColors     = annColors, # annotation color
+             width         = 10, # width of each subheatmap
+             height        = 5, # height of each subheatmap
+             fig.name      = "COMPREHENSIVE HEATMAP OF SNF")
+
+getMoHeatmap(data          = plotdata,
+             row.title     = names(std_dat),
+             is.binary     = c(F,F,F), # all data is continuous
+             legend.name   = c("mRNA","M value","miRNA"),
+             clust.res     = mo_rslts$PINSPlus$clust.res, # cluster results for PINSPlus
+             color         = col.list,
+             width         = 10, # width of each subheatmap
+             height        = 5, # height of each subheatmap
+             fig.name      = "COMPREHENSIVE HEATMAP OF PINSPlus")
+
+# comprehensive heatmap (may take a while)
+getMoHeatmap(data          = plotdata,
+             row.title     = names(std_dat),
+             is.binary     = c(F,F,F),
+             legend.name   = c("mRNA","M value","miRNA"),
+             clust.res     = mo_rslts$NEMO$clust.res, # cluster results for NEMO
+             color         = col.list,
+             width         = 10, # width of each subheatmap
+             height        = 5, # height of each subheatmap
+             fig.name      = "COMPREHENSIVE HEATMAP OF PINSPlus")
+
+# comprehensive heatmap (may take a while)
+getMoHeatmap(data          = plotdata,
+             row.title     = names(std_dat),
+             is.binary     = c(F,F,F),
+             legend.name   = c("mRNA","M value","miRNA"),
+             clust.res     = mo_rslts$LRAcluster$clust.res, # cluster results for LRAcluster
+             color         = col.list,
+             width         = 10, 
+             height        = 5, 
+             fig.name      = "COMPREHENSIVE HEATMAP OF PINSPlus")
+
+# comprehensive heatmap (may take a while)
+getMoHeatmap(data          = plotdata,
+             row.title     = names(std_dat),
+             is.binary     = c(F,F,F),
+             legend.name   = c("mRNA","M value","miRNA"),
+             clust.res     = mo_rslts$IntNMF$clust.res, # cluster results for intNMF
+             color         = col.list,
+             width         = 10, # width of each subheatmap
+             height        = 5, # height of each subheatmap
+             fig.name      = "COMPREHENSIVE HEATMAP OF IntNMF")
+
+getMoHeatmap(data          = plotdata,
+             row.title     = names(plotdata),
+             is.binary     = c(F,F,F), # no binary omics data
+             legend.name   = c("mRNA","M value","miRNA"),
+             clust.res     = cmoic.brca$clust.res, # consensusMOIC results
+             clust.dend    = NULL, # show no dendrogram for samples
+             show.colnames = FALSE, # show no sample names
+             show.row.dend = c(T,T,T), # show dendrogram for features
+             annRow        = NULL, # no selected features
+             color         = col.list,
+             annCol        = annCol, # annotation for samples
+             annColors     = annColors, # annotation color
+             width         = 10, # width of each subheatmap
+             height        = 5, # height of each subheatmap
+             fig.name      = "COMPREHENSIVE HEATMAP OF CONSENSUSMOIC")
+
+clust_rslts_SNF_df <- data.frame(mo_rslts$SNF$clust.res)
+colnames(clust_rslts_SNF_df) <- c("samID", "SNF")
+clust_rslts_CM_df <- data.frame(cmoic.brca$clust.res)
+colnames(clust_rslts_CM_df) <- c("samID", "Consensus")
+
+clust_rslts_df <- merge(clust_rslts_SNF_df, clust_rslts_CM_df, by="samID")
+# head(clust_rslts_df)
+
+table(clust_rslts_df$SNF, clust_rslts_df$Consensus)
+
+# survival comparison
+brca_dat[["clinical"]]$futime = as.numeric(brca_dat[["clinical"]]$futime)
+head(brca_dat[["clinical"]])
+surv.brca <- compSurv(moic.res         = cmoic.brca,
+                      surv.info        = brca_dat[["clinical"]],
+                      convt.time       = "m", # convert day unit to month
+                      surv.median.line = "h", # draw horizontal line at median survival
+                      xyrs.est         = c(5,10), # estimate 5 and 10-year survival
+                      fig.name         = "KAPLAN-MEIER CURVE OF CONSENSUSMOIC")
+
+
+print(surv.brca)
+
+# clinVars_df <- brca_dat[["clinical"]][,c("ajcc_pathologic_stage", "age_at_diagnosis","ajcc_pathologic_t", "ajcc_pathologic_n","ajcc_pathologic_m")]
+clinVars_df <- brca_dat[["clinical"]]
+rownames(clinVars_df) <- clinVars_df$bcr_patient_barcode
+clinVars_df <- clinVars_df[,c( "ajcc_pathologic_stage", "ajcc_pathologic_t", "ajcc_pathologic_n", "ajcc_pathologic_m", "age_at_diagnosis")]
+head(clinVars_df)
+
+
+clin.brca <- compClinvar(moic.res      = cmoic.brca,
+                         var2comp      = clinVars_df, # data.frame needs to summarize (must has row names of samples)
+                         strata        = "Subtype", # stratifying variable (e.g., Subtype in this example)
+                         # factorVars    = c("ajcc_pathologic_stage"), # features that are considered categorical variables
+                         factorVars    = c("ajcc_pathologic_stage", "ajcc_pathologic_t", "ajcc_pathologic_n", "ajcc_pathologic_m"), # features that are considered categorical variables
+                         nonnormalVars = "age_at_diagnosis", # feature(s) that are considered using nonparametric test
+                         exactVars     = NULL, # feature(s) that are considered using exact test
+                         doWord        = FALSE, # generate .docx file in local path
+                         tab.name      = "SUMMARIZATION OF CLINICAL FEATURES")
+clin.brca
+
+# compare agreement with other subtypes
+sub_df = data.frame(
+                    BRCA_Subtype_PAM50 = brca_dat[["clinical"]][,c("BRCA_Subtype_PAM50")])
+rownames(sub_df) = brca_dat[["clinical"]][,c("bcr_patient_barcode")]
+head(sub_df)
+
+# agreement comparison (support up to 6 classifications include current subtype)
+agree.brca <- compAgree(moic.res  = cmoic.brca,
+                        subt2comp = sub_df,
+                        doPlot    = TRUE,
+                        box.width = 0.2,
+                        fig.name  = "AGREEMENT OF CONSENSUSMOIC WITH PAM50 Subtype")
+
+# run DEA with limma
+runDEA(dea.method = "limma",
+       expr       = brca_dat[["MO"]][["Expression"]], # normalized expression data
+       moic.res   = cmoic.brca,
+       overwt = T,
+       res.path   = getwd(), # path to save marker files
+       prefix     = "de_TCGA-BRCA")
+
+
+# choose limma result to identify subtype-specific DOWN-regulated biomarkers
+marker.dn <- runMarker(moic.res      = cmoic.brca,
+                       dea.method    = "limma",
+                       prefix        = "de_TCGA-BRCA",
+                       dirct         = "down",
+                       dat.path = getwd(),
+                       res.path = getwd(),
+                       p.cutoff      = 0.05, # p cutoff to identify significant DEGs
+                       p.adj.cutoff  = 0.05, # padj cutoff to identify significant DEGs                       
+                       n.marker      = 200, # number of biomarkers for each subtype
+                       doplot        = T,
+                       annCol        = annCol,
+                       annColors     = annColors,
+                       norm.expr     = brca_dat[["MO"]][["Expression"]],
+                       fig.name      = "UPREGULATED BIOMARKER HEATMAP")
+
+
+# subtype-specific UP-regulated biomarkers
+marker.up <- runMarker(moic.res      = cmoic.brca,
+                       dea.method    = "limma",
+                       prefix        = "de_TCGA-BRCA",
+                       dirct         = "up",
+                       dat.path = getwd(),
+                       res.path = getwd(),
+                       p.cutoff      = 0.05, # p cutoff to identify significant DEGs
+                       p.adj.cutoff  = 0.05, # padj cutoff to identify significant DEGs                       
+                       n.marker      = 200, # number of biomarkers for each subtype
+                       doplot        = T,
+                       annCol        = annCol,
+                       annColors     = annColors,
+                       norm.expr     = brca_dat[["MO"]][["Expression"]],
+                       fig.name      = "UPREGULATED BIOMARKER HEATMAP")
+
+
+# MUST locate ABSOLUTE path of msigdb file
+MSIGDB.FILE <- system.file("extdata", "c5.bp.v7.1.symbols.xls", package = "MOVICS", mustWork = TRUE)
+
+# # run GSEA to identify DOWN-regulated GO pathways using results from edgeR
+gsea.down <- runGSEA(moic.res     = cmoic.brca,
+                   dea.method   = "limma", # name of DEA method
+                   prefix       = "de_TCGA-BRCA", # MUST be the same of argument in runDEA()
+                   dat.path     = getwd(), # path of DEA files
+                   res.path     = getwd(), # path to save GSEA files
+                   msigdb.path  = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
+                   norm.expr    = brca_dat[["MO"]][["Expression"]], # use normalized expression to calculate enrichment score
+                   dirct        = "down", # direction of dysregulation in pathway
+                   p.cutoff     = 0.05, # p cutoff to identify significant pathways
+                   p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
+                   gsva.method  = "gsva", # method to calculate single sample enrichment score
+                   norm.method  = "mean", # normalization method to calculate subtype-specific enrichment score
+                   fig.name     = "DOWNREGULATED PATHWAY HEATMAP")
+
+
+data.frame(gsea.down$gsea.list$CS2[1:6,3:6])
+
+head(round(gsea.down$grouped.es,3))
+
+# # run GSEA to identify up-regulated GO pathways using results from limma
+gsea.up <- runGSEA(moic.res     = cmoic.brca,
+                   dea.method   = "limma", # name of DEA method
+                   prefix       = "detesting_TCGA-BRCA", # MUST be the same of argument in runDEA()
+                   dat.path     = getwd(), # path of DEA files
+                   res.path     = getwd(), # path to save GSEA files
+                   msigdb.path  = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
+                   norm.expr    = brca_dat[["MO"]][["Expression"]], # use normalized expression to calculate enrichment score
+                   dirct        = "up", # direction of dysregulation in pathway
+                   p.cutoff     = 0.05, # p cutoff to identify significant pathways
+                   p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
+                   gsva.method  = "gsva", # method to calculate single sample enrichment score
+                   norm.method  = "mean", # normalization method to calculate subtype-specific enrichment score
+                   fig.name     = "UPREGULATED PATHWAY HEATMAP")
+
+
+# MUST locate ABSOLUTE path of gene set file
+GSET.FILE <- 
+  system.file("extdata", "gene sets of interest.gmt", package = "MOVICS", mustWork = TRUE)
+
+# run GSVA to estimate single sample enrichment score based on given gene set of interest
+gsva.res <- 
+  runGSVA(moic.res      = cmoic.brca,
+          norm.expr     = brca_dat[["MO"]][["Expression"]],
+          gset.gmt.path = GSET.FILE, # ABSOLUTE path of gene set file
+          gsva.method   = "gsva", # method to calculate single sample enrichment score
+          annCol        = annCol,
+          annColors     = annColors,
+          fig.path      = getwd(),
+          fig.name      = "GENE SETS OF INTEREST HEATMAP",
+          height        = 5,
+          width         = 10)
+
+
+message("check raw enrichment score")
+gsva.res$raw.es[1:3,1:3]
+
+message("check z-scored and truncated enrichment score")
+gsva.res$scaled.es[1:3,1:3]
+