a b/common_scripts/pathway_analysis/functions.GSEA.R
1
2
wrapper.GSEA=function(i, logicalVectors, data, cls.vector, GENESETS, datatype=NULL, clsname="", WD=getwd(), OUTDIR=getwd()){
3
  if(is.null(datatype)){stop("specify data type (B, N)")}
4
  
5
  # filter matrix and cls.vector
6
  data=data[,logicalVectors[[i]]&!is.na(logicalVectors)]
7
  cls.vector=cls.vector[logicalVectors[[i]]&!is.na(logicalVectors)]
8
  dataname=names(logicalVectors)[i]
9
10
  command=run.GSEA(data, cls.vector, GENESETS, datatype, dataname, clsname, WD, OUTDIR)
11
}
12
13
# function to go through
14
run.GSEA=function(data, cls.vector, GENESETS, datatype=NULL, dataname="", clsname="", WD=getwd(), OUTDIR=getwd(), GSEA_home="~/gsea-3.0.jar"){
15
  
16
  clsname=gsub("^_|_$", "", gsub("[[:punct:]]", "_", clsname)) # no special allowed here
17
  
18
  if(is.null(datatype)){stop("specify data type (B, N)")}
19
  
20
  # PATHS, create if needed
21
  if(!dir.exists(WD)){dir.create(WD, recursive = T); cat("Working directory made", sep="\n\n")}
22
  if(!dir.exists(OUTDIR)){dir.create(OUTDIR, recursive = T); cat("Output directory made", sep="\n\n")}
23
  setwd(WD)
24
  
25
  if(datatype=="N"){
26
    data=data[,order(as.numeric(cls.vector), decreasing=T)]
27
    cls.vector=cls.vector[order(as.numeric(cls.vector), decreasing=T)]
28
    CLUSTER_COMPARISON="continuous_phenotype"
29
  }
30
  if(datatype=="B"){
31
    data=data[,order(as.numeric(cls.vector), decreasing=T)]
32
    cls.vector=cls.vector[order(as.numeric(cls.vector), decreasing=T)]
33
    CLUSTER_COMPARISON=paste(unique(cls.vector), collapse="_versus_")
34
  }
35
  
36
  print("data sorted based on cls vector")
37
  #*********************** Need gct? ***************************
38
  
39
  NAME_OUT2=paste(dataname, gsub("-", "_", clsname), sep="_")
40
  GEXP=paste(WD, "/", NAME_OUT2, ".gct", sep="")
41
  
42
  print("making gct file")
43
  
44
  gsea.write.gct(data, GEXP)
45
  
46
  print("gct ready")
47
  
48
  #****************************** genesets ***********************************
49
  if(is.list(GENESETS)){
50
    writeGMT(GENESETS, "geneset_temp.gmt")
51
    print("gmt made from named list")
52
    GENESETS="geneset_temp.gmt"
53
  }
54
  # no need to do anything
55
  
56
  #*************************************************************************************
57
  
58
  if(datatype=="N"){
59
    print("this is numeric feature class, making class file")
60
    
61
    A1=paste("#numeric", sep = "\t")
62
    A2=paste("#feat", sep = "\t")
63
    A3=t(data.frame(as.character(cls.vector)))
64
    CLS=paste0(OUTDIR,"/",dataname, "_", CLUSTER_COMPARISON, "_", gsub("-", "_", clsname), ".cls")
65
    write(A1, file = CLS, append = F) # All .cls files require this dummy header line.
66
    write(A2, file = CLS, append = T)
67
    write.table(A3, file = CLS, append = T, quote = F, sep = "\t",
68
                na = "", row.names = F, col.names = F)
69
    PARAMS=" -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute sample -rnd_type no_balance -scoring_scheme weighted -metric Pearson -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 25 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 5 -zip_report false -gui false"
70
  }
71
  
72
  if(datatype=="B"){
73
    print("this is binary/two group feature class")
74
    
75
    # make vector with labels
76
    cls.vector=cls.vector[!is.na(cls.vector), drop=F]
77
    
78
    A1=paste(length(cls.vector), 2, 1, sep = "\t")
79
    A2=paste("#", "1", "0", sep = "\t")
80
    A3=t(data.frame(as.character(cls.vector)))
81
    CLS=paste0(OUTDIR,"/", dataname, "_", CLUSTER_COMPARISON, "_", gsub("-", "_", clsname), "_temp.cls")
82
    write(A1, file = CLS, append = F) # All .cls files require this dummy header line.
83
    write(A2, file = CLS, append = T)
84
    write.table(A3, file = CLS, append = T, quote = F, sep = "\t",
85
                na = "", row.names = F, col.names = F)
86
    
87
    PARAMS=" -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute sample -rnd_type no_balance -scoring_scheme weighted -metric Signal2Noise -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 25 -rnd_seed timestamp -save_rnd_lists false -zip_report false -gui false"
88
    
89
  }
90
  
91
  RUN=paste0("java -Xmx5000m -cp ", GSEA_home, " xtools.gsea.Gsea -res ", GEXP,
92
             " -cls ", CLS, 
93
             " -gmx ", GENESETS,
94
             " -rpt_label ", NAME_OUT2, "_", CLUSTER_COMPARISON,
95
             " -out ", OUTDIR, " ", PARAMS)
96
}
97
98
gsea.write.gct <- function(exprMat, gctFn){
99
  nGenes = nrow(exprMat)
100
  nConds = ncol(exprMat)
101
  write("#1.2", file = gctFn, append = F) # All .gct files require this dummy header line.
102
  write(paste(nGenes, nConds, sep = "\t"), file = gctFn, append = T)
103
  write(paste("Name", "Description", paste(colnames(exprMat), collapse = "\t"), sep = "\t"), file = gctFn, append = T)
104
  # The second column of the .gct file, "Description", is filled out with "na"'s. 
105
  rownames(exprMat) = paste(rownames(exprMat), "na", sep = "\t") # Append "\tna" to every gene clsname. 
106
  write.table(exprMat, file = gctFn, append = T, quote = F, sep = "\t",
107
              na = "", row.names = T, col.names = F)
108
}
109
110
writeGMT <- function #Create a gmt (gene matrix transposed) file
111
### Createss a gmt (gene matrix transposed) file such as those
112
### provided by mSigDB or geneSigDB, from an R list object.
113
### Function by Levi Waldron.
114
(object,
115
 ### R list object that will be converted to GMT file.  Each element
116
 ### should contain a vector of gene names, and the names of the
117
 ### elements will used for the gene set names
118
 fname
119
 ### Output file name for .gmt file
120
){
121
  if (class(object) != "list") stop("object should be of class 'list'")
122
  if(file.exists(fname)) unlink(fname)
123
  for (iElement in 1:length(object)){
124
    write.table(t(c(make.names(rep(names(object)[iElement],2)),object[[iElement]])),
125
                sep="\t",quote=FALSE,
126
                file=fname,append=TRUE,col.names=FALSE,row.names=FALSE)
127
  }
128
  ### Called for the effect of writing a .gmt file
129
}
130
131
132
GSEA = function(gene_list, genesets, pval) {
133
  set.seed(54321)
134
  library(dplyr)
135
  library(gage)
136
  library(fgsea)
137
  
138
  if ( any( duplicated(names(gene_list)) )  ) {
139
    warning("Duplicates in gene names")
140
    gene_list = gene_list[!duplicated(names(gene_list))]
141
  }
142
  if  ( !all( order(gene_list, decreasing = TRUE) == 1:length(gene_list)) ){
143
    warning("Gene list not sorted")
144
    gene_list = sort(gene_list, decreasing = TRUE)
145
  }
146
  
147
  # if not list:
148
  if(is.character(genesets)){
149
    genesets = fgsea::gmtPathways(genesets)
150
  }
151
  
152
  fgRes <- fgsea::fgsea(pathways = genesets, 
153
                        stats = gene_list,
154
                        minSize=15,
155
                        maxSize=600,
156
                        nperm=10000) %>% 
157
    as.data.frame() %>% 
158
    dplyr::filter(padj < !!pval)
159
  #print(dim(fgRes))
160
  
161
  ## Filter FGSEA by using gage results. Must be significant and in same direction to keep 
162
  gaRes = gage::gage(gene_list, gsets=genesets, same.dir=TRUE, set.size =c(15,600))
163
  
164
  ups = as.data.frame(gaRes$greater) %>% 
165
    tibble::rownames_to_column("Pathway") %>% 
166
    dplyr::filter(!is.na(p.geomean) & q.val < pval ) %>%
167
    dplyr::select("Pathway")
168
  
169
  downs = as.data.frame(gaRes$less) %>% 
170
    tibble::rownames_to_column("Pathway") %>% 
171
    dplyr::filter(!is.na(p.geomean) & q.val < pval ) %>%
172
    dplyr::select("Pathway")
173
  
174
  #print(dim(rbind(ups,downs)))
175
  keepups = fgRes[fgRes$NES > 0 & !is.na(match(fgRes$pathway, ups$Pathway)), ]
176
  keepdowns = fgRes[fgRes$NES < 0 & !is.na(match(fgRes$pathway, downs$Pathway)), ]
177
  
178
  ### Collapse redundant pathways
179
  Up = fgsea::collapsePathways(keepups, pathways = genesets, stats = gene_list,  nperm = 500, pval.threshold = 0.05)
180
  Down = fgsea::collapsePathways(keepdowns, genesets, gene_list,  nperm = 500, pval.threshold = 0.05) 
181
  
182
  fgRes = fgRes[ !is.na(match(fgRes$pathway, 
183
                              c( Up$mainPathways, Down$mainPathways))), ] %>% 
184
    arrange(desc(NES))
185
  fgRes$pathway = stringr::str_replace(fgRes$pathway, "GO_" , "")
186
  
187
  fgRes$Enrichment = ifelse(fgRes$NES > 0, "Up-regulated", "Down-regulated")
188
  filtRes = rbind(head(fgRes, n = 10),
189
                  tail(fgRes, n = 10 ))
190
  g = ggplot(filtRes, aes(reorder(pathway, NES), NES)) +
191
    geom_segment( aes(reorder(pathway, NES), xend=pathway, y=0, yend=NES)) +
192
    geom_point( size=5, aes( fill = Enrichment),
193
                shape=21, stroke=2) +
194
    scale_fill_manual(values = c("Down-regulated" = "dodgerblue",
195
                                 "Up-regulated" = "firebrick") ) +
196
    coord_flip() +
197
    labs(x="Pathway", y="Normalized Enrichment Score",
198
         title="GSEA - Biological Processes") + 
199
    theme_minimal()
200
  
201
  output = list("Results" = fgRes, "Plot" = g)
202
  return(output)
203
}