[8e0848]: / common_scripts / pathway_analysis / functions.GSEA.R

Download this file

203 lines (165 with data), 8.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
wrapper.GSEA=function(i, logicalVectors, data, cls.vector, GENESETS, datatype=NULL, clsname="", WD=getwd(), OUTDIR=getwd()){
if(is.null(datatype)){stop("specify data type (B, N)")}
# filter matrix and cls.vector
data=data[,logicalVectors[[i]]&!is.na(logicalVectors)]
cls.vector=cls.vector[logicalVectors[[i]]&!is.na(logicalVectors)]
dataname=names(logicalVectors)[i]
command=run.GSEA(data, cls.vector, GENESETS, datatype, dataname, clsname, WD, OUTDIR)
}
# function to go through
run.GSEA=function(data, cls.vector, GENESETS, datatype=NULL, dataname="", clsname="", WD=getwd(), OUTDIR=getwd(), GSEA_home="~/gsea-3.0.jar"){
clsname=gsub("^_|_$", "", gsub("[[:punct:]]", "_", clsname)) # no special allowed here
if(is.null(datatype)){stop("specify data type (B, N)")}
# PATHS, create if needed
if(!dir.exists(WD)){dir.create(WD, recursive = T); cat("Working directory made", sep="\n\n")}
if(!dir.exists(OUTDIR)){dir.create(OUTDIR, recursive = T); cat("Output directory made", sep="\n\n")}
setwd(WD)
if(datatype=="N"){
data=data[,order(as.numeric(cls.vector), decreasing=T)]
cls.vector=cls.vector[order(as.numeric(cls.vector), decreasing=T)]
CLUSTER_COMPARISON="continuous_phenotype"
}
if(datatype=="B"){
data=data[,order(as.numeric(cls.vector), decreasing=T)]
cls.vector=cls.vector[order(as.numeric(cls.vector), decreasing=T)]
CLUSTER_COMPARISON=paste(unique(cls.vector), collapse="_versus_")
}
print("data sorted based on cls vector")
#*********************** Need gct? ***************************
NAME_OUT2=paste(dataname, gsub("-", "_", clsname), sep="_")
GEXP=paste(WD, "/", NAME_OUT2, ".gct", sep="")
print("making gct file")
gsea.write.gct(data, GEXP)
print("gct ready")
#****************************** genesets ***********************************
if(is.list(GENESETS)){
writeGMT(GENESETS, "geneset_temp.gmt")
print("gmt made from named list")
GENESETS="geneset_temp.gmt"
}
# no need to do anything
#*************************************************************************************
if(datatype=="N"){
print("this is numeric feature class, making class file")
A1=paste("#numeric", sep = "\t")
A2=paste("#feat", sep = "\t")
A3=t(data.frame(as.character(cls.vector)))
CLS=paste0(OUTDIR,"/",dataname, "_", CLUSTER_COMPARISON, "_", gsub("-", "_", clsname), ".cls")
write(A1, file = CLS, append = F) # All .cls files require this dummy header line.
write(A2, file = CLS, append = T)
write.table(A3, file = CLS, append = T, quote = F, sep = "\t",
na = "", row.names = F, col.names = F)
PARAMS=" -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute sample -rnd_type no_balance -scoring_scheme weighted -metric Pearson -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 25 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 5 -zip_report false -gui false"
}
if(datatype=="B"){
print("this is binary/two group feature class")
# make vector with labels
cls.vector=cls.vector[!is.na(cls.vector), drop=F]
A1=paste(length(cls.vector), 2, 1, sep = "\t")
A2=paste("#", "1", "0", sep = "\t")
A3=t(data.frame(as.character(cls.vector)))
CLS=paste0(OUTDIR,"/", dataname, "_", CLUSTER_COMPARISON, "_", gsub("-", "_", clsname), "_temp.cls")
write(A1, file = CLS, append = F) # All .cls files require this dummy header line.
write(A2, file = CLS, append = T)
write.table(A3, file = CLS, append = T, quote = F, sep = "\t",
na = "", row.names = F, col.names = F)
PARAMS=" -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute sample -rnd_type no_balance -scoring_scheme weighted -metric Signal2Noise -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 25 -rnd_seed timestamp -save_rnd_lists false -zip_report false -gui false"
}
RUN=paste0("java -Xmx5000m -cp ", GSEA_home, " xtools.gsea.Gsea -res ", GEXP,
" -cls ", CLS,
" -gmx ", GENESETS,
" -rpt_label ", NAME_OUT2, "_", CLUSTER_COMPARISON,
" -out ", OUTDIR, " ", PARAMS)
}
gsea.write.gct <- function(exprMat, gctFn){
nGenes = nrow(exprMat)
nConds = ncol(exprMat)
write("#1.2", file = gctFn, append = F) # All .gct files require this dummy header line.
write(paste(nGenes, nConds, sep = "\t"), file = gctFn, append = T)
write(paste("Name", "Description", paste(colnames(exprMat), collapse = "\t"), sep = "\t"), file = gctFn, append = T)
# The second column of the .gct file, "Description", is filled out with "na"'s.
rownames(exprMat) = paste(rownames(exprMat), "na", sep = "\t") # Append "\tna" to every gene clsname.
write.table(exprMat, file = gctFn, append = T, quote = F, sep = "\t",
na = "", row.names = T, col.names = F)
}
writeGMT <- function #Create a gmt (gene matrix transposed) file
### Createss a gmt (gene matrix transposed) file such as those
### provided by mSigDB or geneSigDB, from an R list object.
### Function by Levi Waldron.
(object,
### R list object that will be converted to GMT file. Each element
### should contain a vector of gene names, and the names of the
### elements will used for the gene set names
fname
### Output file name for .gmt file
){
if (class(object) != "list") stop("object should be of class 'list'")
if(file.exists(fname)) unlink(fname)
for (iElement in 1:length(object)){
write.table(t(c(make.names(rep(names(object)[iElement],2)),object[[iElement]])),
sep="\t",quote=FALSE,
file=fname,append=TRUE,col.names=FALSE,row.names=FALSE)
}
### Called for the effect of writing a .gmt file
}
GSEA = function(gene_list, genesets, pval) {
set.seed(54321)
library(dplyr)
library(gage)
library(fgsea)
if ( any( duplicated(names(gene_list)) ) ) {
warning("Duplicates in gene names")
gene_list = gene_list[!duplicated(names(gene_list))]
}
if ( !all( order(gene_list, decreasing = TRUE) == 1:length(gene_list)) ){
warning("Gene list not sorted")
gene_list = sort(gene_list, decreasing = TRUE)
}
# if not list:
if(is.character(genesets)){
genesets = fgsea::gmtPathways(genesets)
}
fgRes <- fgsea::fgsea(pathways = genesets,
stats = gene_list,
minSize=15,
maxSize=600,
nperm=10000) %>%
as.data.frame() %>%
dplyr::filter(padj < !!pval)
#print(dim(fgRes))
## Filter FGSEA by using gage results. Must be significant and in same direction to keep
gaRes = gage::gage(gene_list, gsets=genesets, same.dir=TRUE, set.size =c(15,600))
ups = as.data.frame(gaRes$greater) %>%
tibble::rownames_to_column("Pathway") %>%
dplyr::filter(!is.na(p.geomean) & q.val < pval ) %>%
dplyr::select("Pathway")
downs = as.data.frame(gaRes$less) %>%
tibble::rownames_to_column("Pathway") %>%
dplyr::filter(!is.na(p.geomean) & q.val < pval ) %>%
dplyr::select("Pathway")
#print(dim(rbind(ups,downs)))
keepups = fgRes[fgRes$NES > 0 & !is.na(match(fgRes$pathway, ups$Pathway)), ]
keepdowns = fgRes[fgRes$NES < 0 & !is.na(match(fgRes$pathway, downs$Pathway)), ]
### Collapse redundant pathways
Up = fgsea::collapsePathways(keepups, pathways = genesets, stats = gene_list, nperm = 500, pval.threshold = 0.05)
Down = fgsea::collapsePathways(keepdowns, genesets, gene_list, nperm = 500, pval.threshold = 0.05)
fgRes = fgRes[ !is.na(match(fgRes$pathway,
c( Up$mainPathways, Down$mainPathways))), ] %>%
arrange(desc(NES))
fgRes$pathway = stringr::str_replace(fgRes$pathway, "GO_" , "")
fgRes$Enrichment = ifelse(fgRes$NES > 0, "Up-regulated", "Down-regulated")
filtRes = rbind(head(fgRes, n = 10),
tail(fgRes, n = 10 ))
g = ggplot(filtRes, aes(reorder(pathway, NES), NES)) +
geom_segment( aes(reorder(pathway, NES), xend=pathway, y=0, yend=NES)) +
geom_point( size=5, aes( fill = Enrichment),
shape=21, stroke=2) +
scale_fill_manual(values = c("Down-regulated" = "dodgerblue",
"Up-regulated" = "firebrick") ) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="GSEA - Biological Processes") +
theme_minimal()
output = list("Results" = fgRes, "Plot" = g)
return(output)
}