|
a |
|
b/common_scripts/pathway_analysis/functions.GSEA.R |
|
|
1 |
|
|
|
2 |
wrapper.GSEA=function(i, logicalVectors, data, cls.vector, GENESETS, datatype=NULL, clsname="", WD=getwd(), OUTDIR=getwd()){ |
|
|
3 |
if(is.null(datatype)){stop("specify data type (B, N)")} |
|
|
4 |
|
|
|
5 |
# filter matrix and cls.vector |
|
|
6 |
data=data[,logicalVectors[[i]]&!is.na(logicalVectors)] |
|
|
7 |
cls.vector=cls.vector[logicalVectors[[i]]&!is.na(logicalVectors)] |
|
|
8 |
dataname=names(logicalVectors)[i] |
|
|
9 |
|
|
|
10 |
command=run.GSEA(data, cls.vector, GENESETS, datatype, dataname, clsname, WD, OUTDIR) |
|
|
11 |
} |
|
|
12 |
|
|
|
13 |
# function to go through |
|
|
14 |
run.GSEA=function(data, cls.vector, GENESETS, datatype=NULL, dataname="", clsname="", WD=getwd(), OUTDIR=getwd(), GSEA_home="~/gsea-3.0.jar"){ |
|
|
15 |
|
|
|
16 |
clsname=gsub("^_|_$", "", gsub("[[:punct:]]", "_", clsname)) # no special allowed here |
|
|
17 |
|
|
|
18 |
if(is.null(datatype)){stop("specify data type (B, N)")} |
|
|
19 |
|
|
|
20 |
# PATHS, create if needed |
|
|
21 |
if(!dir.exists(WD)){dir.create(WD, recursive = T); cat("Working directory made", sep="\n\n")} |
|
|
22 |
if(!dir.exists(OUTDIR)){dir.create(OUTDIR, recursive = T); cat("Output directory made", sep="\n\n")} |
|
|
23 |
setwd(WD) |
|
|
24 |
|
|
|
25 |
if(datatype=="N"){ |
|
|
26 |
data=data[,order(as.numeric(cls.vector), decreasing=T)] |
|
|
27 |
cls.vector=cls.vector[order(as.numeric(cls.vector), decreasing=T)] |
|
|
28 |
CLUSTER_COMPARISON="continuous_phenotype" |
|
|
29 |
} |
|
|
30 |
if(datatype=="B"){ |
|
|
31 |
data=data[,order(as.numeric(cls.vector), decreasing=T)] |
|
|
32 |
cls.vector=cls.vector[order(as.numeric(cls.vector), decreasing=T)] |
|
|
33 |
CLUSTER_COMPARISON=paste(unique(cls.vector), collapse="_versus_") |
|
|
34 |
} |
|
|
35 |
|
|
|
36 |
print("data sorted based on cls vector") |
|
|
37 |
#*********************** Need gct? *************************** |
|
|
38 |
|
|
|
39 |
NAME_OUT2=paste(dataname, gsub("-", "_", clsname), sep="_") |
|
|
40 |
GEXP=paste(WD, "/", NAME_OUT2, ".gct", sep="") |
|
|
41 |
|
|
|
42 |
print("making gct file") |
|
|
43 |
|
|
|
44 |
gsea.write.gct(data, GEXP) |
|
|
45 |
|
|
|
46 |
print("gct ready") |
|
|
47 |
|
|
|
48 |
#****************************** genesets *********************************** |
|
|
49 |
if(is.list(GENESETS)){ |
|
|
50 |
writeGMT(GENESETS, "geneset_temp.gmt") |
|
|
51 |
print("gmt made from named list") |
|
|
52 |
GENESETS="geneset_temp.gmt" |
|
|
53 |
} |
|
|
54 |
# no need to do anything |
|
|
55 |
|
|
|
56 |
#************************************************************************************* |
|
|
57 |
|
|
|
58 |
if(datatype=="N"){ |
|
|
59 |
print("this is numeric feature class, making class file") |
|
|
60 |
|
|
|
61 |
A1=paste("#numeric", sep = "\t") |
|
|
62 |
A2=paste("#feat", sep = "\t") |
|
|
63 |
A3=t(data.frame(as.character(cls.vector))) |
|
|
64 |
CLS=paste0(OUTDIR,"/",dataname, "_", CLUSTER_COMPARISON, "_", gsub("-", "_", clsname), ".cls") |
|
|
65 |
write(A1, file = CLS, append = F) # All .cls files require this dummy header line. |
|
|
66 |
write(A2, file = CLS, append = T) |
|
|
67 |
write.table(A3, file = CLS, append = T, quote = F, sep = "\t", |
|
|
68 |
na = "", row.names = F, col.names = F) |
|
|
69 |
PARAMS=" -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute sample -rnd_type no_balance -scoring_scheme weighted -metric Pearson -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 25 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 5 -zip_report false -gui false" |
|
|
70 |
} |
|
|
71 |
|
|
|
72 |
if(datatype=="B"){ |
|
|
73 |
print("this is binary/two group feature class") |
|
|
74 |
|
|
|
75 |
# make vector with labels |
|
|
76 |
cls.vector=cls.vector[!is.na(cls.vector), drop=F] |
|
|
77 |
|
|
|
78 |
A1=paste(length(cls.vector), 2, 1, sep = "\t") |
|
|
79 |
A2=paste("#", "1", "0", sep = "\t") |
|
|
80 |
A3=t(data.frame(as.character(cls.vector))) |
|
|
81 |
CLS=paste0(OUTDIR,"/", dataname, "_", CLUSTER_COMPARISON, "_", gsub("-", "_", clsname), "_temp.cls") |
|
|
82 |
write(A1, file = CLS, append = F) # All .cls files require this dummy header line. |
|
|
83 |
write(A2, file = CLS, append = T) |
|
|
84 |
write.table(A3, file = CLS, append = T, quote = F, sep = "\t", |
|
|
85 |
na = "", row.names = F, col.names = F) |
|
|
86 |
|
|
|
87 |
PARAMS=" -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute sample -rnd_type no_balance -scoring_scheme weighted -metric Signal2Noise -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 25 -rnd_seed timestamp -save_rnd_lists false -zip_report false -gui false" |
|
|
88 |
|
|
|
89 |
} |
|
|
90 |
|
|
|
91 |
RUN=paste0("java -Xmx5000m -cp ", GSEA_home, " xtools.gsea.Gsea -res ", GEXP, |
|
|
92 |
" -cls ", CLS, |
|
|
93 |
" -gmx ", GENESETS, |
|
|
94 |
" -rpt_label ", NAME_OUT2, "_", CLUSTER_COMPARISON, |
|
|
95 |
" -out ", OUTDIR, " ", PARAMS) |
|
|
96 |
} |
|
|
97 |
|
|
|
98 |
gsea.write.gct <- function(exprMat, gctFn){ |
|
|
99 |
nGenes = nrow(exprMat) |
|
|
100 |
nConds = ncol(exprMat) |
|
|
101 |
write("#1.2", file = gctFn, append = F) # All .gct files require this dummy header line. |
|
|
102 |
write(paste(nGenes, nConds, sep = "\t"), file = gctFn, append = T) |
|
|
103 |
write(paste("Name", "Description", paste(colnames(exprMat), collapse = "\t"), sep = "\t"), file = gctFn, append = T) |
|
|
104 |
# The second column of the .gct file, "Description", is filled out with "na"'s. |
|
|
105 |
rownames(exprMat) = paste(rownames(exprMat), "na", sep = "\t") # Append "\tna" to every gene clsname. |
|
|
106 |
write.table(exprMat, file = gctFn, append = T, quote = F, sep = "\t", |
|
|
107 |
na = "", row.names = T, col.names = F) |
|
|
108 |
} |
|
|
109 |
|
|
|
110 |
writeGMT <- function #Create a gmt (gene matrix transposed) file |
|
|
111 |
### Createss a gmt (gene matrix transposed) file such as those |
|
|
112 |
### provided by mSigDB or geneSigDB, from an R list object. |
|
|
113 |
### Function by Levi Waldron. |
|
|
114 |
(object, |
|
|
115 |
### R list object that will be converted to GMT file. Each element |
|
|
116 |
### should contain a vector of gene names, and the names of the |
|
|
117 |
### elements will used for the gene set names |
|
|
118 |
fname |
|
|
119 |
### Output file name for .gmt file |
|
|
120 |
){ |
|
|
121 |
if (class(object) != "list") stop("object should be of class 'list'") |
|
|
122 |
if(file.exists(fname)) unlink(fname) |
|
|
123 |
for (iElement in 1:length(object)){ |
|
|
124 |
write.table(t(c(make.names(rep(names(object)[iElement],2)),object[[iElement]])), |
|
|
125 |
sep="\t",quote=FALSE, |
|
|
126 |
file=fname,append=TRUE,col.names=FALSE,row.names=FALSE) |
|
|
127 |
} |
|
|
128 |
### Called for the effect of writing a .gmt file |
|
|
129 |
} |
|
|
130 |
|
|
|
131 |
|
|
|
132 |
GSEA = function(gene_list, genesets, pval) { |
|
|
133 |
set.seed(54321) |
|
|
134 |
library(dplyr) |
|
|
135 |
library(gage) |
|
|
136 |
library(fgsea) |
|
|
137 |
|
|
|
138 |
if ( any( duplicated(names(gene_list)) ) ) { |
|
|
139 |
warning("Duplicates in gene names") |
|
|
140 |
gene_list = gene_list[!duplicated(names(gene_list))] |
|
|
141 |
} |
|
|
142 |
if ( !all( order(gene_list, decreasing = TRUE) == 1:length(gene_list)) ){ |
|
|
143 |
warning("Gene list not sorted") |
|
|
144 |
gene_list = sort(gene_list, decreasing = TRUE) |
|
|
145 |
} |
|
|
146 |
|
|
|
147 |
# if not list: |
|
|
148 |
if(is.character(genesets)){ |
|
|
149 |
genesets = fgsea::gmtPathways(genesets) |
|
|
150 |
} |
|
|
151 |
|
|
|
152 |
fgRes <- fgsea::fgsea(pathways = genesets, |
|
|
153 |
stats = gene_list, |
|
|
154 |
minSize=15, |
|
|
155 |
maxSize=600, |
|
|
156 |
nperm=10000) %>% |
|
|
157 |
as.data.frame() %>% |
|
|
158 |
dplyr::filter(padj < !!pval) |
|
|
159 |
#print(dim(fgRes)) |
|
|
160 |
|
|
|
161 |
## Filter FGSEA by using gage results. Must be significant and in same direction to keep |
|
|
162 |
gaRes = gage::gage(gene_list, gsets=genesets, same.dir=TRUE, set.size =c(15,600)) |
|
|
163 |
|
|
|
164 |
ups = as.data.frame(gaRes$greater) %>% |
|
|
165 |
tibble::rownames_to_column("Pathway") %>% |
|
|
166 |
dplyr::filter(!is.na(p.geomean) & q.val < pval ) %>% |
|
|
167 |
dplyr::select("Pathway") |
|
|
168 |
|
|
|
169 |
downs = as.data.frame(gaRes$less) %>% |
|
|
170 |
tibble::rownames_to_column("Pathway") %>% |
|
|
171 |
dplyr::filter(!is.na(p.geomean) & q.val < pval ) %>% |
|
|
172 |
dplyr::select("Pathway") |
|
|
173 |
|
|
|
174 |
#print(dim(rbind(ups,downs))) |
|
|
175 |
keepups = fgRes[fgRes$NES > 0 & !is.na(match(fgRes$pathway, ups$Pathway)), ] |
|
|
176 |
keepdowns = fgRes[fgRes$NES < 0 & !is.na(match(fgRes$pathway, downs$Pathway)), ] |
|
|
177 |
|
|
|
178 |
### Collapse redundant pathways |
|
|
179 |
Up = fgsea::collapsePathways(keepups, pathways = genesets, stats = gene_list, nperm = 500, pval.threshold = 0.05) |
|
|
180 |
Down = fgsea::collapsePathways(keepdowns, genesets, gene_list, nperm = 500, pval.threshold = 0.05) |
|
|
181 |
|
|
|
182 |
fgRes = fgRes[ !is.na(match(fgRes$pathway, |
|
|
183 |
c( Up$mainPathways, Down$mainPathways))), ] %>% |
|
|
184 |
arrange(desc(NES)) |
|
|
185 |
fgRes$pathway = stringr::str_replace(fgRes$pathway, "GO_" , "") |
|
|
186 |
|
|
|
187 |
fgRes$Enrichment = ifelse(fgRes$NES > 0, "Up-regulated", "Down-regulated") |
|
|
188 |
filtRes = rbind(head(fgRes, n = 10), |
|
|
189 |
tail(fgRes, n = 10 )) |
|
|
190 |
g = ggplot(filtRes, aes(reorder(pathway, NES), NES)) + |
|
|
191 |
geom_segment( aes(reorder(pathway, NES), xend=pathway, y=0, yend=NES)) + |
|
|
192 |
geom_point( size=5, aes( fill = Enrichment), |
|
|
193 |
shape=21, stroke=2) + |
|
|
194 |
scale_fill_manual(values = c("Down-regulated" = "dodgerblue", |
|
|
195 |
"Up-regulated" = "firebrick") ) + |
|
|
196 |
coord_flip() + |
|
|
197 |
labs(x="Pathway", y="Normalized Enrichment Score", |
|
|
198 |
title="GSEA - Biological Processes") + |
|
|
199 |
theme_minimal() |
|
|
200 |
|
|
|
201 |
output = list("Results" = fgRes, "Plot" = g) |
|
|
202 |
return(output) |
|
|
203 |
} |