|
a |
|
b/12-ELMER.R |
|
|
1 |
#Installing |
|
|
2 |
# devtools::install_github(repo = "tiagochst/ELMER.data") |
|
|
3 |
# devtools::install_github(repo = "tiagochst/ELMER") |
|
|
4 |
if (!requireNamespace("BiocManager", quietly=TRUE)) |
|
|
5 |
install.packages("BiocManager") |
|
|
6 |
BiocManager::install("ELMER") |
|
|
7 |
# BiocManager::install(version='devel') |
|
|
8 |
# BiocManager::install("TCGAbiolinks") |
|
|
9 |
# BiocManager::install("TCGAbiolinks", version = "2.5.9") |
|
|
10 |
# source("https://bioconductor.org/biocLite.R") |
|
|
11 |
# biocLite("TCGAbiolinks") |
|
|
12 |
library(ELMER) |
|
|
13 |
library(MultiAssayExperiment) |
|
|
14 |
library(ELMER.data) |
|
|
15 |
library(sesameData) |
|
|
16 |
library(tibble) |
|
|
17 |
library(TCGAbiolinks) |
|
|
18 |
#example |
|
|
19 |
{ |
|
|
20 |
distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K") |
|
|
21 |
head(distal.probes) |
|
|
22 |
data(LUSC_RNA_refined,package = "ELMER.data") |
|
|
23 |
data(LUSC_meth_refined,package = "ELMER.data") |
|
|
24 |
GeneExp[1:5,1:5] |
|
|
25 |
Meth[1:5,1:5] |
|
|
26 |
mae <- createMAE(exp = GeneExp, |
|
|
27 |
met = Meth, |
|
|
28 |
save = TRUE, |
|
|
29 |
linearize.exp = TRUE, |
|
|
30 |
save.filename = "tmp.rda", |
|
|
31 |
filter.probes = distal.probes, |
|
|
32 |
met.platform = "450K", |
|
|
33 |
genome = "hg19", |
|
|
34 |
TCGA = TRUE) |
|
|
35 |
tmp = as.data.frame(colData(mae)[1:5,]) |
|
|
36 |
View(tmp) |
|
|
37 |
} |
|
|
38 |
#data: RNA and methy |
|
|
39 |
{ |
|
|
40 |
setwd("E:/workplace/mywork/methy/dbgap/chf/data_chf_contr/early_chf/c1_UMN_JHU/RNA/") |
|
|
41 |
load(file="./Gene.Rdata") |
|
|
42 |
load(file="./exon.Rdata") |
|
|
43 |
exon[1:4,1:4];dim(exon) |
|
|
44 |
Gene[1:4,1:4];dim(Gene) |
|
|
45 |
exon$id <- paste("sample",exon$X,sep="_") |
|
|
46 |
exon <- column_to_rownames(exon,"id") |
|
|
47 |
exon <- exon[,-c(1:2)] |
|
|
48 |
Gene$id <- paste("sample",Gene$X,sep="_") |
|
|
49 |
Gene <- column_to_rownames(Gene,"id") |
|
|
50 |
Gene <- Gene[,-c(1:2)] |
|
|
51 |
|
|
|
52 |
setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/new_champ/") |
|
|
53 |
load("CorrectedBeta.Rdata") |
|
|
54 |
CorrectedBeta[1:4,1:4] |
|
|
55 |
dim(CorrectedBeta) |
|
|
56 |
data_cpg <- read.csv("../champ_rawdata_train.csv") |
|
|
57 |
colnames(CorrectedBeta) <- paste("sample",data_cpg$shareid,sep="_") |
|
|
58 |
|
|
|
59 |
id = setdiff(rownames(exon),colnames(CorrectedBeta)) |
|
|
60 |
exon_data = exon[!(rownames(exon) %in% id),] |
|
|
61 |
exon_data <- t(exon_data) |
|
|
62 |
exon_data[1:4,1:4];dim(exon_data) |
|
|
63 |
|
|
|
64 |
Gene_data = Gene[!(rownames(Gene) %in% id),] |
|
|
65 |
Gene_data <- t(Gene_data) |
|
|
66 |
Gene_data[1:4,1:4];dim(Gene_data) |
|
|
67 |
|
|
|
68 |
CorrectedBeta <- subset(CorrectedBeta,select = colnames(exon_data)) |
|
|
69 |
dim(CorrectedBeta);dim(exon_data);dim(Gene_data) |
|
|
70 |
CorrectedBeta[1:4,1:4] |
|
|
71 |
exon_data[1:4,1:4] |
|
|
72 |
Gene_data[1:4,1:4] |
|
|
73 |
save(CorrectedBeta,exon_data,Gene_data,file="result/MAE.Rdata") |
|
|
74 |
{ |
|
|
75 |
library(stringr) |
|
|
76 |
exon_data[1:4,1:5] |
|
|
77 |
ids=data.frame(ensembl_id=str_split(rownames(exon_data),'[.]',simplify = T)[,1], |
|
|
78 |
median=apply(exon_data,1,median)) |
|
|
79 |
head(ids) |
|
|
80 |
head(ids$ensembl_id) |
|
|
81 |
library(org.Hs.eg.db) |
|
|
82 |
g2s=unique(toTable(org.Hs.egSYMBOL)) |
|
|
83 |
head(g2s) |
|
|
84 |
g2e=unique(toTable(org.Hs.egENSEMBL)) |
|
|
85 |
head(g2e) |
|
|
86 |
s2e=merge(g2e,g2s,by='gene_id') |
|
|
87 |
head(s2e) |
|
|
88 |
table(ids$ensembl_id %in% s2e$symbol) |
|
|
89 |
# FALSE TRUE |
|
|
90 |
# 2866 15448 |
|
|
91 |
ids=ids[ids$ensembl_id %in% s2e$symbol,] |
|
|
92 |
ids$ENSEMBL=s2e[match(ids$ensembl_id,s2e$symbol),2] |
|
|
93 |
length(unique(ids$ENSEMBL)) |
|
|
94 |
head(ids) |
|
|
95 |
ids=ids[order(ids$ENSEMBL,ids$median,decreasing = T),] |
|
|
96 |
ids=ids[!duplicated(ids$ENSEMBL),] |
|
|
97 |
dim(ids) |
|
|
98 |
exon_data = as.data.frame(exon_data) |
|
|
99 |
exon_data = rownames_to_column(exon_data,"ensembl_id") |
|
|
100 |
exon_data = merge(ids,exon_data,by="ensembl_id") |
|
|
101 |
exon_data = column_to_rownames(exon_data,"ENSEMBL") |
|
|
102 |
exon_data = exon_data[,-c(1,2)] |
|
|
103 |
exon_data[1:4,1:4] |
|
|
104 |
dim(exon_data) |
|
|
105 |
} |
|
|
106 |
{ |
|
|
107 |
library(stringr) |
|
|
108 |
Gene_data[1:4,1:5] |
|
|
109 |
ids=data.frame(ensembl_id=str_split(rownames(Gene_data),'[.]',simplify = T)[,1], |
|
|
110 |
median=apply(Gene_data,1,median)) |
|
|
111 |
head(ids) |
|
|
112 |
head(ids$ensembl_id) |
|
|
113 |
library(org.Hs.eg.db) |
|
|
114 |
g2s=unique(toTable(org.Hs.egSYMBOL)) |
|
|
115 |
head(g2s) |
|
|
116 |
g2e=unique(toTable(org.Hs.egENSEMBL)) |
|
|
117 |
head(g2e) |
|
|
118 |
s2e=merge(g2e,g2s,by='gene_id') |
|
|
119 |
head(s2e) |
|
|
120 |
table(ids$ensembl_id %in% s2e$symbol) |
|
|
121 |
# FALSE TRUE |
|
|
122 |
# 2529 14870 |
|
|
123 |
ids=ids[ids$ensembl_id %in% s2e$symbol,] |
|
|
124 |
ids$ENSEMBL=s2e[match(ids$ensembl_id,s2e$symbol),2] |
|
|
125 |
length(unique(ids$ENSEMBL)) |
|
|
126 |
head(ids) |
|
|
127 |
ids=ids[order(ids$ENSEMBL,ids$median,decreasing = T),] |
|
|
128 |
ids=ids[!duplicated(ids$ENSEMBL),] |
|
|
129 |
dim(ids) |
|
|
130 |
Gene_data = as.data.frame(Gene_data) |
|
|
131 |
Gene_data = rownames_to_column(Gene_data,"ensembl_id") |
|
|
132 |
Gene_data = merge(ids,Gene_data,by="ensembl_id") |
|
|
133 |
Gene_data = column_to_rownames(Gene_data,"ENSEMBL") |
|
|
134 |
Gene_data = Gene_data[,-c(1,2)] |
|
|
135 |
Gene_data[1:4,1:4] |
|
|
136 |
dim(Gene_data) |
|
|
137 |
} |
|
|
138 |
dim(Gene_data);dim(exon_data) |
|
|
139 |
save(exon_data,Gene_data,file="result/MAE_addENSEMBL.Rdata") |
|
|
140 |
} |
|
|
141 |
#creat mea |
|
|
142 |
{ |
|
|
143 |
load("result/MAE_addENSEMBL.Rdata") #exon_data,Gene_data |
|
|
144 |
distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K") |
|
|
145 |
assay <- c(rep("DNA methylation", ncol(CorrectedBeta)), |
|
|
146 |
rep("Gene expression", ncol(exon_data))) |
|
|
147 |
primary <- c(colnames(CorrectedBeta),colnames(exon_data)) |
|
|
148 |
data_cpg <- read.csv("../champ_rawdata_train.csv") |
|
|
149 |
data_cpg$shareid <- paste("sample",data_cpg$shareid,sep="_") |
|
|
150 |
colname <- data_cpg[colnames(CorrectedBeta) %in% data_cpg$shareid ,"Sample_Group"] |
|
|
151 |
colname <- c(colnames(CorrectedBeta),colnames(exon_data)) |
|
|
152 |
sampleMap <- data.frame(assay,primary,colname) |
|
|
153 |
#distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K") |
|
|
154 |
colData <- data.frame(sample = colnames(CorrectedBeta)) |
|
|
155 |
tmp = data_cpg[data_cpg$shareid %in% colData$sample ,] |
|
|
156 |
colData <- merge(colData,tmp[,c(1,11)],by.x="sample",by.y="shareid") |
|
|
157 |
colData = unique(colData) |
|
|
158 |
rownames(colData) <- colData$sample |
|
|
159 |
mae <- createMAE(exp = exon_data, |
|
|
160 |
met = CorrectedBeta, |
|
|
161 |
save = TRUE, |
|
|
162 |
filter.probes = distal.probes, |
|
|
163 |
colData = colData, |
|
|
164 |
#sampleMap = sampleMap, |
|
|
165 |
linearize.exp = TRUE, |
|
|
166 |
save.filename = "mae.rda", |
|
|
167 |
met.platform = "450K", |
|
|
168 |
genome = "hg19", |
|
|
169 |
TCGA = FALSE) |
|
|
170 |
save(mae,"result/mae.rda") |
|
|
171 |
} |
|
|
172 |
#DMP |
|
|
173 |
{ |
|
|
174 |
mae <- get(load("result/mae.rda")) |
|
|
175 |
sig.diff <- get.diff.meth(data = mae, |
|
|
176 |
group.col = "Sample_Group", |
|
|
177 |
group1 = "chf", |
|
|
178 |
group2 = "nochf", |
|
|
179 |
# if supervised mode set to 1 |
|
|
180 |
mode = "supervised", |
|
|
181 |
minSubgroupFrac = 1, |
|
|
182 |
sig.dif = 0, |
|
|
183 |
diff.dir = "both", # “hypo” and “hyper” |
|
|
184 |
cores = 1, |
|
|
185 |
dir.out ="result", |
|
|
186 |
save = FALSE , |
|
|
187 |
pvalue = 0.05#0.05#1 |
|
|
188 |
) |
|
|
189 |
head(sig.diff); |
|
|
190 |
dim(sig.diff)#pvalue = 0.05,19;0.5,8537 |
|
|
191 |
write.table(sig.diff,"result/getMethdiff.both.probes.csv")#pvalue = 0.5,8537 |
|
|
192 |
write.table(sig.diff,"result/getMethdiff.both.probes.significant.csv")#pvalue = 0.05,19 |
|
|
193 |
##"getMethdiff.hypo.probes.csv" |
|
|
194 |
##"getMethdiff.hypo.probes.significant.csv" |
|
|
195 |
save(sig.diff,file="result/sig_diff.Rdata")#pvalue = 0.5,8537 |
|
|
196 |
} |
|
|
197 |
#Identifying putative probe-gene pairs |
|
|
198 |
{ |
|
|
199 |
setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/new_champ/") |
|
|
200 |
mae <- get(load("result/mae.rda")) |
|
|
201 |
#sig.diff <- read.csv("result/getMethdiff.both.probes.significant.csv",sep = " ") |
|
|
202 |
nearGenes <- GetNearGenes(data = mae, |
|
|
203 |
probes = sig.diff$probe, |
|
|
204 |
numFlankingGenes = 20) # 10 upstream and 10 dowstream genes |
|
|
205 |
save(nearGenes,file="result/nearGenes.Rdata") |
|
|
206 |
Hypo.pair <- get.pair(data = mae, |
|
|
207 |
group.col = "Sample_Group", |
|
|
208 |
group1 = "chf", |
|
|
209 |
group2 = "nochf", |
|
|
210 |
nearGenes = nearGenes, |
|
|
211 |
#minSubgroupFrac = 1, |
|
|
212 |
mode = "unsupervised",#supervised or unsupervised |
|
|
213 |
permu.dir = "result/permu", |
|
|
214 |
#Please set to 100000 to get significant results |
|
|
215 |
#permu.size = 100000, |
|
|
216 |
raw.pvalue = 0.05,#1, |
|
|
217 |
#Please set to 0.001 to get significant results |
|
|
218 |
Pe = 0.05,#1, |
|
|
219 |
cores = 1, |
|
|
220 |
#See preAssociationProbeFiltering function |
|
|
221 |
filter.probes = FALSE, |
|
|
222 |
# filter.percentage = 0.01, |
|
|
223 |
# filter.portion = 0.3, |
|
|
224 |
dir.out = "result", |
|
|
225 |
label = "both", |
|
|
226 |
save = FALSE) |
|
|
227 |
dim(Hypo.pair) |
|
|
228 |
write.table(Hypo.pair,"result/getPair.both.all.pairs.statistic5.csv") |
|
|
229 |
write.table(Hypo.pair,"result/getPair.both.all.pairs.statistic.significant.csv") |
|
|
230 |
#"getPair.hypo.all.pairs.statistic.csv" |
|
|
231 |
#"getPair.hypo.pairs.significant.csv" |
|
|
232 |
#"getPair.hypo.pairs.statistic.with.empirical.pvalue.csv" |
|
|
233 |
} |
|
|
234 |
#motif |
|
|
235 |
{ |
|
|
236 |
# Load results from previous sections |
|
|
237 |
mae <- get(load("result/mae.rda")) |
|
|
238 |
Hypo.pair1 = read.csv("result/getPair.both.all.pairs.statistic1.csv",sep=" ") |
|
|
239 |
Hypo.pair2 = read.csv("result/getPair.both.all.pairs.statistic2.csv",sep=" ") |
|
|
240 |
Hypo.pair3 = read.csv("result/getPair.both.all.pairs.statistic3.csv",sep=" ") |
|
|
241 |
Hypo.pair4 = read.csv("result/getPair.both.all.pairs.statistic4.csv",sep=" ") |
|
|
242 |
Hypo.pair5 = read.csv("result/getPair.both.all.pairs.statistic5.csv",sep=" ") |
|
|
243 |
pair <- rbind(rbind(rbind(rbind(Hypo.pair1,Hypo.pair2),Hypo.pair3),Hypo.pair4),Hypo.pair5) |
|
|
244 |
head(pair) # significantly hypomethylated probes with putative target genes |
|
|
245 |
pair = Hypo.pair |
|
|
246 |
# Identify enriched motif for significantly hypomethylated probes which |
|
|
247 |
# have putative target genes. |
|
|
248 |
enriched.motif <- get.enriched.motif(data = mae, |
|
|
249 |
#probes = pair$Probe, |
|
|
250 |
probes = sig.diff$probe, |
|
|
251 |
dir.out = "result", |
|
|
252 |
label = "both", |
|
|
253 |
min.incidence = 0, |
|
|
254 |
lower.OR = 1, |
|
|
255 |
save = FALSE) |
|
|
256 |
names(enriched.motif) |
|
|
257 |
head(enriched.motif[names(enriched.motif)[1]]) ## probes in the given set that have the first motif. |
|
|
258 |
write.table(enriched.motif$FOSL2_HUMAN.H11MO.0.A,"result/getMotif.both.motif.enrichment.txt") |
|
|
259 |
save(enriched.motif,file="result/getMotif.both.enriched.motifs.rda") |
|
|
260 |
#"getMotif.hypo.enriched.motifs.rda" "getMotif.both.motif.enrichment.csv" "motif.enrichment.pdf") |
|
|
261 |
} |
|
|
262 |
#TF |
|
|
263 |
{ |
|
|
264 |
# Load results from previous sections |
|
|
265 |
mae <- get(load("mae.rda")) |
|
|
266 |
load("result/getMotif.both.enriched.motifs12.rda") |
|
|
267 |
TF <- get.TFs(data = mae, |
|
|
268 |
group.col = "Sample_Group", |
|
|
269 |
group1 = "chf", |
|
|
270 |
group2 = "nochf", |
|
|
271 |
mode = "unsupervised", |
|
|
272 |
enriched.motif = enriched.motif, |
|
|
273 |
dir.out = "result", |
|
|
274 |
cores = 1, |
|
|
275 |
label = "both", |
|
|
276 |
save = FALSE ) |
|
|
277 |
##"getTF.hypo.TFs.with.motif.pvalue.rda" |
|
|
278 |
##"getTF.hypo.significant.TFs.with.motif.summary.csv" |
|
|
279 |
write.table(TF,"result/getTF.both.significant.TFs.with.motif.summary.csv") |
|
|
280 |
save(TF,file="result/getTF.both.TFs.with.motif.pvalue.rda") |
|
|
281 |
} |
|
|
282 |
#Scatter plots |
|
|
283 |
{ |
|
|
284 |
mae <- get(load("mae.rda")) |
|
|
285 |
load("result/getMotif.hypo.enriched.motifs.rda") |
|
|
286 |
scatter.plot(data = mae, |
|
|
287 |
byProbe = list(probe = c("cg27401945"), numFlankingGenes = 20), |
|
|
288 |
category = "Sample_Group", |
|
|
289 |
lm = TRUE, # Draw linear regression curve |
|
|
290 |
save = TRUE) |
|
|
291 |
scatter.plot(data = mae, |
|
|
292 |
byPair = list(probe = c("cg27401945"), gene = c("ENSG00000148704")), |
|
|
293 |
category = "Sample_Group", save = TRUE, lm_line = TRUE) |
|
|
294 |
|
|
|
295 |
load("result/getMotif.hypo.enriched.motifs.rda") |
|
|
296 |
names(enriched.motif)[1] |
|
|
297 |
scatter.plot(data = mae, |
|
|
298 |
byTF = list(TF = c("VAX1","DIP2C"), |
|
|
299 |
probe = enriched.motif[[names(enriched.motif)[2]]]), |
|
|
300 |
category = "Sample_Group", |
|
|
301 |
save = TRUE, |
|
|
302 |
lm_line = TRUE) |
|
|
303 |
} |
|
|
304 |
#XX |
|
|
305 |
{ |
|
|
306 |
# Load results from previous sections |
|
|
307 |
mae <- get(load("mae.rda")) |
|
|
308 |
#pair <- read.csv("result/getPair.hypo.pairs.significant.csv") |
|
|
309 |
schematic.plot(pair = pair, |
|
|
310 |
data = mae, |
|
|
311 |
group.col = "Sample_Group", |
|
|
312 |
byProbe = pair$Probe[1], |
|
|
313 |
save = FALSE) |
|
|
314 |
schematic.plot(pair = pair, |
|
|
315 |
data = mae, |
|
|
316 |
group.col = "Sample_Group", |
|
|
317 |
byGene = pair$GeneID[1], |
|
|
318 |
save = FALSE) |
|
|
319 |
} |
|
|
320 |
#XX motif |
|
|
321 |
{ |
|
|
322 |
motif.enrichment.plot(motif.enrichment = enriched.motif, |
|
|
323 |
significant = list(OR = 1.5,lowerOR = 1.3), |
|
|
324 |
label = "both", |
|
|
325 |
save = FALSE) |
|
|
326 |
motif.enrichment.plot(motif.enrichment = "result/getMotif.hypo.motif.enrichment.csv", |
|
|
327 |
significant = list(OR = 1.5,lowerOR = 1.3), |
|
|
328 |
label = "hypo", |
|
|
329 |
summary = TRUE, |
|
|
330 |
save = FALSE) |
|
|
331 |
} |
|
|
332 |
#TF |
|
|
333 |
{ |
|
|
334 |
load("result/getTF.hypo.TFs.with.motif.pvalue.rda") |
|
|
335 |
motif <- colnames(TF.meth.cor)[1] |
|
|
336 |
TF.rank.plot(motif.pvalue = TF.meth.cor, |
|
|
337 |
motif = motif, |
|
|
338 |
save = FALSE) |
|
|
339 |
} |
|
|
340 |
#XX heatmap |
|
|
341 |
{ |
|
|
342 |
# Load results from previous sections |
|
|
343 |
mae <- get(load("mae.rda")) |
|
|
344 |
pair <- read.csv("result/getPair.hypo.pairs.significant.csv") |
|
|
345 |
heatmapPairs(data = mae, |
|
|
346 |
group.col = "Sample_Group", |
|
|
347 |
group1 = "Chf", |
|
|
348 |
#annotation.col = c("years_smoked","gender"), |
|
|
349 |
group2 = "Nochf", |
|
|
350 |
pairs = pair, |
|
|
351 |
filename = NULL) |
|
|
352 |
} |
|
|
353 |
|
|
|
354 |
{ |
|
|
355 |
library(plyr) |
|
|
356 |
A<-strsplit(as.character(names(enriched.motif)), "_") |
|
|
357 |
tmp2 <- mapply( cbind, A ) |
|
|
358 |
df0 <- ldply (tmp2[1,], data.frame) |
|
|
359 |
#median level of methylation estimated from all distal probes |
|
|
360 |
length(enriched.motif$FOSL2_HUMAN.H11MO.0.A) |
|
|
361 |
FOSL2 = enriched.motif$FOSL2_HUMAN.H11MO.0.A |
|
|
362 |
FOSL1 = enriched.motif$FOSL1_HUMAN.H11MO.0.A |
|
|
363 |
} |