Diff of /12-ELMER.R [000000] .. [d2c46b]

Switch to unified view

a b/12-ELMER.R
1
#Installing 
2
# devtools::install_github(repo = "tiagochst/ELMER.data")
3
# devtools::install_github(repo = "tiagochst/ELMER")
4
if (!requireNamespace("BiocManager", quietly=TRUE))
5
  install.packages("BiocManager")
6
BiocManager::install("ELMER")
7
# BiocManager::install(version='devel')
8
# BiocManager::install("TCGAbiolinks")
9
# BiocManager::install("TCGAbiolinks", version = "2.5.9") 
10
# source("https://bioconductor.org/biocLite.R")
11
# biocLite("TCGAbiolinks")
12
library(ELMER)
13
library(MultiAssayExperiment)
14
library(ELMER.data)
15
library(sesameData)
16
library(tibble)
17
library(TCGAbiolinks)
18
#example
19
{
20
  distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
21
  head(distal.probes)
22
  data(LUSC_RNA_refined,package = "ELMER.data")
23
  data(LUSC_meth_refined,package = "ELMER.data")
24
  GeneExp[1:5,1:5]
25
  Meth[1:5,1:5]
26
  mae <- createMAE(exp = GeneExp, 
27
                   met = Meth,
28
                   save = TRUE,
29
                   linearize.exp = TRUE,
30
                   save.filename = "tmp.rda",
31
                   filter.probes = distal.probes,
32
                   met.platform = "450K",
33
                   genome = "hg19",
34
                   TCGA = TRUE)
35
  tmp = as.data.frame(colData(mae)[1:5,])
36
  View(tmp)
37
}
38
#data: RNA and methy
39
{
40
  setwd("E:/workplace/mywork/methy/dbgap/chf/data_chf_contr/early_chf/c1_UMN_JHU/RNA/")
41
  load(file="./Gene.Rdata")
42
  load(file="./exon.Rdata")
43
  exon[1:4,1:4];dim(exon)
44
  Gene[1:4,1:4];dim(Gene)
45
  exon$id <- paste("sample",exon$X,sep="_")
46
  exon <- column_to_rownames(exon,"id")
47
  exon <- exon[,-c(1:2)]
48
  Gene$id <- paste("sample",Gene$X,sep="_")
49
  Gene <- column_to_rownames(Gene,"id")
50
  Gene <- Gene[,-c(1:2)]
51
  
52
  setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/new_champ/")
53
  load("CorrectedBeta.Rdata")
54
  CorrectedBeta[1:4,1:4]
55
  dim(CorrectedBeta)
56
  data_cpg <- read.csv("../champ_rawdata_train.csv")
57
  colnames(CorrectedBeta) <- paste("sample",data_cpg$shareid,sep="_")
58
  
59
  id = setdiff(rownames(exon),colnames(CorrectedBeta))
60
  exon_data = exon[!(rownames(exon) %in% id),]
61
  exon_data <- t(exon_data)
62
  exon_data[1:4,1:4];dim(exon_data)
63
  
64
  Gene_data = Gene[!(rownames(Gene) %in% id),]
65
  Gene_data <- t(Gene_data)
66
  Gene_data[1:4,1:4];dim(Gene_data)
67
  
68
  CorrectedBeta <- subset(CorrectedBeta,select = colnames(exon_data))
69
  dim(CorrectedBeta);dim(exon_data);dim(Gene_data)
70
  CorrectedBeta[1:4,1:4]
71
  exon_data[1:4,1:4]
72
  Gene_data[1:4,1:4]
73
  save(CorrectedBeta,exon_data,Gene_data,file="result/MAE.Rdata")
74
  {
75
    library(stringr) 
76
    exon_data[1:4,1:5]
77
    ids=data.frame(ensembl_id=str_split(rownames(exon_data),'[.]',simplify = T)[,1],
78
                   median=apply(exon_data,1,median))
79
    head(ids)
80
    head(ids$ensembl_id)
81
    library(org.Hs.eg.db)
82
    g2s=unique(toTable(org.Hs.egSYMBOL))
83
    head(g2s)
84
    g2e=unique(toTable(org.Hs.egENSEMBL)) 
85
    head(g2e)
86
    s2e=merge(g2e,g2s,by='gene_id')
87
    head(s2e)
88
    table(ids$ensembl_id %in% s2e$symbol)
89
    # FALSE  TRUE 
90
    # 2866 15448 
91
    ids=ids[ids$ensembl_id %in% s2e$symbol,]
92
    ids$ENSEMBL=s2e[match(ids$ensembl_id,s2e$symbol),2]
93
    length(unique(ids$ENSEMBL))
94
    head(ids) 
95
    ids=ids[order(ids$ENSEMBL,ids$median,decreasing = T),]
96
    ids=ids[!duplicated(ids$ENSEMBL),]
97
    dim(ids) 
98
    exon_data = as.data.frame(exon_data)
99
    exon_data = rownames_to_column(exon_data,"ensembl_id")
100
    exon_data = merge(ids,exon_data,by="ensembl_id")
101
    exon_data = column_to_rownames(exon_data,"ENSEMBL")
102
    exon_data = exon_data[,-c(1,2)]
103
    exon_data[1:4,1:4]  
104
    dim(exon_data)
105
  }
106
  {
107
    library(stringr) 
108
    Gene_data[1:4,1:5]
109
    ids=data.frame(ensembl_id=str_split(rownames(Gene_data),'[.]',simplify = T)[,1],
110
                   median=apply(Gene_data,1,median))
111
    head(ids)
112
    head(ids$ensembl_id)
113
    library(org.Hs.eg.db)
114
    g2s=unique(toTable(org.Hs.egSYMBOL))
115
    head(g2s)
116
    g2e=unique(toTable(org.Hs.egENSEMBL)) 
117
    head(g2e)
118
    s2e=merge(g2e,g2s,by='gene_id')
119
    head(s2e)
120
    table(ids$ensembl_id %in% s2e$symbol)
121
    # FALSE  TRUE 
122
    # 2529 14870 
123
    ids=ids[ids$ensembl_id %in% s2e$symbol,]
124
    ids$ENSEMBL=s2e[match(ids$ensembl_id,s2e$symbol),2]
125
    length(unique(ids$ENSEMBL))
126
    head(ids) 
127
    ids=ids[order(ids$ENSEMBL,ids$median,decreasing = T),]
128
    ids=ids[!duplicated(ids$ENSEMBL),]
129
    dim(ids) 
130
    Gene_data = as.data.frame(Gene_data)
131
    Gene_data = rownames_to_column(Gene_data,"ensembl_id")
132
    Gene_data = merge(ids,Gene_data,by="ensembl_id")
133
    Gene_data = column_to_rownames(Gene_data,"ENSEMBL")
134
    Gene_data = Gene_data[,-c(1,2)]
135
    Gene_data[1:4,1:4]  
136
    dim(Gene_data)
137
  }
138
  dim(Gene_data);dim(exon_data)
139
  save(exon_data,Gene_data,file="result/MAE_addENSEMBL.Rdata")
140
}
141
#creat mea
142
{
143
  load("result/MAE_addENSEMBL.Rdata") #exon_data,Gene_data
144
  distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
145
  assay <- c(rep("DNA methylation", ncol(CorrectedBeta)),
146
             rep("Gene expression", ncol(exon_data)))
147
  primary <- c(colnames(CorrectedBeta),colnames(exon_data))
148
  data_cpg <- read.csv("../champ_rawdata_train.csv")
149
  data_cpg$shareid <- paste("sample",data_cpg$shareid,sep="_")
150
  colname <- data_cpg[colnames(CorrectedBeta) %in% data_cpg$shareid ,"Sample_Group"]
151
  colname <- c(colnames(CorrectedBeta),colnames(exon_data))
152
  sampleMap <- data.frame(assay,primary,colname)
153
  #distal.probes <- get.feature.probe(genome = "hg19",met.platform = "450K")
154
  colData <- data.frame(sample = colnames(CorrectedBeta))
155
  tmp = data_cpg[data_cpg$shareid %in% colData$sample ,]
156
  colData <- merge(colData,tmp[,c(1,11)],by.x="sample",by.y="shareid")
157
  colData = unique(colData)
158
  rownames(colData) <- colData$sample
159
  mae <- createMAE(exp = exon_data, 
160
                   met = CorrectedBeta,
161
                   save = TRUE,
162
                   filter.probes = distal.probes,
163
                   colData = colData,
164
                   #sampleMap = sampleMap,
165
                   linearize.exp = TRUE,
166
                   save.filename = "mae.rda",
167
                   met.platform = "450K",
168
                   genome = "hg19",
169
                   TCGA = FALSE)
170
  save(mae,"result/mae.rda")
171
}
172
#DMP
173
{
174
  mae <- get(load("result/mae.rda"))
175
  sig.diff <- get.diff.meth(data = mae, 
176
                            group.col = "Sample_Group",
177
                            group1 =  "chf",
178
                            group2 = "nochf",
179
                            # if supervised mode set to 1
180
                            mode = "supervised",
181
                            minSubgroupFrac = 1, 
182
                            sig.dif = 0,
183
                            diff.dir = "both", # “hypo” and “hyper” 
184
                            cores = 1, 
185
                            dir.out ="result", 
186
                            save =  FALSE ,
187
                            pvalue = 0.05#0.05#1
188
                            )
189
  head(sig.diff);
190
  dim(sig.diff)#pvalue = 0.05,19;0.5,8537
191
  write.table(sig.diff,"result/getMethdiff.both.probes.csv")#pvalue = 0.5,8537
192
  write.table(sig.diff,"result/getMethdiff.both.probes.significant.csv")#pvalue = 0.05,19
193
  ##"getMethdiff.hypo.probes.csv"            
194
  ##"getMethdiff.hypo.probes.significant.csv"
195
  save(sig.diff,file="result/sig_diff.Rdata")#pvalue = 0.5,8537
196
}
197
#Identifying putative probe-gene pairs
198
{
199
  setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/new_champ/")
200
  mae <- get(load("result/mae.rda"))
201
  #sig.diff <- read.csv("result/getMethdiff.both.probes.significant.csv",sep = " ")
202
  nearGenes <- GetNearGenes(data = mae, 
203
                            probes = sig.diff$probe, 
204
                            numFlankingGenes = 20) # 10 upstream and 10 dowstream genes
205
  save(nearGenes,file="result/nearGenes.Rdata")
206
  Hypo.pair <- get.pair(data = mae,
207
                        group.col = "Sample_Group",
208
                        group1 =  "chf",
209
                        group2 = "nochf",
210
                        nearGenes = nearGenes,
211
                        #minSubgroupFrac = 1,
212
                        mode = "unsupervised",#supervised or unsupervised
213
                        permu.dir = "result/permu",
214
                        #Please set to 100000 to get significant results
215
                        #permu.size = 100000,
216
                        raw.pvalue = 0.05,#1,
217
                        #Please set to 0.001 to get significant results
218
                        Pe = 0.05,#1, 
219
                        cores = 1,
220
                        #See preAssociationProbeFiltering function
221
                        filter.probes = FALSE, 
222
                        # filter.percentage = 0.01,
223
                        # filter.portion = 0.3,
224
                        dir.out = "result",
225
                        label = "both",
226
                        save = FALSE)
227
  dim(Hypo.pair)
228
  write.table(Hypo.pair,"result/getPair.both.all.pairs.statistic5.csv")
229
  write.table(Hypo.pair,"result/getPair.both.all.pairs.statistic.significant.csv")
230
  #"getPair.hypo.all.pairs.statistic.csv"                  
231
  #"getPair.hypo.pairs.significant.csv"                    
232
  #"getPair.hypo.pairs.statistic.with.empirical.pvalue.csv"
233
}
234
#motif
235
{
236
  # Load results from previous sections
237
  mae <- get(load("result/mae.rda"))
238
  Hypo.pair1 = read.csv("result/getPair.both.all.pairs.statistic1.csv",sep=" ")
239
  Hypo.pair2 = read.csv("result/getPair.both.all.pairs.statistic2.csv",sep=" ")
240
  Hypo.pair3 = read.csv("result/getPair.both.all.pairs.statistic3.csv",sep=" ")
241
  Hypo.pair4 = read.csv("result/getPair.both.all.pairs.statistic4.csv",sep=" ")
242
  Hypo.pair5 = read.csv("result/getPair.both.all.pairs.statistic5.csv",sep=" ")
243
  pair <- rbind(rbind(rbind(rbind(Hypo.pair1,Hypo.pair2),Hypo.pair3),Hypo.pair4),Hypo.pair5)
244
  head(pair) # significantly hypomethylated probes with putative target genes
245
  pair = Hypo.pair
246
  # Identify enriched motif for significantly hypomethylated probes which 
247
  # have putative target genes.
248
  enriched.motif <- get.enriched.motif(data = mae,
249
                                       #probes = pair$Probe, 
250
                                       probes = sig.diff$probe, 
251
                                       dir.out = "result", 
252
                                       label = "both",
253
                                       min.incidence = 0,
254
                                       lower.OR = 1,
255
                                       save = FALSE)
256
  names(enriched.motif)
257
  head(enriched.motif[names(enriched.motif)[1]]) ## probes in the given set that have the first motif.
258
  write.table(enriched.motif$FOSL2_HUMAN.H11MO.0.A,"result/getMotif.both.motif.enrichment.txt")
259
  save(enriched.motif,file="result/getMotif.both.enriched.motifs.rda")
260
  #"getMotif.hypo.enriched.motifs.rda"  "getMotif.both.motif.enrichment.csv" "motif.enrichment.pdf") 
261
}
262
#TF
263
{
264
  # Load results from previous sections
265
  mae <- get(load("mae.rda"))
266
  load("result/getMotif.both.enriched.motifs12.rda")
267
  TF <- get.TFs(data = mae, 
268
                group.col = "Sample_Group",
269
                group1 =  "chf",
270
                group2 = "nochf",
271
                mode = "unsupervised",
272
                enriched.motif = enriched.motif,
273
                dir.out = "result", 
274
                cores = 1, 
275
                label = "both",
276
                save =  FALSE )
277
  ##"getTF.hypo.TFs.with.motif.pvalue.rda"             
278
  ##"getTF.hypo.significant.TFs.with.motif.summary.csv"
279
  write.table(TF,"result/getTF.both.significant.TFs.with.motif.summary.csv")
280
  save(TF,file="result/getTF.both.TFs.with.motif.pvalue.rda")
281
}
282
#Scatter plots
283
{
284
  mae <- get(load("mae.rda"))
285
  load("result/getMotif.hypo.enriched.motifs.rda")
286
  scatter.plot(data = mae,
287
               byProbe = list(probe = c("cg27401945"), numFlankingGenes = 20), 
288
               category = "Sample_Group", 
289
               lm = TRUE, # Draw linear regression curve
290
               save = TRUE) 
291
  scatter.plot(data = mae,
292
               byPair = list(probe = c("cg27401945"), gene = c("ENSG00000148704")), 
293
               category = "Sample_Group", save = TRUE, lm_line = TRUE) 
294
  
295
  load("result/getMotif.hypo.enriched.motifs.rda")
296
  names(enriched.motif)[1]
297
  scatter.plot(data = mae,
298
               byTF = list(TF = c("VAX1","DIP2C"),
299
                           probe = enriched.motif[[names(enriched.motif)[2]]]), 
300
               category = "Sample_Group",
301
               save = TRUE, 
302
               lm_line = TRUE)
303
}
304
#XX 
305
{
306
  # Load results from previous sections
307
  mae <- get(load("mae.rda"))
308
  #pair <- read.csv("result/getPair.hypo.pairs.significant.csv")
309
  schematic.plot(pair = pair, 
310
                 data = mae,
311
                 group.col = "Sample_Group",
312
                 byProbe = pair$Probe[1],
313
                 save = FALSE)
314
  schematic.plot(pair = pair, 
315
                 data = mae,   
316
                 group.col = "Sample_Group", 
317
                 byGene = pair$GeneID[1],
318
                 save = FALSE)
319
}
320
#XX motif
321
{
322
  motif.enrichment.plot(motif.enrichment = enriched.motif,
323
                        significant = list(OR = 1.5,lowerOR = 1.3),
324
                        label = "both",
325
                        save = FALSE)
326
  motif.enrichment.plot(motif.enrichment = "result/getMotif.hypo.motif.enrichment.csv",
327
                        significant = list(OR = 1.5,lowerOR = 1.3),
328
                        label = "hypo",
329
                        summary = TRUE,
330
                        save = FALSE)  
331
}
332
#TF
333
{
334
  load("result/getTF.hypo.TFs.with.motif.pvalue.rda")
335
  motif <- colnames(TF.meth.cor)[1]
336
  TF.rank.plot(motif.pvalue = TF.meth.cor,
337
               motif = motif,
338
               save = FALSE)
339
}
340
#XX heatmap
341
{
342
  # Load results from previous sections
343
  mae <- get(load("mae.rda"))
344
  pair <- read.csv("result/getPair.hypo.pairs.significant.csv")
345
  heatmapPairs(data = mae, 
346
               group.col = "Sample_Group",
347
               group1 = "Chf", 
348
               #annotation.col = c("years_smoked","gender"),
349
               group2 = "Nochf",
350
               pairs = pair,
351
               filename =  NULL)
352
}
353
354
{
355
  library(plyr)
356
  A<-strsplit(as.character(names(enriched.motif)), "_")
357
  tmp2 <- mapply( cbind,  A )
358
  df0 <- ldply (tmp2[1,], data.frame)
359
  #median level of methylation estimated from all distal probes
360
  length(enriched.motif$FOSL2_HUMAN.H11MO.0.A)
361
  FOSL2 = enriched.motif$FOSL2_HUMAN.H11MO.0.A
362
  FOSL1 = enriched.motif$FOSL1_HUMAN.H11MO.0.A
363
}