a b/downstream_analysis/analyze_rna_atac.R
1
###
2
source('./utilities.R')
3
num_topic=100
4
version='rna_atac'
5
6
#read gene by topic matrix, and all gene names
7
rna_topic=read.csv(paste0('../Result/',version,'/topic_rna.csv'),header=T)
8
rna_names=rna_topic[,1]
9
rna_topic=rna_topic[,-1]
10
rownames(rna_topic)=rna_names
11
colnames(rna_topic)=paste0('topic',1:num_topic)
12
13
#read protein by topic matrix, and all protein names
14
atac_topic=read.csv(paste0('../Result/',version,'/topic_atac.csv'),header=T)
15
peak_names=atac_topic[,1]
16
atac_topic=atac_topic[,-1]
17
rownames(atac_topic)=peak_names
18
colnames(atac_topic)=paste0('topic',1:num_topic)
19
20
##read all gene locations
21
gene2loci=read.csv('../useful_file/rna_atac/gene2loci2.csv',header=T)
22
23
##split peak name
24
peak_split=c()
25
for (i in peak_names){peak_split=rbind(peak_split,strsplit(i,'-')[[1]])}
26
peak_split=data.frame(peak_split)
27
colnames(peak_split)=c('chr_name','start','end')
28
peak_split$start=as.numeric(peak_split$start)
29
peak_split$end=as.numeric(peak_split$end)
30
31
##
32
library('GenomicRanges')
33
peak_gr=GRanges(seqnames = peak_split$chr_name,
34
                ranges = IRanges(peak_split$start, end = peak_split$end))
35
rna_gr=GRanges(seqnames = gene2loci$chr_name,
36
               ranges = IRanges(gene2loci$start_position, 
37
                                end = gene2loci$end_position,
38
                                names = gene2loci$gene_name))
39
40
simu_topic_rna=c()
41
for (i in 1:100){#take about 1hr
42
  topic=c()#converted topic_i x gene
43
  peak_topic_v=atac_topic[,i]#1 x num_peak
44
  for (j in rna_names){#gene names
45
    d=distance(rna_gr[j],peak_gr)
46
    d=ifelse(d<=150000,1,0)#distance constraint
47
    ind=which(d==1)#index for peak
48
    for (k in ind){#check non-0 peak correlation
49
      corr=cor(as.numeric(rna_topic[j,]),as.numeric(atac_topic[k,]))
50
      if (corr<=0){d[k]=0}#positive constraint
51
    }
52
    d[is.na(d)]=0
53
    topic=c(topic,peak_topic_v%*%d) 
54
  }
55
  simu_topic_rna=rbind(simu_topic_rna,topic)
56
}
57
write.csv(simu_topic_rna,file=paste0('../data/',version,'/simu_topic_rna.csv'),row.names = F,quote = F)
58
59
##
60
#imu_topic_rna=read.csv('../data/rna_atac/v2/simu_topic_rna.csv',header=T)
61
simu_rna_topic=t(simu_topic_rna)
62
rownames(simu_rna_topic)=rna_names
63
colnames(simu_rna_topic)=paste0('topic',1:num_topic)
64
65
### analyze rna_topic and simu_rna_topic
66
##correlation
67
plot_corr=c()
68
for (i in 1:num_topic){
69
  plot_corr=c(plot_corr,cor(rna_topic[,i],simu_rna_topic[,i]))
70
}
71
save_path=paste0('../plot/',version)
72
write.csv(plot_corr,file=paste0(save_path,'/plot_correlation.csv'),quote=F)
73
74
corr_color=rep(NA,100)
75
corr_color[plot_corr>=0]='blue'
76
corr_color[plot_corr<0]='red'
77
corr_plot=data.frame(x=1:100,y=plot_corr,color=corr_color)
78
save_name=paste0(save_path,'/gene_atac_corr.png')
79
plot_correlation(corr_plot,save_name)
80
81
## top features
82
gene_ids=get_feature_orders(rna_topic,num_topic=100)
83
simu_gene_ids=get_feature_orders(simu_rna_topic,num_topic=100)
84
85
topic_select=c(3,32,30,83,21,79,50)
86
87
top_feature_num=5
88
save_path=paste0('../plot/',version,'/')
89
90
plot_top_feature_in_selected_topic('atac',top_feature_num,simu_gene_ids,simu_rna_topic,topic_select,save_path,'select')
91
plot_top_feature_in_selected_topic('gene',top_feature_num,gene_ids,rna_topic,topic_select,save_path,'select')
92
93
## save top5 features as csv
94
top_rna=c()
95
top_atac=c()
96
for (i in topic_select){
97
  top_rna=cbind(top_rna,gene_ids[1:5,i])
98
  top_atac=cbind(top_atac,simu_gene_ids[1:5,i])
99
}
100
101
102
##sample x topic heatmap
103
version='rna_atac'
104
is_plot=T;is_diff_topic=F
105
type='rna_atac'
106
topic_select=','
107
is_all=F
108
args=c(num_topic,version,type,topic_select,is_plot,is_diff_topic,is_all)
109
system(paste(c('Rscript','./plot_sample_by_topic.R',args),collapse=' '))
110
#
111
is_plot=F;is_diff_topic=T
112
topic_select='30,78,3,72,84,51,55,92,50,73,39,46,33,59,12,21,79,16,98,32,25,26,38,83,88'
113
args=c(num_topic,version,type,topic_select,is_plot,is_diff_topic,is_all)
114
system(paste(c('Rscript','./plot_sample_by_topic.R',args),collapse=' '))
115
116
117
## prepare GSEA
118
############# prepare GSEA files
119
#rna_topic
120
#simu_rna_topic=t(simu_topic_rna)
121
save_path_g=paste0('../data/',version,'/rna/')
122
save_path_p=paste0('../data/',version,'/atac/')
123
124
for (i in 1:num_topic){
125
  value=cbind(rownames(rna_topic),rna_topic[,i])
126
  write.table(value,file=paste0(save_path_g,'geneName_topic',i,'.rnk'),quote=F,sep='\t',row.names = F,col.names=F)
127
  value=cbind(rownames(simu_rna_topic),simu_rna_topic[,i])
128
  write.table(value,file=paste0(save_path_p,'geneName_topic',i,'.rnk'),quote=F,sep='\t',row.names = F,col.names=F)
129
}
130
131
##### prepare MEME input
132
## top 1000 peaks
133
library(httr)
134
library(jsonlite)
135
library(xml2)
136
server <- "http://rest.ensembl.org"
137
138
###
139
topic_select=c(3,32,30,83,21,79,50)
140
#t=3
141
for (to in topic_select){
142
  peak=atac_topic[,to]
143
  top_peaks=peak_names[order(peak,decreasing=T)[1:100]]
144
  
145
  top_peaks_input=c()
146
  for (i in top_peaks){
147
    s=strsplit(i,'-')[[1]]#split
148
    t=paste0(s[1],':',s[2],'..',s[3],':1?')
149
    top_peaks_input=c(top_peaks_input,t)
150
  }
151
  
152
  #get sequences
153
  seqs=c()
154
  for (i in top_peaks_input){
155
    ext=paste0("/sequence/region/human/",i)
156
    r <- GET(paste(server, ext, sep = ""), content_type("text/x-fasta"))
157
    stop_for_status(r)
158
    seqs=c(seqs,content(r))
159
  }
160
  out_file=file(paste0("../data/rna_atac/seq/seqs100_topic",to,".fasta"))
161
  writeLines(seqs,out_file)
162
  close(out_file)
163
}
164