|
a |
|
b/downstream_analysis/analyze_rna_atac.R |
|
|
1 |
### |
|
|
2 |
source('./utilities.R') |
|
|
3 |
num_topic=100 |
|
|
4 |
version='rna_atac' |
|
|
5 |
|
|
|
6 |
#read gene by topic matrix, and all gene names |
|
|
7 |
rna_topic=read.csv(paste0('../Result/',version,'/topic_rna.csv'),header=T) |
|
|
8 |
rna_names=rna_topic[,1] |
|
|
9 |
rna_topic=rna_topic[,-1] |
|
|
10 |
rownames(rna_topic)=rna_names |
|
|
11 |
colnames(rna_topic)=paste0('topic',1:num_topic) |
|
|
12 |
|
|
|
13 |
#read protein by topic matrix, and all protein names |
|
|
14 |
atac_topic=read.csv(paste0('../Result/',version,'/topic_atac.csv'),header=T) |
|
|
15 |
peak_names=atac_topic[,1] |
|
|
16 |
atac_topic=atac_topic[,-1] |
|
|
17 |
rownames(atac_topic)=peak_names |
|
|
18 |
colnames(atac_topic)=paste0('topic',1:num_topic) |
|
|
19 |
|
|
|
20 |
##read all gene locations |
|
|
21 |
gene2loci=read.csv('../useful_file/rna_atac/gene2loci2.csv',header=T) |
|
|
22 |
|
|
|
23 |
##split peak name |
|
|
24 |
peak_split=c() |
|
|
25 |
for (i in peak_names){peak_split=rbind(peak_split,strsplit(i,'-')[[1]])} |
|
|
26 |
peak_split=data.frame(peak_split) |
|
|
27 |
colnames(peak_split)=c('chr_name','start','end') |
|
|
28 |
peak_split$start=as.numeric(peak_split$start) |
|
|
29 |
peak_split$end=as.numeric(peak_split$end) |
|
|
30 |
|
|
|
31 |
## |
|
|
32 |
library('GenomicRanges') |
|
|
33 |
peak_gr=GRanges(seqnames = peak_split$chr_name, |
|
|
34 |
ranges = IRanges(peak_split$start, end = peak_split$end)) |
|
|
35 |
rna_gr=GRanges(seqnames = gene2loci$chr_name, |
|
|
36 |
ranges = IRanges(gene2loci$start_position, |
|
|
37 |
end = gene2loci$end_position, |
|
|
38 |
names = gene2loci$gene_name)) |
|
|
39 |
|
|
|
40 |
simu_topic_rna=c() |
|
|
41 |
for (i in 1:100){#take about 1hr |
|
|
42 |
topic=c()#converted topic_i x gene |
|
|
43 |
peak_topic_v=atac_topic[,i]#1 x num_peak |
|
|
44 |
for (j in rna_names){#gene names |
|
|
45 |
d=distance(rna_gr[j],peak_gr) |
|
|
46 |
d=ifelse(d<=150000,1,0)#distance constraint |
|
|
47 |
ind=which(d==1)#index for peak |
|
|
48 |
for (k in ind){#check non-0 peak correlation |
|
|
49 |
corr=cor(as.numeric(rna_topic[j,]),as.numeric(atac_topic[k,])) |
|
|
50 |
if (corr<=0){d[k]=0}#positive constraint |
|
|
51 |
} |
|
|
52 |
d[is.na(d)]=0 |
|
|
53 |
topic=c(topic,peak_topic_v%*%d) |
|
|
54 |
} |
|
|
55 |
simu_topic_rna=rbind(simu_topic_rna,topic) |
|
|
56 |
} |
|
|
57 |
write.csv(simu_topic_rna,file=paste0('../data/',version,'/simu_topic_rna.csv'),row.names = F,quote = F) |
|
|
58 |
|
|
|
59 |
## |
|
|
60 |
#imu_topic_rna=read.csv('../data/rna_atac/v2/simu_topic_rna.csv',header=T) |
|
|
61 |
simu_rna_topic=t(simu_topic_rna) |
|
|
62 |
rownames(simu_rna_topic)=rna_names |
|
|
63 |
colnames(simu_rna_topic)=paste0('topic',1:num_topic) |
|
|
64 |
|
|
|
65 |
### analyze rna_topic and simu_rna_topic |
|
|
66 |
##correlation |
|
|
67 |
plot_corr=c() |
|
|
68 |
for (i in 1:num_topic){ |
|
|
69 |
plot_corr=c(plot_corr,cor(rna_topic[,i],simu_rna_topic[,i])) |
|
|
70 |
} |
|
|
71 |
save_path=paste0('../plot/',version) |
|
|
72 |
write.csv(plot_corr,file=paste0(save_path,'/plot_correlation.csv'),quote=F) |
|
|
73 |
|
|
|
74 |
corr_color=rep(NA,100) |
|
|
75 |
corr_color[plot_corr>=0]='blue' |
|
|
76 |
corr_color[plot_corr<0]='red' |
|
|
77 |
corr_plot=data.frame(x=1:100,y=plot_corr,color=corr_color) |
|
|
78 |
save_name=paste0(save_path,'/gene_atac_corr.png') |
|
|
79 |
plot_correlation(corr_plot,save_name) |
|
|
80 |
|
|
|
81 |
## top features |
|
|
82 |
gene_ids=get_feature_orders(rna_topic,num_topic=100) |
|
|
83 |
simu_gene_ids=get_feature_orders(simu_rna_topic,num_topic=100) |
|
|
84 |
|
|
|
85 |
topic_select=c(3,32,30,83,21,79,50) |
|
|
86 |
|
|
|
87 |
top_feature_num=5 |
|
|
88 |
save_path=paste0('../plot/',version,'/') |
|
|
89 |
|
|
|
90 |
plot_top_feature_in_selected_topic('atac',top_feature_num,simu_gene_ids,simu_rna_topic,topic_select,save_path,'select') |
|
|
91 |
plot_top_feature_in_selected_topic('gene',top_feature_num,gene_ids,rna_topic,topic_select,save_path,'select') |
|
|
92 |
|
|
|
93 |
## save top5 features as csv |
|
|
94 |
top_rna=c() |
|
|
95 |
top_atac=c() |
|
|
96 |
for (i in topic_select){ |
|
|
97 |
top_rna=cbind(top_rna,gene_ids[1:5,i]) |
|
|
98 |
top_atac=cbind(top_atac,simu_gene_ids[1:5,i]) |
|
|
99 |
} |
|
|
100 |
|
|
|
101 |
|
|
|
102 |
##sample x topic heatmap |
|
|
103 |
version='rna_atac' |
|
|
104 |
is_plot=T;is_diff_topic=F |
|
|
105 |
type='rna_atac' |
|
|
106 |
topic_select=',' |
|
|
107 |
is_all=F |
|
|
108 |
args=c(num_topic,version,type,topic_select,is_plot,is_diff_topic,is_all) |
|
|
109 |
system(paste(c('Rscript','./plot_sample_by_topic.R',args),collapse=' ')) |
|
|
110 |
# |
|
|
111 |
is_plot=F;is_diff_topic=T |
|
|
112 |
topic_select='30,78,3,72,84,51,55,92,50,73,39,46,33,59,12,21,79,16,98,32,25,26,38,83,88' |
|
|
113 |
args=c(num_topic,version,type,topic_select,is_plot,is_diff_topic,is_all) |
|
|
114 |
system(paste(c('Rscript','./plot_sample_by_topic.R',args),collapse=' ')) |
|
|
115 |
|
|
|
116 |
|
|
|
117 |
## prepare GSEA |
|
|
118 |
############# prepare GSEA files |
|
|
119 |
#rna_topic |
|
|
120 |
#simu_rna_topic=t(simu_topic_rna) |
|
|
121 |
save_path_g=paste0('../data/',version,'/rna/') |
|
|
122 |
save_path_p=paste0('../data/',version,'/atac/') |
|
|
123 |
|
|
|
124 |
for (i in 1:num_topic){ |
|
|
125 |
value=cbind(rownames(rna_topic),rna_topic[,i]) |
|
|
126 |
write.table(value,file=paste0(save_path_g,'geneName_topic',i,'.rnk'),quote=F,sep='\t',row.names = F,col.names=F) |
|
|
127 |
value=cbind(rownames(simu_rna_topic),simu_rna_topic[,i]) |
|
|
128 |
write.table(value,file=paste0(save_path_p,'geneName_topic',i,'.rnk'),quote=F,sep='\t',row.names = F,col.names=F) |
|
|
129 |
} |
|
|
130 |
|
|
|
131 |
##### prepare MEME input |
|
|
132 |
## top 1000 peaks |
|
|
133 |
library(httr) |
|
|
134 |
library(jsonlite) |
|
|
135 |
library(xml2) |
|
|
136 |
server <- "http://rest.ensembl.org" |
|
|
137 |
|
|
|
138 |
### |
|
|
139 |
topic_select=c(3,32,30,83,21,79,50) |
|
|
140 |
#t=3 |
|
|
141 |
for (to in topic_select){ |
|
|
142 |
peak=atac_topic[,to] |
|
|
143 |
top_peaks=peak_names[order(peak,decreasing=T)[1:100]] |
|
|
144 |
|
|
|
145 |
top_peaks_input=c() |
|
|
146 |
for (i in top_peaks){ |
|
|
147 |
s=strsplit(i,'-')[[1]]#split |
|
|
148 |
t=paste0(s[1],':',s[2],'..',s[3],':1?') |
|
|
149 |
top_peaks_input=c(top_peaks_input,t) |
|
|
150 |
} |
|
|
151 |
|
|
|
152 |
#get sequences |
|
|
153 |
seqs=c() |
|
|
154 |
for (i in top_peaks_input){ |
|
|
155 |
ext=paste0("/sequence/region/human/",i) |
|
|
156 |
r <- GET(paste(server, ext, sep = ""), content_type("text/x-fasta")) |
|
|
157 |
stop_for_status(r) |
|
|
158 |
seqs=c(seqs,content(r)) |
|
|
159 |
} |
|
|
160 |
out_file=file(paste0("../data/rna_atac/seq/seqs100_topic",to,".fasta")) |
|
|
161 |
writeLines(seqs,out_file) |
|
|
162 |
close(out_file) |
|
|
163 |
} |
|
|
164 |
|