|
a |
|
b/Figures/Figure3.R |
|
|
1 |
|
|
|
2 |
################################################ |
|
|
3 |
# File Name:Figure3.R |
|
|
4 |
# Author: Baoyan Bai |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
################################################# |
|
|
8 |
|
|
|
9 |
####Integrate three different mutation files: |
|
|
10 |
#1. original mutation calling file from Sigve |
|
|
11 |
# this files has been filtered by Ole-Christian'script and by Jillian's TCGA PON |
|
|
12 |
#2. raw vcf files from TSD. Many mutations in serial biopsies were in fact called, but rejcted for different reasons. |
|
|
13 |
# I started to work this. Noticed some of mutations were still not called. But if looked at coverage, variant allele can be detected |
|
|
14 |
|
|
|
15 |
#3. Pyclone result. Some mutations are missing due to CN=0 by ascat. I tried to recovery those mutations |
|
|
16 |
# |
|
|
17 |
# two mutations targeting on TBL1XR1 in P46 have wrong AF by IGV inspection. Those mutations were also manually added |
|
|
18 |
|
|
|
19 |
#######Aim of |
|
|
20 |
|
|
|
21 |
#######1. calcuate which mutation is clonal, subclonal |
|
|
22 |
#######2. Check whether's possible to detect the variants in control DNA and which group of mutations can be detected in blood DNAs |
|
|
23 |
|
|
|
24 |
####data.frame empty.raw: raw mutation file. |
|
|
25 |
####this file: from raw VCF files. add additional columns to indicate the mutation is shared by all biopsy for a given patient. |
|
|
26 |
|
|
|
27 |
####data.frame empty pyclone result |
|
|
28 |
###flcombined imported from 16_combined/fl42_combined_updated010719.txt the latest mutation file, original file from Sigve |
|
|
29 |
|
|
|
30 |
library(plyr) |
|
|
31 |
library(dplyr) |
|
|
32 |
library(ggplot2) |
|
|
33 |
|
|
|
34 |
|
|
|
35 |
setwd("G:/FL_resequncing/FL_exome_final/fl_latest/11_pyclone/BED_bam/counts_new/CPC_codingPlusUTR_pyclone/All_codingPlusUTR/All_codingPlusUTR_i1") |
|
|
36 |
flcombined<- read.csv(file="../fl_latest/16_combined/fl44_reseq_final_060619.txt", sep="\t",header=T, stringsAsFactors = F) |
|
|
37 |
|
|
|
38 |
mutation<- unique(flcombined%>%select(Patient, Gene, SYMBOL,VAR_ID, CDS_CHANGE,HGVSp_short,Region, |
|
|
39 |
VARIANT,VARIANT_CLASS)) |
|
|
40 |
mutation$chr<- as.character(lapply(strsplit(as.character(mutation$VAR_ID), "\\_"),"[",1)) |
|
|
41 |
mutation$star<- as.character(lapply(strsplit(as.character(mutation$VAR_ID), "\\_"),"[",2)) |
|
|
42 |
mutation$mutation_id<- paste(mutation$chr, mutation$star,sep="_") |
|
|
43 |
|
|
|
44 |
|
|
|
45 |
###process JP6 seperately because JP6 does not have raw VCF file |
|
|
46 |
JP6_anno<- mutation%>%filter(Patient=="P6B") |
|
|
47 |
#########annotate pyclone result |
|
|
48 |
pyclone.anno<- merge(empty, mutation, by=c("Patient", "mutation_id")) |
|
|
49 |
|
|
|
50 |
###raw vcf df |
|
|
51 |
###mutation calling |
|
|
52 |
empty.raw$mutation_id<- paste(empty.raw$V1, empty.raw$V2,sep="_") |
|
|
53 |
|
|
|
54 |
raw.vcf<- empty.raw%>%select(Patient,SAMPLE,mutation_id, VAR_ID, V7,V8, num_biopsy,sharedbyAll ) |
|
|
55 |
|
|
|
56 |
raw.vcf.anno<- merge(raw.vcf, mutation, by=c("Patient", "VAR_ID")) |
|
|
57 |
|
|
|
58 |
##extract only coding |
|
|
59 |
raw.vcf.anno.coding<- raw.vcf.anno%>%filter((Region=="Coding")| (VARIANT=="3'UTR") | (VARIANT=="5'UTR")) |
|
|
60 |
|
|
|
61 |
######### |
|
|
62 |
fl32<- subset(raw.vcf.anno.coding, Patient %in% unique(pyclone.anno$ Patient)) |
|
|
63 |
names(fl32)[names(fl32)=="mutation_id.x"]<- "mutation_id" |
|
|
64 |
|
|
|
65 |
pyclone.df<- pyclone.anno%>%select(SAMPLE, mutation_id, cellular_prevalence,variant_allele_frequency, mean, class) |
|
|
66 |
|
|
|
67 |
|
|
|
68 |
fl32.pyclone<- merge(fl32, pyclone.df, by=c("SAMPLE", "mutation_id"), all.x = T) |
|
|
69 |
|
|
|
70 |
fl32.pyclone.removed<- fl32.pyclone%>%filter(is.na(cellular_prevalence)) |
|
|
71 |
missed_gene<- c("TAS1R3","ATAD3C","MMP23B","MORN1","HES2","TRAV8-6","SNRPA1","DLL4","VPS39","PAQR5","CEMIP","AMELY","ANHX","ATAD3C","B3GALT6","BRCA2","CBLB","CEMIP","DLL4","ESPN","FAM46A","HES2","IL5RA","IL9R","LCE1E","LEPROTL1","MAP1A","MMP23B","MORN1","PAQR5", |
|
|
72 |
"PPIAL4G","RAB7A","RORC","SNRPA1","TAS1R3","TNFRSF14","TRAV8-6","TRMT10A","TSPY1","USF3","USP9Y","VPREB1","VPS39","ZBTB20","ZNF268") |
|
|
73 |
|
|
|
74 |
fl32.missed<- subset(fl32.pyclone.removed, SYMBOL %in%missed_gene) |
|
|
75 |
fl32.ig<- setdiff(fl32.pyclone.removed, fl32.missed) |
|
|
76 |
|
|
|
77 |
#######have to manually add clonal informations for missed genes |
|
|
78 |
write.table(fl32.missed,file="clonal_gene/fl32.missed_genes.txt",sep="\t",row.names=F, quote=F ) |
|
|
79 |
########read manually revised genes |
|
|
80 |
fl32.missed<- read.table(file="clonal_gene/fl32.missed_genes_manulrevised.txt",sep="\t", quote="",header=T,stringsAsFactors = F) |
|
|
81 |
|
|
|
82 |
|
|
|
83 |
fl32.pyclone.kept<- fl32.pyclone%>%filter(!is.na(cellular_prevalence)) |
|
|
84 |
|
|
|
85 |
fl32.pyclone.new<- rbind(fl32.pyclone.kept, fl32.missed) |
|
|
86 |
|
|
|
87 |
########defining mutation order: early dominant, dominant, subclonal |
|
|
88 |
|
|
|
89 |
|
|
|
90 |
### |
|
|
91 |
biopsy_empty_new<- fl32.pyclone.new[-FALSE,] |
|
|
92 |
for (i in unique(fl32.pyclone.new$SAMPLE)){ |
|
|
93 |
|
|
|
94 |
biopsy<- subset(fl32.pyclone.new, SAMPLE==i) |
|
|
95 |
biopsy_specific<- biopsy%>%filter(variant_allele_frequency>0) |
|
|
96 |
biopsy_specific$clonal<- ifelse(biopsy_specific$cellular_prevalence>=0.5 | biopsy_specific$mean>=0.5, "dominant", "subclonal") |
|
|
97 |
biopsy_empty_new<- rbind(biopsy_empty_new, biopsy_specific) |
|
|
98 |
|
|
|
99 |
} |
|
|
100 |
|
|
|
101 |
|
|
|
102 |
removed<- fl32.pyclone.new%>%filter(variant_allele_frequency==0) |
|
|
103 |
##remove"TBL1XR1", discard_rescue_TBL1XR1 |
|
|
104 |
|
|
|
105 |
recover<- subset(removed, class=="discard_rescue_TBL1XR1") |
|
|
106 |
recover$clonal<- "dominant" |
|
|
107 |
|
|
|
108 |
fl32.missed$clonal<- fl32.missed$class |
|
|
109 |
|
|
|
110 |
###### |
|
|
111 |
fl32.all<- rbind( biopsy_empty_new, recover,fl32.missed) |
|
|
112 |
|
|
|
113 |
mutation_af<- ddply(fl32.all, .(mutation_id, Patient), summarise, min_CF=min(cellular_prevalence)) |
|
|
114 |
|
|
|
115 |
fl32.all.af<- merge(fl32.all, mutation_af,by=c("Patient","mutation_id")) |
|
|
116 |
|
|
|
117 |
|
|
|
118 |
fl32.all.af$clonal[fl32.all.af$min_CF>0.5 & fl32.all.af$sharedbyAll=="Y" ]<-"early_dominant" |
|
|
119 |
|
|
|
120 |
###corrected on April 5th 2019 |
|
|
121 |
fl32.all.af$clonal[fl32.all.af$class=="CPC_dominant_lost_LOH" & fl32.all.af$clonal=="dominant"]<-"early_dominant" |
|
|
122 |
fl32.all.af$clonal[fl32.all.af$class=="CPC_dominant"]<- "early_dominant" |
|
|
123 |
|
|
|
124 |
|
|
|
125 |
fl32.final<- fl32.all.af[,-c(16,17,18)] |
|
|
126 |
##########process JP6 ,see previous |
|
|
127 |
|
|
|
128 |
JP6_final<- JP6_df[,-c(4,5,7,9,11,13,14,15,16,17,18,19)] |
|
|
129 |
|
|
|
130 |
JP6_final<- ddply(JP6_final, .(mutation_id), mutate, min_CF=min(cellular_prevalence)) |
|
|
131 |
fl32.all.final<-rbind(fl32.final,JP6_final) |
|
|
132 |
|
|
|
133 |
########## |
|
|
134 |
write.table(fl32.all.final, file="clonal_gene/fl32.all.coding_040319.txt",sep="\t", row.names=F,quote=F) |
|
|
135 |
write.table(fl32.all.final, file="clonal_gene/fl32.all.coding_040519.txt",sep="\t", row.names=F,quote=F) |
|
|
136 |
|
|
|
137 |
|
|
|
138 |
####plotting |
|
|
139 |
fl32<- read.table(file="clonal_gene/fl32.all.coding_040519.txt",sep="\t",header=T,quote="",stringsAsFactors = F) |
|
|
140 |
fl32.dominant<- fl32%>%filter(clonal !="subclonal") |
|
|
141 |
fl32.dominant.ns<- fl32.dominant%>%filter(Region=="Coding" & VARIANT!="Silent")%>%filter(class!="discard") |
|
|
142 |
|
|
|
143 |
fl32.dominant.ns.df<- unique(fl32.dominant.ns%>%select(Patient, SYMBOL,clonal)) |
|
|
144 |
|
|
|
145 |
fl32.dominant.ns.df.count<- ddply(fl32.dominant.ns.df, .(SYMBOL),mutate, num_pat=length(unique(Patient))) |
|
|
146 |
|
|
|
147 |
clinic<- read.xlsx(file="../../../../../../2_clinic_infor/FL_Baoyan_clinical_file_FINAL_020719 - survival.xlsx",sheetIndex = 1) |
|
|
148 |
clinic$group<- ifelse(is.na(clinic$Date_transf), "nFL", "tFL") |
|
|
149 |
clinic.reduced<- clinic%>%select(Patient_ID, group, POD24) |
|
|
150 |
names(clinic.reduced)<- c("Patient", "group", "POD24") |
|
|
151 |
|
|
|
152 |
fl32.dominant.clinic<- merge(fl32.dominant.ns.df.count,clinic.reduced,by="Patient", all.x=T ) |
|
|
153 |
######### |
|
|
154 |
discard_genes<- c("TTN","MUC16","MUC12") |
|
|
155 |
|
|
|
156 |
fl32.dominant.clinic.clean<- subset(fl32.dominant.clinic, !SYMBOL %in% discard_genes) |
|
|
157 |
#######taking genes mutated in at least 4 cases |
|
|
158 |
|
|
|
159 |
|
|
|
160 |
dominant_4cases<- fl32.dominant.clinic.clean%>%filter(num_pat>=4) |
|
|
161 |
dominant_4cases$SYMBOL<- reorder(dominant_4cases$SYMBOL, dominant_4cases$num_pat) |
|
|
162 |
|
|
|
163 |
|
|
|
164 |
dominant_4cases$SYMBOL_ordered <- with(dominant_4cases, reorder(SYMBOL, SYMBOL, function(x) -length(x))) |
|
|
165 |
####sepearte nFL and tFL |
|
|
166 |
|
|
|
167 |
dominant_4cases.nFL<- dominant_4cases%>%filter(group=="nFL") |
|
|
168 |
|
|
|
169 |
tmp.nFL<-ddply(dominant_4cases.nFL, .(Patient,SYMBOL), mutate, status=length(Patient)) |
|
|
170 |
tmp.nFL$clonal_correct<-tmp.nFL$clonal |
|
|
171 |
tmp.nFL$clonal_correct[tmp.nFL$status=="2"]<-"Both" |
|
|
172 |
|
|
|
173 |
nFL_plot<- unique(tmp.nFL[, -c(3)]) |
|
|
174 |
nFL_plot$clonal_correct[nFL_plot$clonal_correct=="dominant"]<- "Late" |
|
|
175 |
nFL_plot$clonal_correct[nFL_plot$clonal_correct=="early_dominant"]<- "CPC" |
|
|
176 |
nFL_plot$clonal_correct<-factor(nFL_plot$clonal_correct, levels=c("Both","Late","CPC")) |
|
|
177 |
|
|
|
178 |
|
|
|
179 |
nFL<-ggplot(nFL_plot, aes(x=SYMBOL,fill=clonal_correct,drop=FALSE))+geom_bar(width=0.5) + |
|
|
180 |
scale_fill_manual(values=c("dark blue","pink","purple" ))+theme_bw()+coord_flip()+ |
|
|
181 |
theme(legend.position = "none", |
|
|
182 |
axis.text.y=element_blank(), |
|
|
183 |
axis.ticks.y = element_blank(), |
|
|
184 |
axis.text.x = element_text(size = 18,face="bold"), |
|
|
185 |
legend.text=element_text(size=8,face="bold"), |
|
|
186 |
legend.title = element_text(size=8, face="bold"), |
|
|
187 |
plot.margin = unit(c(0.2,0,0.2,0.2), "cm"), |
|
|
188 |
panel.grid.major = element_blank(), panel.grid.minor = element_blank(), |
|
|
189 |
panel.border= element_rect(color="blue",size=2))+ |
|
|
190 |
scale_x_discrete(drop=FALSE)+ |
|
|
191 |
scale_y_continuous(breaks = c(0,5,10,15))+xlab("")+ylab("") |
|
|
192 |
|
|
|
193 |
|
|
|
194 |
|
|
|
195 |
############separate tFL group |
|
|
196 |
|
|
|
197 |
dominant_4cases.tFL<- dominant_4cases%>%filter(group=="tFL") |
|
|
198 |
tmp.tFL<-ddply(dominant_4cases.tFL, .(Patient,SYMBOL), mutate, status=length(Patient)) |
|
|
199 |
tmp.tFL$clonal_correct<-tmp.tFL$clonal |
|
|
200 |
tmp.tFL$clonal_correct[tmp.tFL$status=="2"]<-"Both" |
|
|
201 |
|
|
|
202 |
tFL_plot<- unique(tmp.tFL[, -c(3)]) |
|
|
203 |
tFL_plot$clonal_correct[tFL_plot$clonal_correct=="dominant"]<- "Late" |
|
|
204 |
tFL_plot$clonal_correct[tFL_plot$clonal_correct=="early_dominant"]<- "CPC" |
|
|
205 |
tFL_plot$clonal_correct<-factor(tFL_plot$clonal_correct, levels=c("Both","Late","CPC")) |
|
|
206 |
|
|
|
207 |
tFL<-ggplot(tFL_plot, aes(x=SYMBOL,fill=clonal_correct))+geom_bar(width=0.5) + |
|
|
208 |
scale_fill_manual(values=c("dark blue","pink","purple" ))+coord_flip()+theme_bw()+ |
|
|
209 |
theme(legend.position = "none", |
|
|
210 |
axis.text.y=element_blank(), |
|
|
211 |
axis.ticks.y = element_blank(), |
|
|
212 |
axis.text.x = element_text(size = 18,face="bold"), |
|
|
213 |
legend.text=element_text(size=8,face="bold"), |
|
|
214 |
legend.title = element_text(size=8, face="bold"), |
|
|
215 |
plot.margin = unit(c(0.2,0.2,0.2,-0.1), "cm"), |
|
|
216 |
panel.grid.major = element_blank(), panel.grid.minor = element_blank(), |
|
|
217 |
panel.border=element_rect(color="red", size=2))+xlab("")+ylab("")+ |
|
|
218 |
scale_x_discrete(drop=FALSE)+scale_y_continuous(breaks = c(0,5,10,15)) |
|
|
219 |
|
|
|
220 |
pdf(file="CPC_plots/24genes_freq_groups_20191203.pdf",width=4, height=6) |
|
|
221 |
plot_grid(nFL, |
|
|
222 |
tFL, align="h" |
|
|
223 |
|
|
|
224 |
, nrow = 1,rel_widths=c(1,1)) |
|
|
225 |
dev.off() |
|
|
226 |
#######try to plot them together, waterfall plot, frequency plot, clinic data |
|
|
227 |
all<- rbind(nFL_plot, tFL_plot) |
|
|
228 |
|
|
|
229 |
patient_order_nFL<- unique(fl32.dominant.clinic.clean$Patient[fl32.dominant.clinic.clean$group=="nFL"]) |
|
|
230 |
|
|
|
231 |
patient_order_tFL<- unique(fl32.dominant.clinic.clean$Patient[fl32.dominant.clinic.clean$group=="tFL"]) |
|
|
232 |
|
|
|
233 |
all$Patient<- factor(all$Patient, levels=c(patient_order_nFL,patient_order_tFL)) |
|
|
234 |
all$clonal_correct<- factor(all$clonal_correct,levels=c("Both", "CPC", "Late", "NA")) |
|
|
235 |
all.tmp<- all%>%select(Patient, SYMBOL, clonal_correct) |
|
|
236 |
all.tmp.cast<- cast(all.tmp, Patient~SYMBOL) |
|
|
237 |
|
|
|
238 |
all.melt.again<- melt(all.tmp.cast, id.vars="Patient") |
|
|
239 |
all.melt.again$value<- factor(all.melt.again$value, levels=c("Both", "CPC", "Late")) |
|
|
240 |
all.plot<- ggplot(all.melt.again, aes(x=Patient, y=SYMBOL, fill=value))+ |
|
|
241 |
geom_tile(color="gray", size=0.1)+ scale_fill_manual(values=c("dark blue","purple","pink"),drop=FALSE)+scale_x_discrete(drop=FALSE)+ |
|
|
242 |
theme(panel.background = element_blank(), |
|
|
243 |
axis.text.x=element_blank(), |
|
|
244 |
axis.ticks.x=element_blank(), |
|
|
245 |
axis.text.y =element_text(size=15,face="bold"), |
|
|
246 |
legend.position = "NULL", |
|
|
247 |
plot.margin = unit(c(0.5,-0.5,-0.5,0.5), "cm"))+ xlab("")+ylab("") |
|
|
248 |
|
|
|
249 |
clinic.32<-unique(fl32.dominant.clinic.clean%>%select(Patient, group, POD24)) |
|
|
250 |
|
|
|
251 |
names(clinic.32)<- c("Patient", "Group", "POD24") |
|
|
252 |
clinic.32$POD24<- factor(clinic.32$POD24, levels=c(0,1)) |
|
|
253 |
clinic.32$Patient<- factor(clinic.32$Patient, levels=c(patient_order_nFL,patient_order_tFL)) |
|
|
254 |
clinic.32.melt<- melt(clinic.32, id.vars = "Patient") |
|
|
255 |
|
|
|
256 |
clinic_plot<- ggplot(clinic.32.melt, aes(x=Patient, y=variable, fill=value))+ |
|
|
257 |
geom_tile()+ |
|
|
258 |
scale_fill_manual(values=c("blue", "red", "grey","black"))+ |
|
|
259 |
|
|
|
260 |
theme(panel.background = element_blank(), |
|
|
261 |
axis.text.x = element_blank(), |
|
|
262 |
axis.text.y =element_text(size=16,face="bold"), |
|
|
263 |
axis.ticks.x=element_blank(), |
|
|
264 |
legend.position = "none", |
|
|
265 |
plot.margin = unit(c(-0.1,0.5,-0.5,0.5), "cm"))+ |
|
|
266 |
xlab("")+ylab("") |
|
|
267 |
|
|
|
268 |
library(egg) |
|
|
269 |
|
|
|
270 |
|
|
|
271 |
|
|
|
272 |
####legend |
|
|
273 |
tFL_plot$clonal_correct<- factor(tFL_plot$clonal_correct, levels=c("Both", "CPC", "Late")) |
|
|
274 |
pdf(file="CPC_plots/Figure4B_20191112_legend.pdf",width=12,height=13) |
|
|
275 |
ggplot(tFL_plot, aes(x=SYMBOL,fill=clonal_correct))+geom_bar(width=0.5) + |
|
|
276 |
scale_fill_manual(values=c("dark blue","purple","pink"))+coord_flip()+theme_bw()+ |
|
|
277 |
theme( |
|
|
278 |
axis.text.y=element_blank(), |
|
|
279 |
axis.ticks.y = element_blank(), |
|
|
280 |
axis.text.x = element_text(size = 18,face="bold"), |
|
|
281 |
legend.text=element_text(size=14,face="bold"), |
|
|
282 |
legend.title = element_text(size=16, face="bold"), |
|
|
283 |
plot.margin = unit(c(0.2,0.2,0.2,-0.2), "cm"), |
|
|
284 |
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+xlab("")+ylab("")+labs(fill = "Timing")+ |
|
|
285 |
scale_x_discrete(drop=FALSE)+panel_border(color="red", size=2)+scale_y_continuous(breaks = c(0,5,10,15)) |
|
|
286 |
|
|
|
287 |
dev.off() |
|
|
288 |
save.image("G:/FL_resequncing/FL_exome_final/fl_latest/11_pyclone/BED_bam/counts_new/CPC_codingPlusUTR_pyclone/All_codingPlusUTR/All_codingPlusUTR_i1/Figure4B_20191112.RData") |
|
|
289 |
|
|
|
290 |
###updated on 2019.11.15 |
|
|
291 |
|
|
|
292 |
|
|
|
293 |
|
|
|
294 |
|
|
|
295 |
pdf(file="CPC_plots/24genes_groups_20191203.pdf",width=6, height=5.8) |
|
|
296 |
plot_grid(all.plot, |
|
|
297 |
clinic_plot, align="v" |
|
|
298 |
|
|
|
299 |
, nrow = 2,rel_heights =c(14,2)) |
|
|
300 |
dev.off() |