Diff of /Figures/Figure3.R [000000] .. [4ff317]

Switch to unified view

a b/Figures/Figure3.R
1
2
################################################
3
#  File Name:Figure3.R
4
#  Author: Baoyan Bai
5
6
7
#################################################
8
9
####Integrate three different mutation files:
10
#1. original mutation calling file from Sigve
11
#   this files has been filtered by Ole-Christian'script and by Jillian's TCGA PON 
12
#2. raw vcf files from TSD. Many mutations in serial biopsies were in fact called, but rejcted for different reasons. 
13
#   I started to work this. Noticed some of mutations were still not called. But if looked at coverage, variant allele can be detected
14
15
#3. Pyclone result. Some mutations are missing due to CN=0 by ascat. I tried to recovery those mutations
16
#
17
# two mutations targeting on TBL1XR1 in P46 have wrong AF by IGV inspection. Those mutations were also manually added
18
19
#######Aim of 
20
21
#######1. calcuate which mutation is clonal, subclonal
22
#######2. Check whether's possible to detect the variants in control DNA and which group of mutations can be detected in blood DNAs
23
24
####data.frame empty.raw: raw mutation file.
25
####this file: from raw VCF files. add additional columns to indicate the mutation is shared by all biopsy for a given patient.
26
27
####data.frame empty pyclone result
28
###flcombined imported from 16_combined/fl42_combined_updated010719.txt the latest mutation file, original file from Sigve
29
30
library(plyr)
31
library(dplyr)
32
library(ggplot2)
33
34
35
setwd("G:/FL_resequncing/FL_exome_final/fl_latest/11_pyclone/BED_bam/counts_new/CPC_codingPlusUTR_pyclone/All_codingPlusUTR/All_codingPlusUTR_i1")
36
flcombined<- read.csv(file="../fl_latest/16_combined/fl44_reseq_final_060619.txt", sep="\t",header=T, stringsAsFactors = F)
37
38
mutation<- unique(flcombined%>%select(Patient, Gene, SYMBOL,VAR_ID, CDS_CHANGE,HGVSp_short,Region,
39
                                      VARIANT,VARIANT_CLASS))
40
mutation$chr<- as.character(lapply(strsplit(as.character(mutation$VAR_ID), "\\_"),"[",1))
41
mutation$star<- as.character(lapply(strsplit(as.character(mutation$VAR_ID), "\\_"),"[",2))
42
mutation$mutation_id<- paste(mutation$chr, mutation$star,sep="_")
43
44
45
###process JP6 seperately because JP6 does not have raw VCF file
46
JP6_anno<- mutation%>%filter(Patient=="P6B")
47
#########annotate pyclone result
48
pyclone.anno<- merge(empty, mutation, by=c("Patient", "mutation_id"))
49
50
###raw vcf df
51
###mutation calling
52
empty.raw$mutation_id<- paste(empty.raw$V1, empty.raw$V2,sep="_")
53
54
raw.vcf<- empty.raw%>%select(Patient,SAMPLE,mutation_id, VAR_ID, V7,V8, num_biopsy,sharedbyAll )
55
56
raw.vcf.anno<- merge(raw.vcf, mutation, by=c("Patient", "VAR_ID"))
57
58
##extract only coding
59
raw.vcf.anno.coding<- raw.vcf.anno%>%filter((Region=="Coding")| (VARIANT=="3'UTR") | (VARIANT=="5'UTR"))
60
61
#########
62
fl32<- subset(raw.vcf.anno.coding, Patient %in% unique(pyclone.anno$ Patient))
63
names(fl32)[names(fl32)=="mutation_id.x"]<- "mutation_id"
64
65
pyclone.df<- pyclone.anno%>%select(SAMPLE, mutation_id, cellular_prevalence,variant_allele_frequency, mean, class)
66
67
68
fl32.pyclone<- merge(fl32, pyclone.df, by=c("SAMPLE", "mutation_id"), all.x = T)
69
70
fl32.pyclone.removed<- fl32.pyclone%>%filter(is.na(cellular_prevalence))
71
missed_gene<- c("TAS1R3","ATAD3C","MMP23B","MORN1","HES2","TRAV8-6","SNRPA1","DLL4","VPS39","PAQR5","CEMIP","AMELY","ANHX","ATAD3C","B3GALT6","BRCA2","CBLB","CEMIP","DLL4","ESPN","FAM46A","HES2","IL5RA","IL9R","LCE1E","LEPROTL1","MAP1A","MMP23B","MORN1","PAQR5",
72
                "PPIAL4G","RAB7A","RORC","SNRPA1","TAS1R3","TNFRSF14","TRAV8-6","TRMT10A","TSPY1","USF3","USP9Y","VPREB1","VPS39","ZBTB20","ZNF268")
73
74
fl32.missed<- subset(fl32.pyclone.removed, SYMBOL %in%missed_gene)
75
fl32.ig<- setdiff(fl32.pyclone.removed, fl32.missed)
76
77
#######have to manually add clonal informations for missed genes
78
write.table(fl32.missed,file="clonal_gene/fl32.missed_genes.txt",sep="\t",row.names=F, quote=F )
79
########read manually revised genes
80
fl32.missed<- read.table(file="clonal_gene/fl32.missed_genes_manulrevised.txt",sep="\t", quote="",header=T,stringsAsFactors = F)
81
82
83
fl32.pyclone.kept<- fl32.pyclone%>%filter(!is.na(cellular_prevalence))
84
85
fl32.pyclone.new<- rbind(fl32.pyclone.kept, fl32.missed)
86
87
########defining mutation order: early dominant, dominant, subclonal
88
89
90
###
91
biopsy_empty_new<- fl32.pyclone.new[-FALSE,]
92
for (i in unique(fl32.pyclone.new$SAMPLE)){
93
  
94
  biopsy<- subset(fl32.pyclone.new, SAMPLE==i)
95
  biopsy_specific<- biopsy%>%filter(variant_allele_frequency>0)
96
  biopsy_specific$clonal<- ifelse(biopsy_specific$cellular_prevalence>=0.5 | biopsy_specific$mean>=0.5, "dominant", "subclonal")
97
  biopsy_empty_new<- rbind(biopsy_empty_new, biopsy_specific)
98
  
99
}
100
101
102
removed<- fl32.pyclone.new%>%filter(variant_allele_frequency==0)
103
##remove"TBL1XR1", discard_rescue_TBL1XR1
104
105
recover<- subset(removed, class=="discard_rescue_TBL1XR1")
106
recover$clonal<- "dominant"
107
108
fl32.missed$clonal<- fl32.missed$class
109
110
######
111
fl32.all<- rbind( biopsy_empty_new, recover,fl32.missed)
112
113
mutation_af<- ddply(fl32.all, .(mutation_id, Patient), summarise, min_CF=min(cellular_prevalence))
114
115
fl32.all.af<- merge(fl32.all, mutation_af,by=c("Patient","mutation_id"))
116
117
118
fl32.all.af$clonal[fl32.all.af$min_CF>0.5 & fl32.all.af$sharedbyAll=="Y" ]<-"early_dominant"
119
120
###corrected on April 5th 2019
121
fl32.all.af$clonal[fl32.all.af$class=="CPC_dominant_lost_LOH" & fl32.all.af$clonal=="dominant"]<-"early_dominant"
122
fl32.all.af$clonal[fl32.all.af$class=="CPC_dominant"]<- "early_dominant"
123
124
125
fl32.final<- fl32.all.af[,-c(16,17,18)]
126
##########process JP6 ,see previous
127
128
JP6_final<- JP6_df[,-c(4,5,7,9,11,13,14,15,16,17,18,19)]
129
130
JP6_final<- ddply(JP6_final, .(mutation_id), mutate, min_CF=min(cellular_prevalence))
131
fl32.all.final<-rbind(fl32.final,JP6_final)
132
133
##########
134
write.table(fl32.all.final, file="clonal_gene/fl32.all.coding_040319.txt",sep="\t", row.names=F,quote=F)
135
write.table(fl32.all.final, file="clonal_gene/fl32.all.coding_040519.txt",sep="\t", row.names=F,quote=F)
136
137
138
####plotting
139
fl32<- read.table(file="clonal_gene/fl32.all.coding_040519.txt",sep="\t",header=T,quote="",stringsAsFactors = F)
140
fl32.dominant<- fl32%>%filter(clonal !="subclonal")
141
fl32.dominant.ns<- fl32.dominant%>%filter(Region=="Coding" & VARIANT!="Silent")%>%filter(class!="discard")
142
143
fl32.dominant.ns.df<- unique(fl32.dominant.ns%>%select(Patient, SYMBOL,clonal))
144
145
fl32.dominant.ns.df.count<- ddply(fl32.dominant.ns.df, .(SYMBOL),mutate, num_pat=length(unique(Patient)))
146
147
clinic<- read.xlsx(file="../../../../../../2_clinic_infor/FL_Baoyan_clinical_file_FINAL_020719 - survival.xlsx",sheetIndex = 1)
148
clinic$group<- ifelse(is.na(clinic$Date_transf), "nFL", "tFL")
149
clinic.reduced<- clinic%>%select(Patient_ID, group, POD24)
150
names(clinic.reduced)<- c("Patient", "group", "POD24")
151
152
fl32.dominant.clinic<- merge(fl32.dominant.ns.df.count,clinic.reduced,by="Patient", all.x=T )
153
#########
154
discard_genes<- c("TTN","MUC16","MUC12")
155
156
fl32.dominant.clinic.clean<- subset(fl32.dominant.clinic, !SYMBOL %in% discard_genes)
157
#######taking genes mutated in at least 4 cases
158
159
160
dominant_4cases<- fl32.dominant.clinic.clean%>%filter(num_pat>=4)
161
dominant_4cases$SYMBOL<- reorder(dominant_4cases$SYMBOL, dominant_4cases$num_pat)
162
163
164
dominant_4cases$SYMBOL_ordered <- with(dominant_4cases, reorder(SYMBOL, SYMBOL, function(x) -length(x)))
165
####sepearte nFL and tFL
166
167
dominant_4cases.nFL<- dominant_4cases%>%filter(group=="nFL")
168
169
tmp.nFL<-ddply(dominant_4cases.nFL, .(Patient,SYMBOL), mutate, status=length(Patient))
170
tmp.nFL$clonal_correct<-tmp.nFL$clonal
171
tmp.nFL$clonal_correct[tmp.nFL$status=="2"]<-"Both"
172
173
nFL_plot<- unique(tmp.nFL[, -c(3)])
174
nFL_plot$clonal_correct[nFL_plot$clonal_correct=="dominant"]<- "Late"
175
nFL_plot$clonal_correct[nFL_plot$clonal_correct=="early_dominant"]<- "CPC"
176
nFL_plot$clonal_correct<-factor(nFL_plot$clonal_correct, levels=c("Both","Late","CPC"))
177
178
179
nFL<-ggplot(nFL_plot, aes(x=SYMBOL,fill=clonal_correct,drop=FALSE))+geom_bar(width=0.5) +
180
  scale_fill_manual(values=c("dark blue","pink","purple" ))+theme_bw()+coord_flip()+
181
  theme(legend.position = "none",
182
        axis.text.y=element_blank(),
183
        axis.ticks.y = element_blank(),
184
        axis.text.x = element_text(size = 18,face="bold"),
185
        legend.text=element_text(size=8,face="bold"),
186
        legend.title = element_text(size=8, face="bold"),
187
        plot.margin = unit(c(0.2,0,0.2,0.2), "cm"),
188
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
189
        panel.border= element_rect(color="blue",size=2))+
190
  scale_x_discrete(drop=FALSE)+
191
  scale_y_continuous(breaks = c(0,5,10,15))+xlab("")+ylab("")
192
193
194
195
############separate tFL group
196
197
dominant_4cases.tFL<- dominant_4cases%>%filter(group=="tFL")
198
tmp.tFL<-ddply(dominant_4cases.tFL, .(Patient,SYMBOL), mutate, status=length(Patient))
199
tmp.tFL$clonal_correct<-tmp.tFL$clonal
200
tmp.tFL$clonal_correct[tmp.tFL$status=="2"]<-"Both"
201
202
tFL_plot<- unique(tmp.tFL[, -c(3)])
203
tFL_plot$clonal_correct[tFL_plot$clonal_correct=="dominant"]<- "Late"
204
tFL_plot$clonal_correct[tFL_plot$clonal_correct=="early_dominant"]<- "CPC"
205
tFL_plot$clonal_correct<-factor(tFL_plot$clonal_correct, levels=c("Both","Late","CPC"))
206
207
tFL<-ggplot(tFL_plot, aes(x=SYMBOL,fill=clonal_correct))+geom_bar(width=0.5) +
208
  scale_fill_manual(values=c("dark blue","pink","purple" ))+coord_flip()+theme_bw()+
209
  theme(legend.position = "none",
210
        axis.text.y=element_blank(),
211
        axis.ticks.y = element_blank(),
212
        axis.text.x = element_text(size = 18,face="bold"),
213
        legend.text=element_text(size=8,face="bold"),
214
        legend.title = element_text(size=8, face="bold"),
215
        plot.margin = unit(c(0.2,0.2,0.2,-0.1), "cm"),
216
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
217
        panel.border=element_rect(color="red", size=2))+xlab("")+ylab("")+
218
  scale_x_discrete(drop=FALSE)+scale_y_continuous(breaks = c(0,5,10,15))
219
220
pdf(file="CPC_plots/24genes_freq_groups_20191203.pdf",width=4, height=6)
221
plot_grid(nFL, 
222
          tFL, align="h"
223
          
224
          , nrow = 1,rel_widths=c(1,1))
225
dev.off()
226
#######try to plot them together, waterfall plot, frequency plot, clinic data
227
all<- rbind(nFL_plot, tFL_plot)
228
229
patient_order_nFL<- unique(fl32.dominant.clinic.clean$Patient[fl32.dominant.clinic.clean$group=="nFL"])
230
231
patient_order_tFL<- unique(fl32.dominant.clinic.clean$Patient[fl32.dominant.clinic.clean$group=="tFL"])
232
233
all$Patient<- factor(all$Patient, levels=c(patient_order_nFL,patient_order_tFL))
234
all$clonal_correct<- factor(all$clonal_correct,levels=c("Both", "CPC", "Late", "NA"))
235
all.tmp<- all%>%select(Patient, SYMBOL, clonal_correct)
236
all.tmp.cast<- cast(all.tmp, Patient~SYMBOL)
237
238
all.melt.again<- melt(all.tmp.cast, id.vars="Patient")
239
all.melt.again$value<- factor(all.melt.again$value, levels=c("Both", "CPC", "Late"))
240
all.plot<- ggplot(all.melt.again, aes(x=Patient, y=SYMBOL, fill=value))+
241
  geom_tile(color="gray", size=0.1)+ scale_fill_manual(values=c("dark blue","purple","pink"),drop=FALSE)+scale_x_discrete(drop=FALSE)+
242
  theme(panel.background = element_blank(),
243
        axis.text.x=element_blank(),
244
        axis.ticks.x=element_blank(),
245
        axis.text.y =element_text(size=15,face="bold"),
246
        legend.position = "NULL",
247
        plot.margin = unit(c(0.5,-0.5,-0.5,0.5), "cm"))+ xlab("")+ylab("")
248
249
clinic.32<-unique(fl32.dominant.clinic.clean%>%select(Patient, group, POD24))
250
251
names(clinic.32)<- c("Patient", "Group", "POD24")
252
clinic.32$POD24<- factor(clinic.32$POD24, levels=c(0,1))
253
clinic.32$Patient<- factor(clinic.32$Patient, levels=c(patient_order_nFL,patient_order_tFL))
254
clinic.32.melt<- melt(clinic.32, id.vars = "Patient")
255
256
clinic_plot<- ggplot(clinic.32.melt, aes(x=Patient, y=variable, fill=value))+
257
  geom_tile()+
258
  scale_fill_manual(values=c("blue", "red", "grey","black"))+
259
  
260
  theme(panel.background = element_blank(),
261
        axis.text.x = element_blank(),
262
        axis.text.y =element_text(size=16,face="bold"),
263
        axis.ticks.x=element_blank(),
264
        legend.position = "none",
265
        plot.margin = unit(c(-0.1,0.5,-0.5,0.5), "cm"))+
266
  xlab("")+ylab("")
267
268
library(egg)
269
270
271
272
####legend
273
tFL_plot$clonal_correct<- factor(tFL_plot$clonal_correct, levels=c("Both", "CPC", "Late"))
274
pdf(file="CPC_plots/Figure4B_20191112_legend.pdf",width=12,height=13)
275
ggplot(tFL_plot, aes(x=SYMBOL,fill=clonal_correct))+geom_bar(width=0.5) +
276
  scale_fill_manual(values=c("dark blue","purple","pink"))+coord_flip()+theme_bw()+
277
  theme(
278
    axis.text.y=element_blank(),
279
    axis.ticks.y = element_blank(),
280
    axis.text.x = element_text(size = 18,face="bold"),
281
    legend.text=element_text(size=14,face="bold"),
282
    legend.title = element_text(size=16, face="bold"),
283
    plot.margin = unit(c(0.2,0.2,0.2,-0.2), "cm"),
284
    panel.grid.major = element_blank(), panel.grid.minor = element_blank())+xlab("")+ylab("")+labs(fill = "Timing")+
285
  scale_x_discrete(drop=FALSE)+panel_border(color="red", size=2)+scale_y_continuous(breaks = c(0,5,10,15))
286
287
dev.off()
288
save.image("G:/FL_resequncing/FL_exome_final/fl_latest/11_pyclone/BED_bam/counts_new/CPC_codingPlusUTR_pyclone/All_codingPlusUTR/All_codingPlusUTR_i1/Figure4B_20191112.RData")
289
290
###updated on 2019.11.15
291
292
293
294
295
pdf(file="CPC_plots/24genes_groups_20191203.pdf",width=6, height=5.8)
296
plot_grid(all.plot, 
297
          clinic_plot, align="v"
298
          
299
          , nrow = 2,rel_heights =c(14,2))
300
dev.off()