Diff of /Figures/Figure3.R [000000] .. [4ff317]

Switch to side-by-side view

--- a
+++ b/Figures/Figure3.R
@@ -0,0 +1,300 @@
+
+################################################
+#  File Name:Figure3.R
+#  Author: Baoyan Bai
+
+
+#################################################
+
+####Integrate three different mutation files:
+#1. original mutation calling file from Sigve
+#   this files has been filtered by Ole-Christian'script and by Jillian's TCGA PON 
+#2. raw vcf files from TSD. Many mutations in serial biopsies were in fact called, but rejcted for different reasons. 
+#   I started to work this. Noticed some of mutations were still not called. But if looked at coverage, variant allele can be detected
+
+#3. Pyclone result. Some mutations are missing due to CN=0 by ascat. I tried to recovery those mutations
+#
+# two mutations targeting on TBL1XR1 in P46 have wrong AF by IGV inspection. Those mutations were also manually added
+
+#######Aim of 
+
+#######1. calcuate which mutation is clonal, subclonal
+#######2. Check whether's possible to detect the variants in control DNA and which group of mutations can be detected in blood DNAs
+
+####data.frame empty.raw: raw mutation file.
+####this file: from raw VCF files. add additional columns to indicate the mutation is shared by all biopsy for a given patient.
+
+####data.frame empty pyclone result
+###flcombined imported from 16_combined/fl42_combined_updated010719.txt the latest mutation file, original file from Sigve
+
+library(plyr)
+library(dplyr)
+library(ggplot2)
+
+
+setwd("G:/FL_resequncing/FL_exome_final/fl_latest/11_pyclone/BED_bam/counts_new/CPC_codingPlusUTR_pyclone/All_codingPlusUTR/All_codingPlusUTR_i1")
+flcombined<- read.csv(file="../fl_latest/16_combined/fl44_reseq_final_060619.txt", sep="\t",header=T, stringsAsFactors = F)
+
+mutation<- unique(flcombined%>%select(Patient, Gene, SYMBOL,VAR_ID, CDS_CHANGE,HGVSp_short,Region,
+                                      VARIANT,VARIANT_CLASS))
+mutation$chr<- as.character(lapply(strsplit(as.character(mutation$VAR_ID), "\\_"),"[",1))
+mutation$star<- as.character(lapply(strsplit(as.character(mutation$VAR_ID), "\\_"),"[",2))
+mutation$mutation_id<- paste(mutation$chr, mutation$star,sep="_")
+
+
+###process JP6 seperately because JP6 does not have raw VCF file
+JP6_anno<- mutation%>%filter(Patient=="P6B")
+#########annotate pyclone result
+pyclone.anno<- merge(empty, mutation, by=c("Patient", "mutation_id"))
+
+###raw vcf df
+###mutation calling
+empty.raw$mutation_id<- paste(empty.raw$V1, empty.raw$V2,sep="_")
+
+raw.vcf<- empty.raw%>%select(Patient,SAMPLE,mutation_id, VAR_ID, V7,V8, num_biopsy,sharedbyAll )
+
+raw.vcf.anno<- merge(raw.vcf, mutation, by=c("Patient", "VAR_ID"))
+
+##extract only coding
+raw.vcf.anno.coding<- raw.vcf.anno%>%filter((Region=="Coding")| (VARIANT=="3'UTR") | (VARIANT=="5'UTR"))
+
+#########
+fl32<- subset(raw.vcf.anno.coding, Patient %in% unique(pyclone.anno$ Patient))
+names(fl32)[names(fl32)=="mutation_id.x"]<- "mutation_id"
+
+pyclone.df<- pyclone.anno%>%select(SAMPLE, mutation_id, cellular_prevalence,variant_allele_frequency, mean, class)
+
+
+fl32.pyclone<- merge(fl32, pyclone.df, by=c("SAMPLE", "mutation_id"), all.x = T)
+
+fl32.pyclone.removed<- fl32.pyclone%>%filter(is.na(cellular_prevalence))
+missed_gene<- c("TAS1R3","ATAD3C","MMP23B","MORN1","HES2","TRAV8-6","SNRPA1","DLL4","VPS39","PAQR5","CEMIP","AMELY","ANHX","ATAD3C","B3GALT6","BRCA2","CBLB","CEMIP","DLL4","ESPN","FAM46A","HES2","IL5RA","IL9R","LCE1E","LEPROTL1","MAP1A","MMP23B","MORN1","PAQR5",
+                "PPIAL4G","RAB7A","RORC","SNRPA1","TAS1R3","TNFRSF14","TRAV8-6","TRMT10A","TSPY1","USF3","USP9Y","VPREB1","VPS39","ZBTB20","ZNF268")
+
+fl32.missed<- subset(fl32.pyclone.removed, SYMBOL %in%missed_gene)
+fl32.ig<- setdiff(fl32.pyclone.removed, fl32.missed)
+
+#######have to manually add clonal informations for missed genes
+write.table(fl32.missed,file="clonal_gene/fl32.missed_genes.txt",sep="\t",row.names=F, quote=F )
+########read manually revised genes
+fl32.missed<- read.table(file="clonal_gene/fl32.missed_genes_manulrevised.txt",sep="\t", quote="",header=T,stringsAsFactors = F)
+
+
+fl32.pyclone.kept<- fl32.pyclone%>%filter(!is.na(cellular_prevalence))
+
+fl32.pyclone.new<- rbind(fl32.pyclone.kept, fl32.missed)
+
+########defining mutation order: early dominant, dominant, subclonal
+
+
+###
+biopsy_empty_new<- fl32.pyclone.new[-FALSE,]
+for (i in unique(fl32.pyclone.new$SAMPLE)){
+  
+  biopsy<- subset(fl32.pyclone.new, SAMPLE==i)
+  biopsy_specific<- biopsy%>%filter(variant_allele_frequency>0)
+  biopsy_specific$clonal<- ifelse(biopsy_specific$cellular_prevalence>=0.5 | biopsy_specific$mean>=0.5, "dominant", "subclonal")
+  biopsy_empty_new<- rbind(biopsy_empty_new, biopsy_specific)
+  
+}
+
+
+removed<- fl32.pyclone.new%>%filter(variant_allele_frequency==0)
+##remove"TBL1XR1", discard_rescue_TBL1XR1
+
+recover<- subset(removed, class=="discard_rescue_TBL1XR1")
+recover$clonal<- "dominant"
+
+fl32.missed$clonal<- fl32.missed$class
+
+######
+fl32.all<- rbind( biopsy_empty_new, recover,fl32.missed)
+
+mutation_af<- ddply(fl32.all, .(mutation_id, Patient), summarise, min_CF=min(cellular_prevalence))
+
+fl32.all.af<- merge(fl32.all, mutation_af,by=c("Patient","mutation_id"))
+
+
+fl32.all.af$clonal[fl32.all.af$min_CF>0.5 & fl32.all.af$sharedbyAll=="Y" ]<-"early_dominant"
+
+###corrected on April 5th 2019
+fl32.all.af$clonal[fl32.all.af$class=="CPC_dominant_lost_LOH" & fl32.all.af$clonal=="dominant"]<-"early_dominant"
+fl32.all.af$clonal[fl32.all.af$class=="CPC_dominant"]<- "early_dominant"
+
+
+fl32.final<- fl32.all.af[,-c(16,17,18)]
+##########process JP6 ,see previous
+
+JP6_final<- JP6_df[,-c(4,5,7,9,11,13,14,15,16,17,18,19)]
+
+JP6_final<- ddply(JP6_final, .(mutation_id), mutate, min_CF=min(cellular_prevalence))
+fl32.all.final<-rbind(fl32.final,JP6_final)
+
+##########
+write.table(fl32.all.final, file="clonal_gene/fl32.all.coding_040319.txt",sep="\t", row.names=F,quote=F)
+write.table(fl32.all.final, file="clonal_gene/fl32.all.coding_040519.txt",sep="\t", row.names=F,quote=F)
+
+
+####plotting
+fl32<- read.table(file="clonal_gene/fl32.all.coding_040519.txt",sep="\t",header=T,quote="",stringsAsFactors = F)
+fl32.dominant<- fl32%>%filter(clonal !="subclonal")
+fl32.dominant.ns<- fl32.dominant%>%filter(Region=="Coding" & VARIANT!="Silent")%>%filter(class!="discard")
+
+fl32.dominant.ns.df<- unique(fl32.dominant.ns%>%select(Patient, SYMBOL,clonal))
+
+fl32.dominant.ns.df.count<- ddply(fl32.dominant.ns.df, .(SYMBOL),mutate, num_pat=length(unique(Patient)))
+
+clinic<- read.xlsx(file="../../../../../../2_clinic_infor/FL_Baoyan_clinical_file_FINAL_020719 - survival.xlsx",sheetIndex = 1)
+clinic$group<- ifelse(is.na(clinic$Date_transf), "nFL", "tFL")
+clinic.reduced<- clinic%>%select(Patient_ID, group, POD24)
+names(clinic.reduced)<- c("Patient", "group", "POD24")
+
+fl32.dominant.clinic<- merge(fl32.dominant.ns.df.count,clinic.reduced,by="Patient", all.x=T )
+#########
+discard_genes<- c("TTN","MUC16","MUC12")
+
+fl32.dominant.clinic.clean<- subset(fl32.dominant.clinic, !SYMBOL %in% discard_genes)
+#######taking genes mutated in at least 4 cases
+
+
+dominant_4cases<- fl32.dominant.clinic.clean%>%filter(num_pat>=4)
+dominant_4cases$SYMBOL<- reorder(dominant_4cases$SYMBOL, dominant_4cases$num_pat)
+
+
+dominant_4cases$SYMBOL_ordered <- with(dominant_4cases, reorder(SYMBOL, SYMBOL, function(x) -length(x)))
+####sepearte nFL and tFL
+
+dominant_4cases.nFL<- dominant_4cases%>%filter(group=="nFL")
+
+tmp.nFL<-ddply(dominant_4cases.nFL, .(Patient,SYMBOL), mutate, status=length(Patient))
+tmp.nFL$clonal_correct<-tmp.nFL$clonal
+tmp.nFL$clonal_correct[tmp.nFL$status=="2"]<-"Both"
+
+nFL_plot<- unique(tmp.nFL[, -c(3)])
+nFL_plot$clonal_correct[nFL_plot$clonal_correct=="dominant"]<- "Late"
+nFL_plot$clonal_correct[nFL_plot$clonal_correct=="early_dominant"]<- "CPC"
+nFL_plot$clonal_correct<-factor(nFL_plot$clonal_correct, levels=c("Both","Late","CPC"))
+
+
+nFL<-ggplot(nFL_plot, aes(x=SYMBOL,fill=clonal_correct,drop=FALSE))+geom_bar(width=0.5) +
+  scale_fill_manual(values=c("dark blue","pink","purple" ))+theme_bw()+coord_flip()+
+  theme(legend.position = "none",
+        axis.text.y=element_blank(),
+        axis.ticks.y = element_blank(),
+        axis.text.x = element_text(size = 18,face="bold"),
+        legend.text=element_text(size=8,face="bold"),
+        legend.title = element_text(size=8, face="bold"),
+        plot.margin = unit(c(0.2,0,0.2,0.2), "cm"),
+        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+        panel.border= element_rect(color="blue",size=2))+
+  scale_x_discrete(drop=FALSE)+
+  scale_y_continuous(breaks = c(0,5,10,15))+xlab("")+ylab("")
+
+
+
+############separate tFL group
+
+dominant_4cases.tFL<- dominant_4cases%>%filter(group=="tFL")
+tmp.tFL<-ddply(dominant_4cases.tFL, .(Patient,SYMBOL), mutate, status=length(Patient))
+tmp.tFL$clonal_correct<-tmp.tFL$clonal
+tmp.tFL$clonal_correct[tmp.tFL$status=="2"]<-"Both"
+
+tFL_plot<- unique(tmp.tFL[, -c(3)])
+tFL_plot$clonal_correct[tFL_plot$clonal_correct=="dominant"]<- "Late"
+tFL_plot$clonal_correct[tFL_plot$clonal_correct=="early_dominant"]<- "CPC"
+tFL_plot$clonal_correct<-factor(tFL_plot$clonal_correct, levels=c("Both","Late","CPC"))
+
+tFL<-ggplot(tFL_plot, aes(x=SYMBOL,fill=clonal_correct))+geom_bar(width=0.5) +
+  scale_fill_manual(values=c("dark blue","pink","purple" ))+coord_flip()+theme_bw()+
+  theme(legend.position = "none",
+        axis.text.y=element_blank(),
+        axis.ticks.y = element_blank(),
+        axis.text.x = element_text(size = 18,face="bold"),
+        legend.text=element_text(size=8,face="bold"),
+        legend.title = element_text(size=8, face="bold"),
+        plot.margin = unit(c(0.2,0.2,0.2,-0.1), "cm"),
+        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
+        panel.border=element_rect(color="red", size=2))+xlab("")+ylab("")+
+  scale_x_discrete(drop=FALSE)+scale_y_continuous(breaks = c(0,5,10,15))
+
+pdf(file="CPC_plots/24genes_freq_groups_20191203.pdf",width=4, height=6)
+plot_grid(nFL, 
+          tFL, align="h"
+          
+          , nrow = 1,rel_widths=c(1,1))
+dev.off()
+#######try to plot them together, waterfall plot, frequency plot, clinic data
+all<- rbind(nFL_plot, tFL_plot)
+
+patient_order_nFL<- unique(fl32.dominant.clinic.clean$Patient[fl32.dominant.clinic.clean$group=="nFL"])
+
+patient_order_tFL<- unique(fl32.dominant.clinic.clean$Patient[fl32.dominant.clinic.clean$group=="tFL"])
+
+all$Patient<- factor(all$Patient, levels=c(patient_order_nFL,patient_order_tFL))
+all$clonal_correct<- factor(all$clonal_correct,levels=c("Both", "CPC", "Late", "NA"))
+all.tmp<- all%>%select(Patient, SYMBOL, clonal_correct)
+all.tmp.cast<- cast(all.tmp, Patient~SYMBOL)
+
+all.melt.again<- melt(all.tmp.cast, id.vars="Patient")
+all.melt.again$value<- factor(all.melt.again$value, levels=c("Both", "CPC", "Late"))
+all.plot<- ggplot(all.melt.again, aes(x=Patient, y=SYMBOL, fill=value))+
+  geom_tile(color="gray", size=0.1)+ scale_fill_manual(values=c("dark blue","purple","pink"),drop=FALSE)+scale_x_discrete(drop=FALSE)+
+  theme(panel.background = element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank(),
+        axis.text.y =element_text(size=15,face="bold"),
+        legend.position = "NULL",
+        plot.margin = unit(c(0.5,-0.5,-0.5,0.5), "cm"))+ xlab("")+ylab("")
+
+clinic.32<-unique(fl32.dominant.clinic.clean%>%select(Patient, group, POD24))
+
+names(clinic.32)<- c("Patient", "Group", "POD24")
+clinic.32$POD24<- factor(clinic.32$POD24, levels=c(0,1))
+clinic.32$Patient<- factor(clinic.32$Patient, levels=c(patient_order_nFL,patient_order_tFL))
+clinic.32.melt<- melt(clinic.32, id.vars = "Patient")
+
+clinic_plot<- ggplot(clinic.32.melt, aes(x=Patient, y=variable, fill=value))+
+  geom_tile()+
+  scale_fill_manual(values=c("blue", "red", "grey","black"))+
+  
+  theme(panel.background = element_blank(),
+        axis.text.x = element_blank(),
+        axis.text.y =element_text(size=16,face="bold"),
+        axis.ticks.x=element_blank(),
+        legend.position = "none",
+        plot.margin = unit(c(-0.1,0.5,-0.5,0.5), "cm"))+
+  xlab("")+ylab("")
+
+library(egg)
+
+
+
+####legend
+tFL_plot$clonal_correct<- factor(tFL_plot$clonal_correct, levels=c("Both", "CPC", "Late"))
+pdf(file="CPC_plots/Figure4B_20191112_legend.pdf",width=12,height=13)
+ggplot(tFL_plot, aes(x=SYMBOL,fill=clonal_correct))+geom_bar(width=0.5) +
+  scale_fill_manual(values=c("dark blue","purple","pink"))+coord_flip()+theme_bw()+
+  theme(
+    axis.text.y=element_blank(),
+    axis.ticks.y = element_blank(),
+    axis.text.x = element_text(size = 18,face="bold"),
+    legend.text=element_text(size=14,face="bold"),
+    legend.title = element_text(size=16, face="bold"),
+    plot.margin = unit(c(0.2,0.2,0.2,-0.2), "cm"),
+    panel.grid.major = element_blank(), panel.grid.minor = element_blank())+xlab("")+ylab("")+labs(fill = "Timing")+
+  scale_x_discrete(drop=FALSE)+panel_border(color="red", size=2)+scale_y_continuous(breaks = c(0,5,10,15))
+
+dev.off()
+save.image("G:/FL_resequncing/FL_exome_final/fl_latest/11_pyclone/BED_bam/counts_new/CPC_codingPlusUTR_pyclone/All_codingPlusUTR/All_codingPlusUTR_i1/Figure4B_20191112.RData")
+
+###updated on 2019.11.15
+
+
+
+
+pdf(file="CPC_plots/24genes_groups_20191203.pdf",width=6, height=5.8)
+plot_grid(all.plot, 
+          clinic_plot, align="v"
+          
+          , nrow = 2,rel_heights =c(14,2))
+dev.off()