--- a +++ b/Analysis_1.R @@ -0,0 +1,703 @@ +library(openxlsx) +library(plyr) +library(dplyr) +library(data.table) +library(caret) +library(cvTools) + +## get the job ID +r <- as.numeric(Sys.getenv("SGE_TASK_ID")) +# r is the interger from 1 to 10 + +# the number of CV folds +cvRuns = 10 + +# load SNPs +load("SNPs_20171024.rda") +#============== +# input files +# file that contains information about blacklist mutations to exclude +#blacklistFile = "blacklist_from_July-17-2017.xlsx" +blacklistFile = "blacklist_from_October-23-2017.xlsx" + +# file with WBC data +wbcFile = "WBC Mutations (Filtered + Optical Duplicates Removed + Q30), October 6, 2017.csv" +# file with plasmaSeq data to analyze +#plasmaSeqDataFile = "MEGA PlasmaSeq Data for Lu, Filtered Mutations (Optical Duplicates Removed + Q30) with PT and WBC Data, October 6, 2017.csv" +plasmaSeqDataFile = "MEGA PlasmaSeq Data for Lu, Filtered Mutations (Optical Duplicates Removed + Q30) with PT and WBC Data, December 8, 2017.csv" + +#================================= +# START ANALYSIS +# blacklist mutations +blacklist = read.xlsx(blacklistFile, sheet = 1) +blacklist$mutID = paste(blacklist$Chrom,blacklist$Position,blacklist$BaseFrom,blacklist$BaseTo) +blacklist = blacklist$mutID + +#============== +# WBC data from separate file +#============== +wbcDataFromFile = fread(wbcFile) +wbcDataFromFile = data.frame(wbcDataFromFile) +wbcDataFromFile = wbcDataFromFile[!(wbcDataFromFile$ampMatchName %in% amplicon),] +colnames(wbcDataFromFile)[c(20,14,17)] = c("TotalUID", "UID1","UID2") +## all plasma samples with matched WBC +matched = unique(wbcDataFromFile$Matched.Plasma) +matched = matched[!is.na(matched)] + +wbcDataFromFile$mutID = paste(wbcDataFromFile$Chrom,wbcDataFromFile$Position,wbcDataFromFile$BaseFrom,wbcDataFromFile$BaseTo) + +## deleting NA rows and snps from WBC mutations +wbcDataFromFile = wbcDataFromFile[!is.na(wbcDataFromFile$Position),] +wbcDataFromFile = wbcDataFromFile[!(wbcDataFromFile$mutID %in% snps),] + +## remove duplicated row (same mutation in the same template) +wbcDataFromFile$remove = paste(wbcDataFromFile$Template,wbcDataFromFile$mutID) + +dupPairs <- names(which(table(wbcDataFromFile$remove)>1)) +newuniquenewnormal <- wbcDataFromFile%>% + filter(remove %in% dupPairs)%>% + group_by(remove)%>% + slice(which.max(TotalUID)) %>% + as.data.frame() + +wbcDataFromFile <- rbind(newuniquenewnormal, + wbcDataFromFile%>% + filter(!(remove %in% dupPairs))) + +wbcDataFromFile = data.frame(wbcDataFromFile) + +## numeric columns +col = c("SM1","UID1","SM2","UID2","TotalSM","TotalUID") + +wbcDataFromFile[,col] = apply(wbcDataFromFile[,col], 2, function(x) as.numeric(x)) +wbcDataFromFile$maf1 = wbcDataFromFile$SM1/wbcDataFromFile$UID1 +wbcDataFromFile$maf2 = wbcDataFromFile$SM2/wbcDataFromFile$UID2 + +############ if remove the requirement on the number of SM +## calculate number of positive wells defined more than 1 SM +wbcDataFromFile$Pwell = (wbcDataFromFile$SM1>1)+(wbcDataFromFile$SM2>1) +wbcDataFromFile = wbcDataFromFile[wbcDataFromFile$Pwell>=1,] +wbcDataFromFile = wbcDataFromFile[!is.na(wbcDataFromFile$Template),] + +## average MAF +wbcDataFromFile$aveMAF = wbcDataFromFile$TotalSM/wbcDataFromFile$TotalUID +maf = c("maf1","maf2") + +#wbcDataFromFile = wbcDataFromFile[wbcDataFromFile$TotalUID>=200,] + +## use wbc without matched plasma as control, excluding N18s +training = wbcDataFromFile[is.na(wbcDataFromFile$Matched.Plasma), ] +training = training[training$Template!="N18",] +#training = training[!(training$mutID %in% blacklist),] +training$SM3 = training$SM4 = training$SM5 = training$SM6 = NA +training$UID3 = training$UID4 = training$UID5 = training$UID6 = NA +training$maf3 = training$maf4 = training$maf5 = training$maf6 = NA + +## normal unmatched WBC mutations +training_wbc.nm = training + +## plasma templates with matched WBC +valid = wbcDataFromFile[!is.na(wbcDataFromFile$Matched.Plasma),] +## setting MAF = 0 for wells with less than 100 UIDs +valid$maf1[valid$UID1<100] = 0 +valid$maf2[valid$UID2<100] = 0 +## unique mutations ID (Plasma Template, mutID) +valid$plsid = paste(valid$Matched.Plasma,valid$mutID) + + +## calculate maximum MAF in the two wells for each mutation in the plasma +## could use average, instead of max +maxinWBC = ddply(valid, .(plsid), summarise, max = max(maf1,maf2)) + + +#============= +# Plasma Data +#============= +## read data from plasmas +plasmaData = fread(plasmaSeqDataFile) +plasmaData = data.frame(plasmaData) +plasmaData = plasmaData[!(plasmaData$ampMatchName %in% amplicon),] +colnames(plasmaData)[c(25,26,14,16,18,20,22,24)] = c("TotalSM","TotalUID", "UID1","UID2", "UID3","UID4","UID5","UID6") + +plasmaData$mutID = paste(plasmaData$Chrom,plasmaData$Position,plasmaData$BaseFrom,plasmaData$BaseTo) + +## deleting NA rows and snps +plasmaData = plasmaData[!is.na(plasmaData$Position),] +plasmaData = plasmaData[!(plasmaData$mutID %in% snps),] + +## remove duplicated row (same mutation in the same template) +plasmaData$remove = paste(plasmaData$Template,plasmaData$mutID) + +dupPairs <- names(which(table(plasmaData$remove)>1)) +newuniquenewnormal <- plasmaData%>% + filter(remove %in% dupPairs)%>% + group_by(remove)%>% + slice(which.max(TotalUID)) %>% + as.data.frame() + +data_corrected <- rbind(newuniquenewnormal, + plasmaData%>% + filter(!(remove %in% dupPairs))) + +data_corrected = data.frame(data_corrected) + +## numeric columns +col = c("SM1","UID1","SM2","UID2","SM3","UID3","SM4","UID4","SM5","UID5","SM6","UID6","TotalSM","TotalUID") + +#colnames(data)[c(17,20,23,26,29,32)] = c("maf1","maf2","maf3","maf4","maf5","maf6") + +## change to numeric +data_corrected[,col] = apply(data_corrected[,col], 2, function(x) as.numeric(x)) + +## calculate MAF +data_corrected$maf1 = data_corrected$SM1/data_corrected$UID1 +data_corrected$maf2 = data_corrected$SM2/data_corrected$UID2 +data_corrected$maf3 = data_corrected$SM3/data_corrected$UID3 +data_corrected$maf4 = data_corrected$SM4/data_corrected$UID4 +data_corrected$maf5 = data_corrected$SM5/data_corrected$UID5 +data_corrected$maf6 = data_corrected$SM6/data_corrected$UID6 + +## mark as character +data_corrected$remove = as.character(data_corrected$remove) + +## note some templates might only have 2 wells (like normal wbc in this study) +data_corrected$aveMAF = data_corrected$TotalSM/data_corrected$TotalUID +maf = c("maf1","maf2","maf3","maf4","maf5","maf6") +colnames(data_corrected)[6] = "Sample.Category" + +## remove water +data_corrected = data_corrected[data_corrected$Template!="Water",] + + +############# if remove the requirement on the number of SM +## calculating positive wells +data_corrected$Pwell = (data_corrected$SM1>1)+(data_corrected$SM2>1)+(data_corrected$SM3>1)+(data_corrected$SM4>1)+(data_corrected$SM5>1)+(data_corrected$SM6>1) +data_corrected = data_corrected[data_corrected$Pwell>=1,] +data_corrected = data_corrected[!is.na(data_corrected$Template),] + +## calculate indel length (SBS is 0) +## not used in this method +data_corrected$indel_length = 0 +data_corrected$indel_length[data_corrected$MutType=="Indel" & data_corrected$BaseFrom=="NULL"] = nchar(data_corrected$BaseTo[data_corrected$MutType=="Indel" & data_corrected$BaseFrom=="NULL"]) +data_corrected$indel_length[data_corrected$MutType=="Indel" & data_corrected$BaseTo=="NULL"] = nchar(data_corrected$BaseFrom[data_corrected$MutType=="Indel" & data_corrected$BaseTo=="NULL"]) + +data_corrected = data_corrected[!(data_corrected$Template %in% c("Water","N18")),] + +## all normals +# LD fix 10/16/2017 +nlpls = unique(data_corrected$Template[data_corrected$Sample.Category=="Normal"]) + +## mutatioins that have matched WBC in the WBC sheets +checkmut = data_corrected[!is.na(data_corrected$WBC.Sample),] +checkmut = checkmut[checkmut$Template %in% valid$Matched.Plasma,] + +## setting MAF=0 if less than 100 UIDs +checkmut$maf1[checkmut$UID1<100] = 0 +checkmut$maf2[checkmut$UID2<100] = 0 +checkmut$maf3[checkmut$UID3<100] = 0 +checkmut$maf4[checkmut$UID4<100] = 0 +checkmut$maf5[checkmut$UID5<100] = 0 +checkmut$maf6[checkmut$UID6<100] = 0 + +## max MAF in 6 wells from plasmas +maxinPls = ddply(checkmut, .(remove), summarise, max = max(maf1,maf2,maf3,maf4,maf5,maf6)) +## corresponding max in matched WBC +maxinPls$WBC = maxinWBC$max[match(maxinPls$remove,maxinWBC$plsid)] + +## mutations that are matched in plasma and WBC +maxinPls = maxinPls[!is.na(maxinPls$WBC),] +## calculate the ratio between matched plasma mutation and corresponding WBC mutation +maxinPls$ratio = maxinPls$max/maxinPls$WBC + +## "chip" mutations to be removed if the ratio is less than 100 +chip_r = 100 +## mutations to be removed +chip = maxinPls$remove[maxinPls$ratio<chip_r] + + + + set.seed(1234+r*10) + ## 10 folds cross validation + + ## cancer type considered in the study + cancer = c("CRC","Lung","Breast","Pancreas","Ovarian","Esophagus","Liver","Stomach","pancreas") + + ## all cancer templates in the cancer category + #pls.cancer = unique(data_corrected$Template[data_corrected$Category %in% cancer]) + # LD fix 10/16/2017 + pls.cancer = unique(data_corrected$Template[data_corrected$Sample.Category %in% cancer]) + + # cvRuns-fold cross-validation + index = cvFolds(length(nlpls), K=cvRuns) + index2 = cvFolds(length(pls.cancer), K=cvRuns) + + # run cross-validation + for( w in 1:cvRuns) + { + ## normals used as control + nlt = nlpls[index$subsets[index$which!=w]] + ## cancers used to build the cancer distribution + plsc.t = pls.cancer[index2$subsets[index2$which!=w]] + + ## normal plasma mutations + training_nlpls = data_corrected[data_corrected$Template %in% nlt,] + ## only mutations that are not on blacklist or caution list will be included in the normalization step and the distribution analysis + + ## WBC templates + wbc = unique(training_wbc.nm$Template) + col = intersect(colnames(training_wbc.nm),colnames(training_nlpls)) + + ## using the not matched wbc and 80% normal plasma as control to build the distribution for normals + training = rbind.data.frame(training_wbc.nm[,col],training_nlpls[col]) + + training$maf1[training$UID1<100] = 0 + training$maf2[training$UID2<100] = 0 + training$maf3[training$UID3<100] = 0 + training$maf4[training$UID4<100] = 0 + training$maf5[training$UID5<100] = 0 + training$maf6[training$UID6<100] = 0 + + ## eliminate "chip" mutations + training = training[!(training$remove %in% chip),] + ## calculate indel length + training$indel_length = 0 + training$indel_length[training$MutType=="Indel" & training$BaseFrom=="NULL"] = nchar(training$BaseTo[training$MutType=="Indel" & training$BaseFrom=="NULL"]) + training$indel_length[training$MutType=="Indel" & training$BaseTo=="NULL"] = nchar(training$BaseFrom[training$MutType=="Indel" & training$BaseTo=="NULL"]) + + ## 1 base pair indel + indels1 = unique(training$mutID[training$indel_length==1]) + ## more than 1 base pair indel + indels2 = unique(training$mutID[training$indel_length>=2]) + ## sbs mutations + sbs = unique(training$mutID[training$indel_length==0]) + + + + ########### Normalization ########### + mut = unique(training$mutID) + ## NB!!! 2 since normal wbc only have 2 wells + ## create matrix with mutations in normals vs the number of normals times 2 (because of 2 wells) + cc = 2*length(wbc) + 6*length(nlt) + mutation_info = matrix(0, nrow = length(mut), ncol = cc) + + ## fill in the matrix + maf_wbc = c("maf1","maf2","maf3","maf4","maf5","maf6") + for(i in 1:nrow(mutation_info)){ + temp = training[training$mutID==mut[i],] + temp1 = as.vector(t(temp[,maf_wbc])) + temp1 = temp1[!is.na(temp1)] + mutation_info[i,1:(length(temp1))] = temp1 + } + + ## count how many times the mutation is present in training. use for normalization only if present more than once + count = table(c(training$mutID[training$SM1>0], training$mutID[training$SM2>0], training$mutID[training$SM3>0],training$mutID[training$SM4>0],training$mutID[training$SM5>0],training$mutID[training$SM6>0])) + valid = names(count)[count>1] + + ## calculate the mean and max MAF for mutations in normals + normalization_mutation = data.frame(mutID = mut, stringsAsFactors = FALSE) + normalization_mutation$mean = 0 + normalization_mutation$max = 0 + for(i in 1:nrow(normalization_mutation)){ + normalization_mutation$mean[i] = mean(mutation_info[i,], na.rm = T) + normalization_mutation$max[i] = max(mutation_info[i,], na.rm = T) + } + + normalization_mutation = normalization_mutation[normalization_mutation$mean>0,] + ## normalization is done on all mutations, including blacklist mutations + + ## if the mutation has more than 1 positive well in WBCs + normalization_mutation$valid = (normalization_mutation$mutID %in% valid) + ## if the mutation is on the blacklist, 1=not on blacklist, 0=yes + ## normalization_mutation$bl = (!(normalization_mutation$mutID %in% blacklist)) + ## if the mutation should be included for the normalization step, 1=yes, 0=no + ## normalization_mutation$include = (normalization_mutation$valid)*(normalization_mutation$bl) + normalization_mutation$include = (normalization_mutation$valid) + + normalization_mutation$mutType = "" + normalization_mutation$mutType[normalization_mutation$mutID %in% indels1] = "Indel1" + normalization_mutation$mutType[normalization_mutation$mutID %in% indels2] = "Indel2" + normalization_mutation$mutType[normalization_mutation$mutID %in% sbs] = "SBS" + + training$adj1 = training$maf1 + training$adj2 = training$maf2 + training$adj3 = training$maf3 + training$adj4 = training$maf4 + training$adj5 = training$maf5 + training$adj6 = training$maf6 + + + ## calculating 25th percentile of the mean MAFs in training mutations (excluding blacklist) + base = quantile(normalization_mutation$mean[normalization_mutation$include==1],0.25) + ## not distinguish between indels/SBS + base1 = base + base2 = base + base3 = base + + ### distinguish between Indels/SBS (normalized separately) + #base1 = quantile(normalization_mutation$mean[normalization_mutation$include==1 & normalization_mutation$mutType=="SBS"],0.25) + #base2 = quantile(normalization_mutation$mean[normalization_mutation$include==1 & normalization_mutation$mutType=="Indel1"],0.25) + #base3 = quantile(normalization_mutation$mean[normalization_mutation$include==1 & normalization_mutation$mutType=="Indel2"],0.25) + + ## calculating adjusted MAF in normals + for(i in 1:nrow(normalization_mutation)){ + if((normalization_mutation$include[i]==1) & (normalization_mutation$mutType[i]=="SBS")){ + training$adj1[training$mutID==normalization_mutation$mutID[i]] = training$maf1[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + training$adj2[training$mutID==normalization_mutation$mutID[i]] = training$maf2[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + training$adj3[training$mutID==normalization_mutation$mutID[i]] = training$maf3[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + training$adj4[training$mutID==normalization_mutation$mutID[i]] = training$maf4[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + training$adj5[training$mutID==normalization_mutation$mutID[i]] = training$maf5[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + training$adj6[training$mutID==normalization_mutation$mutID[i]] = training$maf6[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + }else if((normalization_mutation$include[i]==1) & (normalization_mutation$mutType[i]=="Indel1")){ + training$adj1[training$mutID==normalization_mutation$mutID[i]] = training$maf1[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + training$adj2[training$mutID==normalization_mutation$mutID[i]] = training$maf2[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + training$adj3[training$mutID==normalization_mutation$mutID[i]] = training$maf3[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + training$adj4[training$mutID==normalization_mutation$mutID[i]] = training$maf4[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + training$adj5[training$mutID==normalization_mutation$mutID[i]] = training$maf5[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + training$adj6[training$mutID==normalization_mutation$mutID[i]] = training$maf6[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + } else if((normalization_mutation$include[i]==1) & (normalization_mutation$mutType[i]=="Indel2")){ + training$adj1[training$mutID==normalization_mutation$mutID[i]] = training$maf1[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + training$adj2[training$mutID==normalization_mutation$mutID[i]] = training$maf2[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + training$adj3[training$mutID==normalization_mutation$mutID[i]] = training$maf3[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + training$adj4[training$mutID==normalization_mutation$mutID[i]] = training$maf4[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + training$adj5[training$mutID==normalization_mutation$mutID[i]] = training$maf5[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + training$adj6[training$mutID==normalization_mutation$mutID[i]] = training$maf6[training$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + } + } + + ## build the distribution based on adjusted MAF in controls + ## no separation between Indel/SBS + ## here training includes all non-matched WBC and normal plasma controls + training1 = training + + ### Separate between Indel/SBS + #training1 = training[training$MutType=="SBS",] + #training2 = training[training$MutType=="Indel" & training$indel_length==1,] + #training3 = training[training$MutType=="Indel" & training$indel_length>=2,] + + data2 = data.frame(perc = unlist(c(training1$adj1,training1$adj2,training1$adj3,training1$adj4,training1$adj5,training1$adj6))) + data2$UID = unlist(c(training1$UID1,training1$UID2,training1$UID3,training1$UID4,training1$UID5,training1$UID6)) + + data2 = data2[!is.na(data2$perc),] + data2$perc = as.numeric(data2$perc) + + s1 = 1000 + each = 1000 + + r1 = data2[data2$UID<s1 & data2$UID>=100,] + r1$perc.log = as.numeric(log10(r1$perc)) + + r2 = data2[data2$UID>=s1 & data2$UID<(s1+each*1),] + r2$perc.log = as.numeric(log10(r2$perc)) + + r3 = data2[data2$UID>=(s1+each*1) & data2$UID<(s1+each*2),] + r3$perc.log = as.numeric(log10(r3$perc)) + + r4 = data2[data2$UID>=(s1+each*2) & data2$UID<(s1+each*3),] + r4$perc.log = as.numeric(log10(r4$perc)) + + r5 = data2[data2$UID>=(s1+each*3) & data2$UID<(s1+each*4),] + r5$perc.log = as.numeric(log10(r5$perc)) + + r6 = data2[data2$UID>=(s1+each*4) & data2$UID<(s1+each*5),] + r6$perc.log = as.numeric(log10(r6$perc)) + + r7 = data2[data2$UID>=(s1+each*5) & data2$UID<(s1+each*6),] + r7$perc.log = as.numeric(log10(r7$perc)) + + r8 = data2[data2$UID>=(s1+each*6) & data2$UID<(s1+each*7),] + r8$perc.log = as.numeric(log10(r8$perc)) + + r9 = data2[data2$UID>=(s1+each*7) & data2$UID<(s1+each*8),] + r9$perc.log = as.numeric(log10(r9$perc)) + + r10 = data2[data2$UID>=(s1+each*8),] + r10$perc.log = as.numeric(log10(r10$perc)) + + d1 = density(r1$perc.log) + d2 = density(r2$perc.log) + d3 = density(r3$perc.log) + d4 = density(r4$perc.log) + d5 = density(r5$perc.log) + d6 = density(r6$perc.log) + d7 = density(r7$perc.log) + d8 = density(r8$perc.log) + d9 = density(r9$perc.log) + d10 = density(r10$perc.log) + + list_func1 = list(d1, d2, d3, d4, d5, d6, d7,d8,d9,d10) + list_data1 = list(r1,r2,r3,r4,r5,r6,r7,r8,r9,r10) + + l1 = length(list_func1) + + + testSet = data_corrected + + testSet = data.frame(testSet) + ## remove matched plasma mutations and WBC mutations + testSet = testSet[!(testSet$remove %in% chip),] + + ## calculate adjusted MAF for all templates + testSet$maf1[is.na(testSet$UID1)] = 0 + testSet$maf2[is.na(testSet$UID2)] = 0 + testSet$maf3[is.na(testSet$UID3)] = 0 + testSet$maf4[is.na(testSet$UID4)] = 0 + testSet$maf5[is.na(testSet$UID5)] = 0 + testSet$maf6[is.na(testSet$UID6)] = 0 + + testSet$maf1[testSet$UID1==0] = 0 + testSet$maf2[testSet$UID2==0] = 0 + testSet$maf3[testSet$UID3==0] = 0 + testSet$maf4[testSet$UID4==0] = 0 + testSet$maf5[testSet$UID5==0] = 0 + testSet$maf6[testSet$UID6==0] = 0 + + testSet$adj1 = testSet$maf1 + testSet$adj2 = testSet$maf2 + testSet$adj3 = testSet$maf3 + testSet$adj4 = testSet$maf4 + testSet$adj5 = testSet$maf5 + testSet$adj6 = testSet$maf6 + + testSet$max = 0 + + ## normalization of mutations in the test set + for(i in 1:nrow(normalization_mutation)){ + testSet$max[testSet$mutID==normalization_mutation$mutID[i]] = normalization_mutation$max[i] + if((normalization_mutation$include[i]==1) & (normalization_mutation$mutType[i]=="SBS")){ + testSet$adj1[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf1[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + testSet$adj2[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf2[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + testSet$adj3[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf3[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + testSet$adj4[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf4[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + testSet$adj5[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf5[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + testSet$adj6[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf6[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base1) + }else if((normalization_mutation$include[i]==1) & (normalization_mutation$mutType[i]=="Indel1")){ + testSet$adj1[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf1[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + testSet$adj2[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf2[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + testSet$adj3[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf3[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + testSet$adj4[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf4[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + testSet$adj5[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf5[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + testSet$adj6[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf6[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base2) + }else if((normalization_mutation$include[i]==1) & (normalization_mutation$mutType[i]=="Indel2")){ + testSet$adj1[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf1[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + testSet$adj2[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf2[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + testSet$adj3[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf3[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + testSet$adj4[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf4[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + testSet$adj5[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf5[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + testSet$adj6[testSet$mutID==normalization_mutation$mutID[i]] = testSet$maf6[testSet$mutID==normalization_mutation$mutID[i]]/(normalization_mutation$mean[i]/base3) + } + } + + testSet$perc1 = log10(testSet$adj1) + testSet$perc2 = log10(testSet$adj2) + testSet$perc3 = log10(testSet$adj3) + testSet$perc4 = log10(testSet$adj4) + testSet$perc5 = log10(testSet$adj5) + testSet$perc6 = log10(testSet$adj6) + + ## cancer templates in cancer distribution + cancerPT = testSet[testSet$PT.Avg.MAF>5 & (testSet$Template %in% plsc.t),] + cancerPT = cancerPT[!is.na(cancerPT$Template),] + cancermaf = data.frame(UID = c(cancerPT$UID1,cancerPT$UID2,cancerPT$UID3,cancerPT$UID4,cancerPT$UID5,cancerPT$UID6), perc = c(cancerPT$adj1,cancerPT$adj2,cancerPT$adj3,cancerPT$adj4,cancerPT$adj5,cancerPT$adj6)) + + data2 = cancermaf + data2 = data2[!is.na(data2$perc),] + data2$perc = as.numeric(data2$perc) + + s1 = 1000 + each = 1000 + + r1 = data2[data2$UID<s1 & data2$UID>=100,] + r1$perc.log = as.numeric(log10(r1$perc)) + + r2 = data2[data2$UID>=s1 & data2$UID<(s1+each*1),] + r2$perc.log = as.numeric(log10(r2$perc)) + + r3 = data2[data2$UID>=(s1+each*1) & data2$UID<(s1+each*2),] + r3$perc.log = as.numeric(log10(r3$perc)) + + r4 = data2[data2$UID>=(s1+each*2) & data2$UID<(s1+each*3),] + r4$perc.log = as.numeric(log10(r4$perc)) + + r5 = data2[data2$UID>=(s1+each*3) & data2$UID<(s1+each*4),] + r5$perc.log = as.numeric(log10(r5$perc)) + + r6 = data2[data2$UID>=(s1+each*4) & data2$UID<(s1+each*5),] + r6$perc.log = as.numeric(log10(r6$perc)) + + r7 = data2[data2$UID>=(s1+each*5) & data2$UID<(s1+each*6),] + r7$perc.log = as.numeric(log10(r7$perc)) + + r8 = data2[data2$UID>=(s1+each*6) & data2$UID<(s1+each*7),] + r8$perc.log = as.numeric(log10(r8$perc)) + + r9 = data2[data2$UID>=(s1+each*7) & data2$UID<(s1+each*8),] + r9$perc.log = as.numeric(log10(r9$perc)) + + r10 = data2[data2$UID>=(s1+each*8),] + r10$perc.log = as.numeric(log10(r10$perc)) + + d1 = density(r1$perc.log) + d2 = density(r2$perc.log) + d3 = density(r3$perc.log) + d4 = density(r4$perc.log) + d5 = density(r5$perc.log) + d6 = density(r6$perc.log) + d7 = density(r7$perc.log) + d8 = density(r8$perc.log) + d9 = density(r9$perc.log) + d10 = density(r10$perc.log) + + + list_func2 = list(d1, d2, d3, d4, d5, d6, d7,d8,d9,d10) + list_data2 = list(r1,r2,r3,r4,r5,r6,r7,r8,r9,r10) + + ## list of data and distribution for normals + list_func =list(list_func1) + list_data = list(list_data1) + + ## list of data and distribution for cancers + list_func.c = list(list_func2) + list_data.c = list(list_data2) + + + testSet$p.value1 = 0.5 + testSet$p.value2 = 0.5 + testSet$p.value3 = 0.5 + testSet$p.value4 = 0.5 + testSet$p.value5 = 0.5 + testSet$p.value6 = 0.5 + + ## adding the UID group numbers + testSet$temp1 = apply(testSet[,c("UID1","UID2","UID3","UID4","UID5","UID6")], 1, function(x) min(ceiling((x[1]-(s1-1))/each)+1,l1)) + testSet$temp2 = apply(testSet[,c("UID1","UID2","UID3","UID4","UID5","UID6")], 1, function(x) min(ceiling((x[2]-(s1-1))/each)+1,l1)) + testSet$temp3 = apply(testSet[,c("UID1","UID2","UID3","UID4","UID5","UID6")], 1, function(x) min(ceiling((x[3]-(s1-1))/each)+1,l1)) + testSet$temp4 = apply(testSet[,c("UID1","UID2","UID3","UID4","UID5","UID6")], 1, function(x) min(ceiling((x[4]-(s1-1))/each)+1,l1)) + testSet$temp5 = apply(testSet[,c("UID1","UID2","UID3","UID4","UID5","UID6")], 1, function(x) min(ceiling((x[5]-(s1-1))/each)+1,l1)) + testSet$temp6 = apply(testSet[,c("UID1","UID2","UID3","UID4","UID5","UID6")], 1, function(x) min(ceiling((x[6]-(s1-1))/each)+1,l1)) + + + testSet$indel_group = 1 + + ## calculate the p values + testSet$adj1_1 = testSet$adj1>0 + testSet$adj2_1 = testSet$adj2>0 + testSet$adj3_1 = testSet$adj3>0 + testSet$adj4_1 = testSet$adj4>0 + testSet$adj5_1 = testSet$adj5>0 + testSet$adj6_1 = testSet$adj6>0 + + ## calculate p values if the well is from normals + for(i in 1:l1){ + testSet$p.value1[testSet$temp1==i & testSet$adj1_1==TRUE] = apply(testSet[testSet$temp1==i & testSet$adj1_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[1]-list_data[[x[7]]][[i]]$perc.log, sd = list_func[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value2[testSet$temp2==i & testSet$adj2_1==TRUE] = apply(testSet[testSet$temp2==i & testSet$adj2_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[2]-list_data[[x[7]]][[i]]$perc.log, sd = list_func[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value3[testSet$temp3==i & testSet$adj3_1==TRUE] = apply(testSet[testSet$temp3==i & testSet$adj3_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[3]-list_data[[x[7]]][[i]]$perc.log, sd = list_func[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value4[testSet$temp4==i & testSet$adj4_1==TRUE] = apply(testSet[testSet$temp4==i & testSet$adj4_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[4]-list_data[[x[7]]][[i]]$perc.log, sd = list_func[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value5[testSet$temp5==i & testSet$adj5_1==TRUE] = apply(testSet[testSet$temp5==i & testSet$adj5_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[5]-list_data[[x[7]]][[i]]$perc.log, sd = list_func[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value6[testSet$temp6==i & testSet$adj6_1==TRUE] = apply(testSet[testSet$temp6==i & testSet$adj6_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[6]-list_data[[x[7]]][[i]]$perc.log, sd = list_func[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + } + + ## setting p values to 1-10^-8 if 0 SM + testSet$p.value1[testSet$adj1_1==FALSE] = 1-10^-8 + testSet$p.value2[testSet$adj2_1==FALSE] = 1-10^-8 + testSet$p.value3[testSet$adj3_1==FALSE] = 1-10^-8 + testSet$p.value4[testSet$adj4_1==FALSE] = 1-10^-8 + testSet$p.value5[testSet$adj5_1==FALSE] = 1-10^-8 + testSet$p.value6[testSet$adj6_1==FALSE] = 1-10^-8 + + ## pvalues for wells with less than 100 UIDs are calculate using binomial, p is the aveMAF from 6 wells + testSet$p.value1[!is.na(testSet$UID1) & testSet$UID1<100] = apply(testSet[!is.na(testSet$UID1) & testSet$UID1<100, c("SM1","maf1","UID1","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value2[!is.na(testSet$UID2) & testSet$UID2<100] = apply(testSet[!is.na(testSet$UID2) & testSet$UID2<100, c("SM2","maf2","UID2","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value3[!is.na(testSet$UID3) & testSet$UID3<100] = apply(testSet[!is.na(testSet$UID3) & testSet$UID3<100, c("SM3","maf3","UID3","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value4[!is.na(testSet$UID4) & testSet$UID4<100] = apply(testSet[!is.na(testSet$UID4) & testSet$UID4<100, c("SM4","maf4","UID4","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value5[!is.na(testSet$UID5) & testSet$UID5<100] = apply(testSet[!is.na(testSet$UID5) & testSet$UID5<100, c("SM5","maf5","UID5","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value6[!is.na(testSet$UID6) & testSet$UID6<100] = apply(testSet[!is.na(testSet$UID6) & testSet$UID6<100, c("SM6","maf6","UID6","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + + ## 0 wells with <100 UIDs are assigned p values 1-10^-8 (theoretically it is 1) + testSet$p.value1[!is.na(testSet$UID1) & testSet$UID1<100 & testSet$SM1==0] = 1-10^-8 + testSet$p.value2[!is.na(testSet$UID2) & testSet$UID2<100 & testSet$SM2==0] = 1-10^-8 + testSet$p.value3[!is.na(testSet$UID3) & testSet$UID3<100 & testSet$SM3==0] = 1-10^-8 + testSet$p.value4[!is.na(testSet$UID4) & testSet$UID4<100 & testSet$SM4==0] = 1-10^-8 + testSet$p.value5[!is.na(testSet$UID5) & testSet$UID5<100 & testSet$SM5==0] = 1-10^-8 + testSet$p.value6[!is.na(testSet$UID6) & testSet$UID6<100 & testSet$SM6==0] = 1-10^-8 + + ## p values for wells with 0 UID are assigned as 0.5 (when inverting it has 0) + testSet$p.value1[testSet$UID1==0] = 0.5 + testSet$p.value2[testSet$UID2==0] = 0.5 + testSet$p.value3[testSet$UID3==0] = 0.5 + testSet$p.value4[testSet$UID4==0] = 0.5 + testSet$p.value5[testSet$UID5==0] = 0.5 + testSet$p.value6[testSet$UID6==0] = 0.5 + + + ### calculating p values if the well is from cancers + testSet$p.value1.c = 0.5 + testSet$p.value2.c = 0.5 + testSet$p.value3.c = 0.5 + testSet$p.value4.c = 0.5 + testSet$p.value5.c = 0.5 + testSet$p.value6.c = 0.5 + + + for(i in 1:l1){ + testSet$p.value1.c[testSet$temp1==i & testSet$adj1_1==TRUE] = apply(testSet[testSet$temp1==i & testSet$adj1_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[1]-list_data.c[[x[7]]][[i]]$perc.log, sd = list_func.c[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value2.c[testSet$temp2==i & testSet$adj2_1==TRUE] = apply(testSet[testSet$temp2==i & testSet$adj2_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[2]-list_data.c[[x[7]]][[i]]$perc.log, sd = list_func.c[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value3.c[testSet$temp3==i & testSet$adj3_1==TRUE] = apply(testSet[testSet$temp3==i & testSet$adj3_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[3]-list_data.c[[x[7]]][[i]]$perc.log, sd = list_func.c[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value4.c[testSet$temp4==i & testSet$adj4_1==TRUE] = apply(testSet[testSet$temp4==i & testSet$adj4_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[4]-list_data.c[[x[7]]][[i]]$perc.log, sd = list_func.c[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value5.c[testSet$temp5==i & testSet$adj5_1==TRUE] = apply(testSet[testSet$temp5==i & testSet$adj5_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[5]-list_data.c[[x[7]]][[i]]$perc.log, sd = list_func.c[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + testSet$p.value6.c[testSet$temp6==i & testSet$adj6_1==TRUE] = apply(testSet[testSet$temp6==i & testSet$adj6_1==TRUE,c("perc1", "perc2","perc3", "perc4","perc5", "perc6","indel_group")], 1, function(x) return(mean(pnorm(x[6]-list_data.c[[x[7]]][[i]]$perc.log, sd = list_func.c[[x[7]]][[i]]$bw, lower.tail=FALSE)))) + } + + + testSet$p.value1.c[testSet$adj1_1==FALSE] = 1-10^-8 + testSet$p.value2.c[testSet$adj2_1==FALSE] = 1-10^-8 + testSet$p.value3.c[testSet$adj3_1==FALSE] = 1-10^-8 + testSet$p.value4.c[testSet$adj4_1==FALSE] = 1-10^-8 + testSet$p.value5.c[testSet$adj5_1==FALSE] = 1-10^-8 + testSet$p.value6.c[testSet$adj6_1==FALSE] = 1-10^-8 + + ## pvalues for wells with less than 100 UIDs are calculate using binomial + testSet$p.value1.c[!is.na(testSet$UID1) & testSet$UID1<100] = apply(testSet[!is.na(testSet$UID1) & testSet$UID1<100, c("SM1","maf1","UID1","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value2.c[!is.na(testSet$UID2) & testSet$UID2<100] = apply(testSet[!is.na(testSet$UID2) & testSet$UID2<100, c("SM2","maf2","UID2","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value3.c[!is.na(testSet$UID3) & testSet$UID3<100] = apply(testSet[!is.na(testSet$UID3) & testSet$UID3<100, c("SM3","maf3","UID3","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value4.c[!is.na(testSet$UID4) & testSet$UID4<100] = apply(testSet[!is.na(testSet$UID4) & testSet$UID4<100, c("SM4","maf4","UID4","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value5.c[!is.na(testSet$UID5) & testSet$UID5<100] = apply(testSet[!is.na(testSet$UID5) & testSet$UID5<100, c("SM5","maf5","UID5","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + testSet$p.value6.c[!is.na(testSet$UID6) & testSet$UID6<100] = apply(testSet[!is.na(testSet$UID6) & testSet$UID6<100, c("SM6","maf6","UID6","aveMAF")], 1, function(x) return(pbinom(max(0,x[1]-1), x[3], x[4],lower.tail = FALSE))) + + ## 0 wells with <100 UIDs are assigned p values 1-10^-8 (theoretically it is 1) + testSet$p.value1.c[!is.na(testSet$UID1) & testSet$UID1<100 & testSet$SM1==0] = 1-10^-8 + testSet$p.value2.c[!is.na(testSet$UID2) & testSet$UID2<100 & testSet$SM2==0] = 1-10^-8 + testSet$p.value3.c[!is.na(testSet$UID3) & testSet$UID3<100 & testSet$SM3==0] = 1-10^-8 + testSet$p.value4.c[!is.na(testSet$UID4) & testSet$UID4<100 & testSet$SM4==0] = 1-10^-8 + testSet$p.value5.c[!is.na(testSet$UID5) & testSet$UID5<100 & testSet$SM5==0] = 1-10^-8 + testSet$p.value6.c[!is.na(testSet$UID6) & testSet$UID6<100 & testSet$SM6==0] = 1-10^-8 + + ## p values for wells with 0 UID are assigned as 0.5 (when inverting it is 0) + testSet$p.value1.c[testSet$UID1==0] = 0.5 + testSet$p.value2.c[testSet$UID2==0] = 0.5 + testSet$p.value3.c[testSet$UID3==0] = 0.5 + testSet$p.value4.c[testSet$UID4==0] = 0.5 + testSet$p.value5.c[testSet$UID5==0] = 0.5 + testSet$p.value6.c[testSet$UID6==0] = 0.5 + + ## ratio p.cancer/p.normal + testSet$r1 = testSet$p.value1.c/testSet$p.value1 + testSet$r2 = testSet$p.value2.c/testSet$p.value2 + testSet$r3 = testSet$p.value3.c/testSet$p.value3 + testSet$r4 = testSet$p.value4.c/testSet$p.value4 + testSet$r5 = testSet$p.value5.c/testSet$p.value5 + testSet$r6 = testSet$p.value6.c/testSet$p.value6 + + + testSet$blacklist = testSet$mutID %in% blacklist + + result = testSet[testSet$TotalUID>=200,] + result = result[!(result$Template %in% c(nlt,"Water","N18",plsc.t)),] + result$Template = as.character(result$Template) + + save.image(file = paste0("PvalueRatio1209cv_5perc_bl_",r,"_", w,"_1006data.rda")) + } + + + + +