--- a +++ b/EarlyStableCN/determine_allCNbreaks.R @@ -0,0 +1,104 @@ +################################################ +# File Name:determine_allCNbreaks.r +# Author: jillianwise +# Mail: jwise7@mgh.harvard.edu +# Created Time: Mon 22 August 2022 9:25 AM EST +################################################# +#' Given multiple samples determine the CN state at all site breaks present in the dataset +#' +#' Given a data frame of CNV segments of DNA with rows representing each unique segment and columns representing the CN states and sample information break each sample into all available segments +#' @chr column of a dataframe that lists the numeric (please reformat X and Y to 23 and 24) chromosome number of the alignment +#' @start column of a dataframe representing the start site of a segment +#' @end column of a dataframe representing the end site of a segment +#' @SegCN The CN state in Numeric CN, either integer or summed raw copy states of both alleles +#' @sample the sample name used in segmentation and expression files +#' @case the case name used for identifying multiple related samples +#' @ploidy the estimated ploidy from any number of CN tools (ASCAT, facets, etc) +#' Given an a directory to write the dataframe to + + +determine_allCNbreaks <- function(df,outdir) { + print("converting CN to log2-1 ploidy adjusted states") + ###check numeric SegCN +df$SegCN <- as.numeric(df$SegCN) + ###Now create SegCNploidyadjusted +df$tumor_ploidyc <- round(df$ploidy-2) +df$SegCNa <- (log2(df$SegCN-df$tumor_ploidyc)-1) + ###Lets replace -inf or other negative errors with -10 a +df$SegCNa <- ifelse(df$SegCNa=="NaN", "-10", paste(df$SegCNa)) +df$SegCNa <- ifelse(df$SegCNa=="-Inf", "-10", paste(df$SegCNa)) +df$SegCNa <- as.numeric(df$SegCNa) + +#Make list of every unique start and end for every chr, regardless of case, so one set of segments for all cases to be compared across# +df2 <- NULL +print("Finding All Break Points") +for (chr in unique(df$chr)) { + print(chr) + x=unique(df$start[df$chr == chr]) + y=unique(df$end[df$chr == chr]) + pos=sort(c(x,y)) + df2 = rbind(df2, data.frame(chr=rep(chr,length(pos)), pos=pos)) +} + +df2$end <- dplyr::lead(df2$pos) +for(i in 1:nrow(df2)){ + df2$end[i] <- ifelse(df2$chr[i] != df2$chr[i+1], 0, df2$end[i])} +df2 <- filter(df2, df2$end !=0) + +###For each patient make a column of their CN state in that segment +require(GenomicRanges) + +all <- makeGRangesFromDataFrame(df2, + keep.extra.columns = TRUE, + ignore.strand = FALSE, + start.field="pos", + end.field="end", + seqnames = "chr") +################### +df3 <- list() +print("Making GRanges Object") +for (i in unique(df$sample)){ + print(i) + tmp <- makeGRangesFromDataFrame(df[df$sample == i,], + keep.extra.columns=TRUE, + ignore.strand=FALSE, + start.field="start", + end.field="end", + seqnames = "chr") + df3[[i]]=tmp +} + +colnames <- names(df3) +df2$idx <- rownames(df2) +df2$idx <- as.numeric(df2$idx) + +#####Run granges overlap between the total segment site list and each patient segmentation granges object +tmp2 <- NULL +tmp <- NULL +print("Finding Copy State For Each Break") +for (i in (1:length(df3))){ + print(i) + over <- findOverlaps(all, df3[[i]], minoverlap=2) + tmp <- all[queryHits(over)] + tmp$SegCNa <- df3[[i]]$SegCNa[subjectHits(over)] + tmp$sample <- df3[[i]]$sample[subjectHits(over)] + tmp <- as.data.frame(tmp, row.names = NULL) + #####Next is to index which results segments match the total segmentation list + for (j in (1:length(tmp$seqnames))){ + tmp2[j] <- which(tmp$seqnames[j]==df2$chr & tmp$start[j]==df2$pos & tmp$end[j]==df2$end) + } + tmp2 <- as.data.frame(tmp2) + tmp2$SegCNa <- as.numeric(tmp$SegCNa) + tmp2 <- tmp2 %>% arrange(tmp2) + #####For each sample make a column and for the index matches add in the correct CNstate + for (z in (1:length(tmp2$tmp2))){ + df2[tmp2$tmp2[z],paste(colnames[i],sep="")] <- tmp2$SegCNa[z]} + tmp <- NULL + tmp2 <- NULL +} +write.table(df2, file = paste(outdir,"allCNbreaks.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE) +return(df2) +} + + +