Switch to unified view

a b/EarlyStableCN/determine_allCNbreaks.R
1
################################################
2
#  File Name:determine_allCNbreaks.r
3
#  Author: jillianwise
4
#  Mail: jwise7@mgh.harvard.edu
5
#  Created Time: Mon 22 August 2022 9:25 AM EST
6
#################################################
7
#' Given multiple samples determine the CN state at all site breaks present in the dataset
8
#'
9
#' Given a data frame of CNV segments of DNA with rows representing each unique segment and columns representing the CN states and sample information break each sample into all available segments
10
#' @chr column of a dataframe that lists the numeric (please reformat X and Y to 23 and 24) chromosome number of the alignment
11
#' @start column of a dataframe representing the start site of a segment
12
#' @end column of a dataframe representing the end site of a segment
13
#' @SegCN The CN state in Numeric CN, either integer or summed raw copy states of both alleles
14
#' @sample the sample name used in segmentation and expression files
15
#' @case the case name used for identifying multiple related samples
16
#' @ploidy the estimated ploidy from any number of CN tools (ASCAT, facets, etc)
17
#' Given an a directory to write the dataframe to
18
19
20
determine_allCNbreaks <- function(df,outdir) {
21
  print("converting CN to log2-1 ploidy adjusted states")
22
  ###check numeric SegCN
23
df$SegCN <- as.numeric(df$SegCN)
24
  ###Now create SegCNploidyadjusted
25
df$tumor_ploidyc <- round(df$ploidy-2)
26
df$SegCNa <- (log2(df$SegCN-df$tumor_ploidyc)-1)
27
  ###Lets replace -inf or other negative errors with -10 a
28
df$SegCNa <- ifelse(df$SegCNa=="NaN", "-10", paste(df$SegCNa)) 
29
df$SegCNa <- ifelse(df$SegCNa=="-Inf", "-10", paste(df$SegCNa))
30
df$SegCNa <- as.numeric(df$SegCNa)
31
32
#Make list of every unique start and end for every chr, regardless of case, so one set of segments for all cases to be compared across#
33
df2 <- NULL
34
print("Finding All Break Points")
35
for (chr in unique(df$chr)) {
36
  print(chr)
37
  x=unique(df$start[df$chr == chr])
38
  y=unique(df$end[df$chr == chr])
39
  pos=sort(c(x,y))
40
  df2 = rbind(df2, data.frame(chr=rep(chr,length(pos)), pos=pos))
41
}
42
43
df2$end <- dplyr::lead(df2$pos)
44
for(i in 1:nrow(df2)){
45
  df2$end[i] <- ifelse(df2$chr[i] != df2$chr[i+1], 0, df2$end[i])}
46
df2 <- filter(df2, df2$end !=0)
47
48
###For each patient make a column of their CN state in that segment
49
require(GenomicRanges)
50
51
all <- makeGRangesFromDataFrame(df2,
52
                                keep.extra.columns = TRUE,
53
                                ignore.strand = FALSE,
54
                                start.field="pos",
55
                                end.field="end", 
56
                                seqnames = "chr")
57
###################
58
df3 <- list()
59
print("Making GRanges Object")
60
for (i in unique(df$sample)){
61
  print(i)
62
  tmp <- makeGRangesFromDataFrame(df[df$sample == i,],
63
                                  keep.extra.columns=TRUE,
64
                                  ignore.strand=FALSE,
65
                                  start.field="start",
66
                                  end.field="end", 
67
                                  seqnames = "chr") 
68
  df3[[i]]=tmp
69
}
70
71
colnames <- names(df3)
72
df2$idx <- rownames(df2)
73
df2$idx <- as.numeric(df2$idx)
74
75
#####Run granges overlap between the total segment site list and each patient segmentation granges object
76
tmp2 <- NULL
77
tmp <- NULL
78
print("Finding Copy State For Each Break")
79
for (i in (1:length(df3))){
80
  print(i)
81
  over <- findOverlaps(all, df3[[i]], minoverlap=2)
82
  tmp <- all[queryHits(over)]
83
  tmp$SegCNa <- df3[[i]]$SegCNa[subjectHits(over)]
84
  tmp$sample <- df3[[i]]$sample[subjectHits(over)]
85
  tmp <- as.data.frame(tmp, row.names = NULL)
86
  #####Next is to index which results segments match the total segmentation list
87
  for (j in (1:length(tmp$seqnames))){
88
    tmp2[j] <- which(tmp$seqnames[j]==df2$chr & tmp$start[j]==df2$pos & tmp$end[j]==df2$end)
89
  }
90
  tmp2 <- as.data.frame(tmp2)
91
  tmp2$SegCNa <- as.numeric(tmp$SegCNa)
92
  tmp2 <- tmp2 %>% arrange(tmp2)
93
  #####For each sample make a column and for the index matches add in the correct CNstate
94
  for (z in (1:length(tmp2$tmp2))){
95
    df2[tmp2$tmp2[z],paste(colnames[i],sep="")] <- tmp2$SegCNa[z]}
96
  tmp <- NULL
97
  tmp2 <- NULL
98
}
99
write.table(df2, file = paste(outdir,"allCNbreaks.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
100
return(df2)
101
}
102
103
104