|
a |
|
b/EarlyStableCN/determine_allCNbreaks.R |
|
|
1 |
################################################ |
|
|
2 |
# File Name:determine_allCNbreaks.r |
|
|
3 |
# Author: jillianwise |
|
|
4 |
# Mail: jwise7@mgh.harvard.edu |
|
|
5 |
# Created Time: Mon 22 August 2022 9:25 AM EST |
|
|
6 |
################################################# |
|
|
7 |
#' Given multiple samples determine the CN state at all site breaks present in the dataset |
|
|
8 |
#' |
|
|
9 |
#' Given a data frame of CNV segments of DNA with rows representing each unique segment and columns representing the CN states and sample information break each sample into all available segments |
|
|
10 |
#' @chr column of a dataframe that lists the numeric (please reformat X and Y to 23 and 24) chromosome number of the alignment |
|
|
11 |
#' @start column of a dataframe representing the start site of a segment |
|
|
12 |
#' @end column of a dataframe representing the end site of a segment |
|
|
13 |
#' @SegCN The CN state in Numeric CN, either integer or summed raw copy states of both alleles |
|
|
14 |
#' @sample the sample name used in segmentation and expression files |
|
|
15 |
#' @case the case name used for identifying multiple related samples |
|
|
16 |
#' @ploidy the estimated ploidy from any number of CN tools (ASCAT, facets, etc) |
|
|
17 |
#' Given an a directory to write the dataframe to |
|
|
18 |
|
|
|
19 |
|
|
|
20 |
determine_allCNbreaks <- function(df,outdir) { |
|
|
21 |
print("converting CN to log2-1 ploidy adjusted states") |
|
|
22 |
###check numeric SegCN |
|
|
23 |
df$SegCN <- as.numeric(df$SegCN) |
|
|
24 |
###Now create SegCNploidyadjusted |
|
|
25 |
df$tumor_ploidyc <- round(df$ploidy-2) |
|
|
26 |
df$SegCNa <- (log2(df$SegCN-df$tumor_ploidyc)-1) |
|
|
27 |
###Lets replace -inf or other negative errors with -10 a |
|
|
28 |
df$SegCNa <- ifelse(df$SegCNa=="NaN", "-10", paste(df$SegCNa)) |
|
|
29 |
df$SegCNa <- ifelse(df$SegCNa=="-Inf", "-10", paste(df$SegCNa)) |
|
|
30 |
df$SegCNa <- as.numeric(df$SegCNa) |
|
|
31 |
|
|
|
32 |
#Make list of every unique start and end for every chr, regardless of case, so one set of segments for all cases to be compared across# |
|
|
33 |
df2 <- NULL |
|
|
34 |
print("Finding All Break Points") |
|
|
35 |
for (chr in unique(df$chr)) { |
|
|
36 |
print(chr) |
|
|
37 |
x=unique(df$start[df$chr == chr]) |
|
|
38 |
y=unique(df$end[df$chr == chr]) |
|
|
39 |
pos=sort(c(x,y)) |
|
|
40 |
df2 = rbind(df2, data.frame(chr=rep(chr,length(pos)), pos=pos)) |
|
|
41 |
} |
|
|
42 |
|
|
|
43 |
df2$end <- dplyr::lead(df2$pos) |
|
|
44 |
for(i in 1:nrow(df2)){ |
|
|
45 |
df2$end[i] <- ifelse(df2$chr[i] != df2$chr[i+1], 0, df2$end[i])} |
|
|
46 |
df2 <- filter(df2, df2$end !=0) |
|
|
47 |
|
|
|
48 |
###For each patient make a column of their CN state in that segment |
|
|
49 |
require(GenomicRanges) |
|
|
50 |
|
|
|
51 |
all <- makeGRangesFromDataFrame(df2, |
|
|
52 |
keep.extra.columns = TRUE, |
|
|
53 |
ignore.strand = FALSE, |
|
|
54 |
start.field="pos", |
|
|
55 |
end.field="end", |
|
|
56 |
seqnames = "chr") |
|
|
57 |
################### |
|
|
58 |
df3 <- list() |
|
|
59 |
print("Making GRanges Object") |
|
|
60 |
for (i in unique(df$sample)){ |
|
|
61 |
print(i) |
|
|
62 |
tmp <- makeGRangesFromDataFrame(df[df$sample == i,], |
|
|
63 |
keep.extra.columns=TRUE, |
|
|
64 |
ignore.strand=FALSE, |
|
|
65 |
start.field="start", |
|
|
66 |
end.field="end", |
|
|
67 |
seqnames = "chr") |
|
|
68 |
df3[[i]]=tmp |
|
|
69 |
} |
|
|
70 |
|
|
|
71 |
colnames <- names(df3) |
|
|
72 |
df2$idx <- rownames(df2) |
|
|
73 |
df2$idx <- as.numeric(df2$idx) |
|
|
74 |
|
|
|
75 |
#####Run granges overlap between the total segment site list and each patient segmentation granges object |
|
|
76 |
tmp2 <- NULL |
|
|
77 |
tmp <- NULL |
|
|
78 |
print("Finding Copy State For Each Break") |
|
|
79 |
for (i in (1:length(df3))){ |
|
|
80 |
print(i) |
|
|
81 |
over <- findOverlaps(all, df3[[i]], minoverlap=2) |
|
|
82 |
tmp <- all[queryHits(over)] |
|
|
83 |
tmp$SegCNa <- df3[[i]]$SegCNa[subjectHits(over)] |
|
|
84 |
tmp$sample <- df3[[i]]$sample[subjectHits(over)] |
|
|
85 |
tmp <- as.data.frame(tmp, row.names = NULL) |
|
|
86 |
#####Next is to index which results segments match the total segmentation list |
|
|
87 |
for (j in (1:length(tmp$seqnames))){ |
|
|
88 |
tmp2[j] <- which(tmp$seqnames[j]==df2$chr & tmp$start[j]==df2$pos & tmp$end[j]==df2$end) |
|
|
89 |
} |
|
|
90 |
tmp2 <- as.data.frame(tmp2) |
|
|
91 |
tmp2$SegCNa <- as.numeric(tmp$SegCNa) |
|
|
92 |
tmp2 <- tmp2 %>% arrange(tmp2) |
|
|
93 |
#####For each sample make a column and for the index matches add in the correct CNstate |
|
|
94 |
for (z in (1:length(tmp2$tmp2))){ |
|
|
95 |
df2[tmp2$tmp2[z],paste(colnames[i],sep="")] <- tmp2$SegCNa[z]} |
|
|
96 |
tmp <- NULL |
|
|
97 |
tmp2 <- NULL |
|
|
98 |
} |
|
|
99 |
write.table(df2, file = paste(outdir,"allCNbreaks.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE) |
|
|
100 |
return(df2) |
|
|
101 |
} |
|
|
102 |
|
|
|
103 |
|
|
|
104 |
|