Switch to unified view

a b/EarlyStableCN/find_earlystable.R
1
################################################
2
#  File Name:find_earlystable.r
3
#  Author: jillianwise
4
#  Mail: jwise7@mgh.harvard.edu
5
#  Created Time: Mon 22 August 2022 9:25 AM EST
6
#################################################
7
#' Given a list of CN states at all avaliable site breaks determine which CN States are stable throughout sampling/time
8
#'
9
#' Given a data frame of CNV segments in which all breaks are reported for all cases (see determine_allCNbreaks.R) and a sample information file- output the early stable CN regions for each case
10
#' allCNbreaks file
11
#' @chr column of a dataframe that lists the numeric (please reformat X and Y to 23 and 24) chromosome number of the alignment
12
#' @start column of a dataframe representing the start site of a segment
13
#' @end column of a dataframe representing the end site of a segment
14
#' @idx determined from determine_allCNbreaks.R this gives a unique id to each break region for each sample
15
#' The following columns should be the SegCN adjusted for ploidy values for every sample as a column (log2(df$SegCN-round(df$ploidy-2))-1)
16
#' The second file should represent a sample manifest sheet
17
#' @sample each samples name, matching those used to find all CN breaks
18
#' @case the case name used for identifying multiple related samples
19
#' @ploidy the estimated ploidy from any number of CN tools (ASCAT, facets, etc)
20
#' Given an output directory
21
#' Given the choice of modeling for Stable regions: 
22
#' plusminus allows for CN states to range between +- a threshold typically set at 0.3 while allowing for a copy state abberation (+-0.3) at initial biopsy to further increase or decrease.
23
#' mahalanobis uses a calculated distribution for each sample and determines the distance of each point from that distribution
24
#' linearpc1 calculates distances from the first principal component
25
#' Given a threshold (thresh) for plusminus modeling (default=0.3)
26
#' Given an option for returning a merged/unionized case segmentation file representing the early stable copy states
27
28
29
find_earlystable <- function(df,names,outdir,change="plusminus",thresh=0.3,return_segmentfile=FALSE) {
30
  require(dplyr)
31
  df2 <- subset(df, select = c("chr", "pos", "end", "idx"))
32
  df2[,unique(names$case)] <- NA
33
  if(change != "plusminus"){
34
    ####I report ploidy adjust CN as (log2(df$SegCN-round(df$ploidy-2))-1)
35
    ####To run Mahalanobis and PC1 We need total CN
36
    tmp <- df[,-c(1:4)]
37
    tmp <- (2^(tmp+1))
38
    tmp3 <- tmp 
39
    tmp3[,unique(names$sample)] <- NA
40
    for (i in colnames(tmp)){
41
      tmp2 <- tmp[,i]
42
      tmp2 <- tmp2+(names[names$sample==i,3]-2)
43
      tmp3[,i] <- tmp2
44
    }
45
    tmp3 <- cbind(df[,c(1:4)],tmp3)
46
    df <- tmp3
47
  }
48
  
49
  ######Check overlap between names files and CN file
50
  print("Checking Sample Intersection")
51
  rem <- setdiff(names$sample,colnames(df))
52
  if (length(rem)!=0){
53
    print(paste("removing ",rem, " from sample file",sep=""))
54
    names <- dplyr::filter(names, names$sample != rem)
55
  }
56
  
57
  ########################
58
  ##Check for any columns with all NAs
59
  len <- dim(df)[1]
60
  df <- df[,colSums(is.na(df)) != len]
61
  ########################
62
  
63
  #########################
64
  ###Remove any unpaired samples
65
  print("Checking for Unpaired Samples")
66
  tmpnames <- dplyr::filter(names, names$sample %in% colnames(df))
67
  tmp <- tmpnames %>% dplyr::group_by(case) %>% dplyr::count()
68
  rem2 <- as.data.frame(tmp[tmp$n < 2,1])
69
  print(paste("Sample", rem2$case," has no Pairs, Removing"))
70
  rem3 <- as.data.frame(names[names$case==rem2$case,1])
71
  colnames(rem3) <- c("sample")
72
  drop <- c(rem3$sample)
73
  df <- df[,!(names(df) %in% drop)]
74
  
75
  #####################
76
  ##Update names after sample removal
77
  names <- dplyr::filter(names, names$sample %in% colnames(df))
78
  
79
  ###For each case (based on the first biopsy) make a loss dataframe and gain (If one uses absolute value a switch to gain or loss would be counted)
80
  if(change == "plusminus"){
81
    if (is.na(plot)) {
82
      plot = F
83
    } else {
84
      fname = plot
85
      plot = T
86
    }
87
  for (i in unique(names$case)){
88
    print(i)
89
    namestmp <- dplyr::filter(names, names$case == i)  
90
    tmp <- unique(namestmp$sample)
91
    tmp <- c(tmp)
92
    tmpdf2 <- df %>% dplyr::select(tmp)
93
    tmpdf3 <- tmpdf2 %>% dplyr::filter(tmpdf2[,1] > thresh) #This dataframe is all gains above .3 in first biopsy
94
    tmpdf4 <- tmpdf2 %>% dplyr::filter(tmpdf2[,1] < -(thresh)) #This dataframe is all losses below  -.3 in first biopsy
95
    greatergain <- function(x) all((x >= x[1]))
96
    greaterloss <- function(x) all((x <= x[1]))
97
    plusminus <- function(x) all((abs(x - x[1]) < thresh))
98
    tmpdf2[, "Consensus"] <- apply(tmpdf2, 1, plusminus)
99
    tmpdf3[, "GreaterGains"] <- apply(tmpdf3, 1, greatergain)
100
    tmpdf4[, "GreaterLoss"] <- apply(tmpdf4, 1, greaterloss)
101
    ###add back in the idx
102
    tmpdf2$idx <- rownames(tmpdf2)
103
    tmpdf5 <- tmpdf2 %>% dplyr::filter(tmpdf2[idx,1] > thresh) %>% select(idx)
104
    tmpdf3$idx <-tmpdf5$idx
105
    tmpdf6 <- tmpdf2 %>% dplyr::filter(tmpdf2[idx,1] < -(thresh)) %>% select(idx)
106
    tmpdf4$idx <-tmpdf6$idx
107
    ####plot
108
    pldf <- full_join(tmpdf2,tmpdf3)
109
    pldf <- full_join(pldf,tmpdf4)
110
    pldf$color <- ifelse(pldf$Consensus==TRUE,"black",ifelse(pldf$GreaterGains==TRUE & pldf[,1] > thresh ,"black",ifelse(pldf$GreaterLoss==TRUE & pldf[,1] > -(thresh),"black","red")))
111
    pldf$color <- ifelse(is.na(pldf$color),"red",pldf$color)
112
    if (plot) {
113
      pdf(paste(outdir,i,"plusmins.pdf"))
114
    }
115
    if (plot) {
116
      plot(pldf[,1], pldf[,2],xlim = c(-2, 4),ylim = c(-2, 4),ann=FALSE)
117
      abline(h=0); abline(v=0)
118
      abline(coef=c(thresh,1), col="green", lty=2)
119
      abline(coef=c(-(thresh),1), col="green", lty=2)
120
      points(pldf[,1], pldf[,2], col=pldf[,"color"])
121
      title(main=paste("Case: ", i, sep=""),xlab=paste(colnames(pldf)[1]," Copy State", sep=""),ylab=paste(colnames(pldf)[2]," Copy State", sep=""))
122
      
123
    }
124
    if (plot) {
125
      dev.off()
126
    }
127
    df2[,i] <- ifelse(tmpdf2$Consensus == TRUE, tmpdf2[,1],NA)
128
    for (j in tmpdf3$idx){
129
      df2[j,i] <- ifelse(tmpdf3$GreaterGains[tmpdf3$idx==j] == TRUE, tmpdf3[as.numeric(which(tmpdf3$idx==j)),1],df2[j,i])}
130
    for (k in tmpdf4$idx){
131
      df2[k,i] <- ifelse(tmpdf4$GreaterLoss[tmpdf4$idx==k] == TRUE,  tmpdf4[as.numeric(which(tmpdf4$idx==k)),1],df2[k,i])}
132
    tmpdf2 <- NULL
133
    tmpdf3 <- NULL
134
    tmpdf4 <- NULL
135
    tmp <- NULL
136
  }
137
  write.table(df2, file = paste(outdir,"earlystableregions.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
138
  return(df2)
139
  }
140
  
141
  if(change == "mahalanobis"){
142
    require(tidyr)
143
    require(stats)
144
    require(rlang)
145
    require(stringr)
146
    if (is.na(plot)) {
147
      plot = F
148
    } else {
149
      fname = plot
150
      plot = T
151
    }
152
    ###############
153
    #####get_selected_vars and mahalanobis_Distance is written by Alboukadel Kassambra and from the rstatix package (version 0.7.0 )
154
    ###########
155
    get_selected_vars <- function(x, ..., vars = NULL){
156
      if(is_grouped_df(x))
157
        x <- x %>% dplyr::ungroup()
158
      dot.vars <- rlang::quos(...)
159
      if(length(vars) > 0){
160
        return(vars)
161
      }
162
      if (length(dot.vars) == 0) selected <- colnames(x)
163
      else selected <- tidyselect::vars_select(names(x), !!! dot.vars)
164
      selected %>% as.character()
165
    }
166
    mahalanobis_distance <- function(data, ...){
167
      if(is_grouped_df(data)){
168
        results <- data %>%
169
          doo(~mahalanobis_distance(.))
170
      }
171
      data <- data %>% select_if(is.numeric)
172
      data <- data %>% drop_na()
173
      vars <- data %>% get_selected_vars(...)
174
      n.vars <- length(vars)
175
      threshold <- stats::qchisq(0.999, n.vars)
176
      .data <- data %>% select(!!!syms(vars)) %>% as.matrix()
177
      distance <- stats::mahalanobis(.data,center = colMeans(.data),cov = cov(.data))
178
      results <- data %>% mutate(mahal.dist = round(distance, 3),is.outlier = .data$mahal.dist > threshold)
179
      results
180
    }
181
    for (i in unique(names$case)){
182
      tmpname <- dplyr::filter(names, names$case == i)  
183
      tmpname <- unique(tmpname$sample)
184
      tmp <- df[,tmpname]
185
      ##Adding in idx function back to dataframe
186
      tmp$idx <- df2$idx
187
      tmp <-mahalanobis_distance(tmp, -idx) 
188
      tmp <- tmp %>% dplyr::filter(is.outlier == TRUE)
189
      write.table(tmp, paste(outdir,as.character(i),'mahalanobisremovedegments.txt',sep=''), append = FALSE, quote = FALSE, sep = "\t",
190
                  eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE) 
191
      tmp <- NULL
192
      tmpname <- NULL
193
    }
194
    dfc <- df2[1:4]
195
    for (i in unique(names$case)){
196
      tmpname <- dplyr::filter(names, names$case == i)  
197
      tmpname <- unique(tmpname$sample)
198
      tmp <- df[,tmpname]
199
      ##Adding in idx function back to dataframe
200
      tmp$idx <- df2$idx
201
      tmp <- mahalanobis_distance(tmp, -idx) 
202
      tmp$color <- ifelse(tmp$is.outlier == "TRUE", "red", "black")
203
      if (plot) {
204
        pdf(paste(outdir,i,"mahal.pdf"))
205
      }
206
      if (plot) {
207
        plot(tmp[,1], tmp[,2],xlim = c(-2, 4),ylim = c(-2, 4),ann=FALSE)
208
        abline(h=0); abline(v=0)
209
        points(tmp[,1], tmp[,2], col=tmp[,"color"])
210
        title(main=paste("Case: ", i, sep=""),xlab=paste(colnames(tmp)[1]," Copy State", sep=""),ylab=paste(colnames(tmp)[2]," Copy State", sep=""))
211
      }
212
      if (plot) {
213
        dev.off()
214
      }
215
      tmp <- tmp %>% dplyr::filter(is.outlier == FALSE)
216
      tmp <- as.data.frame(tmp)
217
      for (j in 1:length(dfc$idx)){
218
        dfc[j, i] <- ifelse(any(tmp$idx == j) != 0, tmp[which(tmp$idx == j),1], NA)
219
      }
220
      tmp <- NULL
221
      tmpname <- NULL
222
    }
223
    
224
    write.table(dfc, file = paste(outdir,"earlystableregions_mahal.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
225
    return(dfc)
226
  }
227
  
228
  if(change == "linearpc1"){
229
    pcdist = function(x, outlier=0.9, plot=F, id=NA) {
230
      xc = as.matrix(x)
231
      for (j in 1:ncol(x)) {
232
        xc[,j] = x[,j] - mean(x[,j], na.rm=T)
233
      }
234
      udv = svd(xc)
235
      d = rep(NA, nrow(xc))
236
      for (i in 1:nrow(xc)) {
237
        d[i] = sqrt(sum((xc[i,] - udv$v[,1]*sum(xc[i,]*udv$v[,1]))^2))
238
      }
239
      prop.var = udv$d[1]^2/sum(udv$d^2)
240
      if (plot) {
241
        plot(xc[,1], xc[,2])
242
        abline(h=0); abline(v=0)
243
        dydx = udv$v[2,1]/udv$v[1,1]
244
        abline(coef=c(0,dydx), col="green")
245
        thresh = quantile(d, outlier)
246
        ok = d > thresh
247
        d0 = thresh/(cos(atan(dydx)))
248
        print(d0)
249
        print(dydx)
250
        abline(coef=c(d0,dydx), col="green", lty=2)
251
        abline(coef=c(-d0,dydx), col="green", lty=2)
252
        points(xc[ok,1], xc[ok,2], col="red")
253
        title(paste("Case: ", id, " (n=", ncol(x),"; PC1=", round(prop.var,2), ")", sep=""))
254
      }
255
      list(d=d, prop.var=prop.var, thresh=thresh)
256
    }
257
    
258
    # Distance function wrapper
259
    run_pcdist = function(df, groups, skip=c(1,2,3,4), outlier=0.9, plot=NA) {
260
      if (is.na(plot)) {
261
        plot = F
262
      } else {
263
        fname = plot
264
        plot = T
265
      }
266
      annot = df[,skip,drop=F]
267
      data = df[,-skip]
268
      groups = as.character(groups)
269
      ugroup = unique(groups)
270
      y = as.data.frame(matrix(NA, nrow(data), length(ugroup)))
271
      z = data.frame(matrix(NA,1,length(ugroup)))
272
      out = data.frame(matrix(NA, nrow(data), length(ugroup)))
273
      names(z) = ugroup
274
      names(y) = ugroup
275
      names(out) = ugroup
276
      if (plot) {
277
        pdf(fname)
278
      }
279
      for (e in ugroup) {
280
        ok = apply(data[,which(groups==e)], 1, function(x) all(!is.na(x)))
281
        res = pcdist(data[ok,which(groups==e)], outlier=outlier, plot=plot, id=e)
282
        y[ok,e] = res$d
283
        z[e] = res$prop.var
284
        out[ok,e] = res$d > res$thresh
285
      }
286
      if (plot) {
287
        dev.off()
288
      }
289
      res = cbind(annot,y)
290
      list(table=res, prop.var=z, outlier=out)
291
    }
292
    groups=names$case
293
    res = run_pcdist(df, groups, plot=paste(outdir,"pcdis_Figure1.pdf",sep=""))
294
    write.table(res, file = paste(outdir,"earlystableregions_predictions_pcdist.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
295
    #######Here I want to resturn a table of early stable CN regions based on these results 
296
    res_f <- as.data.frame(df2$idx)
297
    colnames(res_f) <- c("idx")
298
    print("combining results")
299
    for (k in unique(groups)){
300
      tmpname <- dplyr::filter(names, names$case == k)  
301
      tmpname <- unique(tmpname$sample)
302
      tmp <- df[,tmpname]
303
      length_f <- length(tmp)
304
      ##Adding in idx function back to dataframe
305
      #tmp$idx <- df2$idx
306
      ###filter outlier
307
      o <- res$outlier[,k]
308
      for (j in 1:length(o)){
309
        for (l in 1:length_f){
310
      tmp[j,l] <- ifelse(o[j] == TRUE,NA,tmp[j,l])
311
      }}
312
      res_f <- cbind(tmp,res_f)
313
      print(k)
314
      }
315
    res_f <- left_join(df2[,c(1:4)],res_f)
316
    write.table(res_f, file = paste(outdir,"earlystableregions_pcdist.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
317
    return(res_f)
318
  }
319
  
320
  if(return_segmentfile == TRUE){
321
    #####next lets turn it back into some resemblance of segment files
322
    ###Each row is a segment per sample
323
    datalist = list()
324
    for (i in 5:length(df2)){
325
      tmp <- df2[,c(1:4,i)]
326
      j=paste(colnames(tmp[5]))
327
      tmp <- tmp %>% dplyr::filter(!is.na(tmp[j]))
328
      #####fill in first chr and start
329
      ## Find the changes in chr or P1.
330
      tmp$diff <- c(NA, (tmp[2:nrow(tmp), 'chr'] != tmp[1:(nrow(tmp) - 1), 'chr']) |
331
                      (tmp[2:nrow(tmp), j] != tmp[1:(nrow(tmp) - 1), j]))
332
      ## Assign a run number to the rows that increments only with the changes found
333
      ## above.
334
      tmp$runNum <- NA
335
      runNum <- 1
336
      tmp[1, 'runNum'] <- runNum
337
      for (ii in 2:nrow(tmp)) {
338
        if (tmp[ii, 'diff']) runNum <- runNum + 1
339
        tmp[ii, 'runNum'] <- runNum
340
      }
341
      ##get rid of factors
342
      tmp$chr <- as.numeric(tmp$chr)
343
      ## Collect data from each run number to a single row.
344
      tmp2 <- data.frame(chr = tapply(tmp[, 'chr'], tmp[, 'runNum'], min),
345
                         pos = tapply(tmp[, 'pos'], tmp[, 'runNum'], min),
346
                         end = tapply(tmp[, 'end'], tmp[, 'runNum'], max),
347
                         idx = tapply(tmp[, 'idx'], tmp[, 'runNum'], min),
348
                         case = tapply(tmp[, j], tmp[, 'runNum'], min),
349
                         runNum = tapply(tmp[, 'runNum'], tmp[, 'runNum'], min),
350
                         stringsAsFactors = FALSE)
351
      tmp2 <- tmp2[order(tmp2[, 'runNum']), , drop = FALSE]
352
      ##get rid of idx and run num
353
      tmp2$runNum <- NULL
354
      tmp2$idx <- NULL
355
      tmp2$Sample <- j
356
      colnames(tmp2) <- c("Chromosome", "Start", "End","SegCNa", "Sample")
357
      datalist[[i]] <- tmp2
358
      tmp <- NULL
359
      tmp2 <- NULL
360
    }
361
    concensus_seg = do.call(rbind, datalist)
362
    return(concensus_seg)
363
    write.table(concensus_seg, file = paste(outdir,"earlystableregions_segmentationfile.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
364
  }
365
}
366