Switch to side-by-side view

--- a
+++ b/EarlyStableCN/find_earlystable.R
@@ -0,0 +1,366 @@
+################################################
+#  File Name:find_earlystable.r
+#  Author: jillianwise
+#  Mail: jwise7@mgh.harvard.edu
+#  Created Time: Mon 22 August 2022 9:25 AM EST
+#################################################
+#' Given a list of CN states at all avaliable site breaks determine which CN States are stable throughout sampling/time
+#'
+#' Given a data frame of CNV segments in which all breaks are reported for all cases (see determine_allCNbreaks.R) and a sample information file- output the early stable CN regions for each case
+#' allCNbreaks file
+#' @chr column of a dataframe that lists the numeric (please reformat X and Y to 23 and 24) chromosome number of the alignment
+#' @start column of a dataframe representing the start site of a segment
+#' @end column of a dataframe representing the end site of a segment
+#' @idx determined from determine_allCNbreaks.R this gives a unique id to each break region for each sample
+#' The following columns should be the SegCN adjusted for ploidy values for every sample as a column (log2(df$SegCN-round(df$ploidy-2))-1)
+#' The second file should represent a sample manifest sheet
+#' @sample each samples name, matching those used to find all CN breaks
+#' @case the case name used for identifying multiple related samples
+#' @ploidy the estimated ploidy from any number of CN tools (ASCAT, facets, etc)
+#' Given an output directory
+#' Given the choice of modeling for Stable regions: 
+#' plusminus allows for CN states to range between +- a threshold typically set at 0.3 while allowing for a copy state abberation (+-0.3) at initial biopsy to further increase or decrease.
+#' mahalanobis uses a calculated distribution for each sample and determines the distance of each point from that distribution
+#' linearpc1 calculates distances from the first principal component
+#' Given a threshold (thresh) for plusminus modeling (default=0.3)
+#' Given an option for returning a merged/unionized case segmentation file representing the early stable copy states
+
+
+find_earlystable <- function(df,names,outdir,change="plusminus",thresh=0.3,return_segmentfile=FALSE) {
+  require(dplyr)
+  df2 <- subset(df, select = c("chr", "pos", "end", "idx"))
+  df2[,unique(names$case)] <- NA
+  if(change != "plusminus"){
+    ####I report ploidy adjust CN as (log2(df$SegCN-round(df$ploidy-2))-1)
+    ####To run Mahalanobis and PC1 We need total CN
+    tmp <- df[,-c(1:4)]
+    tmp <- (2^(tmp+1))
+    tmp3 <- tmp 
+    tmp3[,unique(names$sample)] <- NA
+    for (i in colnames(tmp)){
+      tmp2 <- tmp[,i]
+      tmp2 <- tmp2+(names[names$sample==i,3]-2)
+      tmp3[,i] <- tmp2
+    }
+    tmp3 <- cbind(df[,c(1:4)],tmp3)
+    df <- tmp3
+  }
+  
+  ######Check overlap between names files and CN file
+  print("Checking Sample Intersection")
+  rem <- setdiff(names$sample,colnames(df))
+  if (length(rem)!=0){
+    print(paste("removing ",rem, " from sample file",sep=""))
+    names <- dplyr::filter(names, names$sample != rem)
+  }
+  
+  ########################
+  ##Check for any columns with all NAs
+  len <- dim(df)[1]
+  df <- df[,colSums(is.na(df)) != len]
+  ########################
+  
+  #########################
+  ###Remove any unpaired samples
+  print("Checking for Unpaired Samples")
+  tmpnames <- dplyr::filter(names, names$sample %in% colnames(df))
+  tmp <- tmpnames %>% dplyr::group_by(case) %>% dplyr::count()
+  rem2 <- as.data.frame(tmp[tmp$n < 2,1])
+  print(paste("Sample", rem2$case," has no Pairs, Removing"))
+  rem3 <- as.data.frame(names[names$case==rem2$case,1])
+  colnames(rem3) <- c("sample")
+  drop <- c(rem3$sample)
+  df <- df[,!(names(df) %in% drop)]
+  
+  #####################
+  ##Update names after sample removal
+  names <- dplyr::filter(names, names$sample %in% colnames(df))
+  
+  ###For each case (based on the first biopsy) make a loss dataframe and gain (If one uses absolute value a switch to gain or loss would be counted)
+  if(change == "plusminus"){
+    if (is.na(plot)) {
+      plot = F
+    } else {
+      fname = plot
+      plot = T
+    }
+  for (i in unique(names$case)){
+    print(i)
+    namestmp <- dplyr::filter(names, names$case == i)  
+    tmp <- unique(namestmp$sample)
+    tmp <- c(tmp)
+    tmpdf2 <- df %>% dplyr::select(tmp)
+    tmpdf3 <- tmpdf2 %>% dplyr::filter(tmpdf2[,1] > thresh) #This dataframe is all gains above .3 in first biopsy
+    tmpdf4 <- tmpdf2 %>% dplyr::filter(tmpdf2[,1] < -(thresh)) #This dataframe is all losses below  -.3 in first biopsy
+    greatergain <- function(x) all((x >= x[1]))
+    greaterloss <- function(x) all((x <= x[1]))
+    plusminus <- function(x) all((abs(x - x[1]) < thresh))
+    tmpdf2[, "Consensus"] <- apply(tmpdf2, 1, plusminus)
+    tmpdf3[, "GreaterGains"] <- apply(tmpdf3, 1, greatergain)
+    tmpdf4[, "GreaterLoss"] <- apply(tmpdf4, 1, greaterloss)
+    ###add back in the idx
+    tmpdf2$idx <- rownames(tmpdf2)
+    tmpdf5 <- tmpdf2 %>% dplyr::filter(tmpdf2[idx,1] > thresh) %>% select(idx)
+    tmpdf3$idx <-tmpdf5$idx
+    tmpdf6 <- tmpdf2 %>% dplyr::filter(tmpdf2[idx,1] < -(thresh)) %>% select(idx)
+    tmpdf4$idx <-tmpdf6$idx
+    ####plot
+    pldf <- full_join(tmpdf2,tmpdf3)
+    pldf <- full_join(pldf,tmpdf4)
+    pldf$color <- ifelse(pldf$Consensus==TRUE,"black",ifelse(pldf$GreaterGains==TRUE & pldf[,1] > thresh ,"black",ifelse(pldf$GreaterLoss==TRUE & pldf[,1] > -(thresh),"black","red")))
+    pldf$color <- ifelse(is.na(pldf$color),"red",pldf$color)
+    if (plot) {
+      pdf(paste(outdir,i,"plusmins.pdf"))
+    }
+    if (plot) {
+      plot(pldf[,1], pldf[,2],xlim = c(-2, 4),ylim = c(-2, 4),ann=FALSE)
+      abline(h=0); abline(v=0)
+      abline(coef=c(thresh,1), col="green", lty=2)
+      abline(coef=c(-(thresh),1), col="green", lty=2)
+      points(pldf[,1], pldf[,2], col=pldf[,"color"])
+      title(main=paste("Case: ", i, sep=""),xlab=paste(colnames(pldf)[1]," Copy State", sep=""),ylab=paste(colnames(pldf)[2]," Copy State", sep=""))
+      
+    }
+    if (plot) {
+      dev.off()
+    }
+    df2[,i] <- ifelse(tmpdf2$Consensus == TRUE, tmpdf2[,1],NA)
+    for (j in tmpdf3$idx){
+      df2[j,i] <- ifelse(tmpdf3$GreaterGains[tmpdf3$idx==j] == TRUE, tmpdf3[as.numeric(which(tmpdf3$idx==j)),1],df2[j,i])}
+    for (k in tmpdf4$idx){
+      df2[k,i] <- ifelse(tmpdf4$GreaterLoss[tmpdf4$idx==k] == TRUE,  tmpdf4[as.numeric(which(tmpdf4$idx==k)),1],df2[k,i])}
+    tmpdf2 <- NULL
+    tmpdf3 <- NULL
+    tmpdf4 <- NULL
+    tmp <- NULL
+  }
+  write.table(df2, file = paste(outdir,"earlystableregions.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
+  return(df2)
+  }
+  
+  if(change == "mahalanobis"){
+    require(tidyr)
+    require(stats)
+    require(rlang)
+    require(stringr)
+    if (is.na(plot)) {
+      plot = F
+    } else {
+      fname = plot
+      plot = T
+    }
+    ###############
+    #####get_selected_vars and mahalanobis_Distance is written by Alboukadel Kassambra and from the rstatix package (version 0.7.0 )
+    ###########
+    get_selected_vars <- function(x, ..., vars = NULL){
+      if(is_grouped_df(x))
+        x <- x %>% dplyr::ungroup()
+      dot.vars <- rlang::quos(...)
+      if(length(vars) > 0){
+        return(vars)
+      }
+      if (length(dot.vars) == 0) selected <- colnames(x)
+      else selected <- tidyselect::vars_select(names(x), !!! dot.vars)
+      selected %>% as.character()
+    }
+    mahalanobis_distance <- function(data, ...){
+      if(is_grouped_df(data)){
+        results <- data %>%
+          doo(~mahalanobis_distance(.))
+      }
+      data <- data %>% select_if(is.numeric)
+      data <- data %>% drop_na()
+      vars <- data %>% get_selected_vars(...)
+      n.vars <- length(vars)
+      threshold <- stats::qchisq(0.999, n.vars)
+      .data <- data %>% select(!!!syms(vars)) %>% as.matrix()
+      distance <- stats::mahalanobis(.data,center = colMeans(.data),cov = cov(.data))
+      results <- data %>% mutate(mahal.dist = round(distance, 3),is.outlier = .data$mahal.dist > threshold)
+      results
+    }
+    for (i in unique(names$case)){
+      tmpname <- dplyr::filter(names, names$case == i)  
+      tmpname <- unique(tmpname$sample)
+      tmp <- df[,tmpname]
+      ##Adding in idx function back to dataframe
+      tmp$idx <- df2$idx
+      tmp <-mahalanobis_distance(tmp, -idx) 
+      tmp <- tmp %>% dplyr::filter(is.outlier == TRUE)
+      write.table(tmp, paste(outdir,as.character(i),'mahalanobisremovedegments.txt',sep=''), append = FALSE, quote = FALSE, sep = "\t",
+                  eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE) 
+      tmp <- NULL
+      tmpname <- NULL
+    }
+    dfc <- df2[1:4]
+    for (i in unique(names$case)){
+      tmpname <- dplyr::filter(names, names$case == i)  
+      tmpname <- unique(tmpname$sample)
+      tmp <- df[,tmpname]
+      ##Adding in idx function back to dataframe
+      tmp$idx <- df2$idx
+      tmp <- mahalanobis_distance(tmp, -idx) 
+      tmp$color <- ifelse(tmp$is.outlier == "TRUE", "red", "black")
+      if (plot) {
+        pdf(paste(outdir,i,"mahal.pdf"))
+      }
+      if (plot) {
+        plot(tmp[,1], tmp[,2],xlim = c(-2, 4),ylim = c(-2, 4),ann=FALSE)
+        abline(h=0); abline(v=0)
+        points(tmp[,1], tmp[,2], col=tmp[,"color"])
+        title(main=paste("Case: ", i, sep=""),xlab=paste(colnames(tmp)[1]," Copy State", sep=""),ylab=paste(colnames(tmp)[2]," Copy State", sep=""))
+      }
+      if (plot) {
+        dev.off()
+      }
+      tmp <- tmp %>% dplyr::filter(is.outlier == FALSE)
+      tmp <- as.data.frame(tmp)
+      for (j in 1:length(dfc$idx)){
+        dfc[j, i] <- ifelse(any(tmp$idx == j) != 0, tmp[which(tmp$idx == j),1], NA)
+      }
+      tmp <- NULL
+      tmpname <- NULL
+    }
+    
+    write.table(dfc, file = paste(outdir,"earlystableregions_mahal.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
+    return(dfc)
+  }
+  
+  if(change == "linearpc1"){
+    pcdist = function(x, outlier=0.9, plot=F, id=NA) {
+      xc = as.matrix(x)
+      for (j in 1:ncol(x)) {
+        xc[,j] = x[,j] - mean(x[,j], na.rm=T)
+      }
+      udv = svd(xc)
+      d = rep(NA, nrow(xc))
+      for (i in 1:nrow(xc)) {
+        d[i] = sqrt(sum((xc[i,] - udv$v[,1]*sum(xc[i,]*udv$v[,1]))^2))
+      }
+      prop.var = udv$d[1]^2/sum(udv$d^2)
+      if (plot) {
+        plot(xc[,1], xc[,2])
+        abline(h=0); abline(v=0)
+        dydx = udv$v[2,1]/udv$v[1,1]
+        abline(coef=c(0,dydx), col="green")
+        thresh = quantile(d, outlier)
+        ok = d > thresh
+        d0 = thresh/(cos(atan(dydx)))
+        print(d0)
+        print(dydx)
+        abline(coef=c(d0,dydx), col="green", lty=2)
+        abline(coef=c(-d0,dydx), col="green", lty=2)
+        points(xc[ok,1], xc[ok,2], col="red")
+        title(paste("Case: ", id, " (n=", ncol(x),"; PC1=", round(prop.var,2), ")", sep=""))
+      }
+      list(d=d, prop.var=prop.var, thresh=thresh)
+    }
+    
+    # Distance function wrapper
+    run_pcdist = function(df, groups, skip=c(1,2,3,4), outlier=0.9, plot=NA) {
+      if (is.na(plot)) {
+        plot = F
+      } else {
+        fname = plot
+        plot = T
+      }
+      annot = df[,skip,drop=F]
+      data = df[,-skip]
+      groups = as.character(groups)
+      ugroup = unique(groups)
+      y = as.data.frame(matrix(NA, nrow(data), length(ugroup)))
+      z = data.frame(matrix(NA,1,length(ugroup)))
+      out = data.frame(matrix(NA, nrow(data), length(ugroup)))
+      names(z) = ugroup
+      names(y) = ugroup
+      names(out) = ugroup
+      if (plot) {
+        pdf(fname)
+      }
+      for (e in ugroup) {
+        ok = apply(data[,which(groups==e)], 1, function(x) all(!is.na(x)))
+        res = pcdist(data[ok,which(groups==e)], outlier=outlier, plot=plot, id=e)
+        y[ok,e] = res$d
+        z[e] = res$prop.var
+        out[ok,e] = res$d > res$thresh
+      }
+      if (plot) {
+        dev.off()
+      }
+      res = cbind(annot,y)
+      list(table=res, prop.var=z, outlier=out)
+    }
+    groups=names$case
+    res = run_pcdist(df, groups, plot=paste(outdir,"pcdis_Figure1.pdf",sep=""))
+    write.table(res, file = paste(outdir,"earlystableregions_predictions_pcdist.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
+    #######Here I want to resturn a table of early stable CN regions based on these results 
+    res_f <- as.data.frame(df2$idx)
+    colnames(res_f) <- c("idx")
+    print("combining results")
+    for (k in unique(groups)){
+      tmpname <- dplyr::filter(names, names$case == k)  
+      tmpname <- unique(tmpname$sample)
+      tmp <- df[,tmpname]
+      length_f <- length(tmp)
+      ##Adding in idx function back to dataframe
+      #tmp$idx <- df2$idx
+      ###filter outlier
+      o <- res$outlier[,k]
+      for (j in 1:length(o)){
+        for (l in 1:length_f){
+      tmp[j,l] <- ifelse(o[j] == TRUE,NA,tmp[j,l])
+      }}
+      res_f <- cbind(tmp,res_f)
+      print(k)
+      }
+    res_f <- left_join(df2[,c(1:4)],res_f)
+    write.table(res_f, file = paste(outdir,"earlystableregions_pcdist.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
+    return(res_f)
+  }
+  
+  if(return_segmentfile == TRUE){
+    #####next lets turn it back into some resemblance of segment files
+    ###Each row is a segment per sample
+    datalist = list()
+    for (i in 5:length(df2)){
+      tmp <- df2[,c(1:4,i)]
+      j=paste(colnames(tmp[5]))
+      tmp <- tmp %>% dplyr::filter(!is.na(tmp[j]))
+      #####fill in first chr and start
+      ## Find the changes in chr or P1.
+      tmp$diff <- c(NA, (tmp[2:nrow(tmp), 'chr'] != tmp[1:(nrow(tmp) - 1), 'chr']) |
+                      (tmp[2:nrow(tmp), j] != tmp[1:(nrow(tmp) - 1), j]))
+      ## Assign a run number to the rows that increments only with the changes found
+      ## above.
+      tmp$runNum <- NA
+      runNum <- 1
+      tmp[1, 'runNum'] <- runNum
+      for (ii in 2:nrow(tmp)) {
+        if (tmp[ii, 'diff']) runNum <- runNum + 1
+        tmp[ii, 'runNum'] <- runNum
+      }
+      ##get rid of factors
+      tmp$chr <- as.numeric(tmp$chr)
+      ## Collect data from each run number to a single row.
+      tmp2 <- data.frame(chr = tapply(tmp[, 'chr'], tmp[, 'runNum'], min),
+                         pos = tapply(tmp[, 'pos'], tmp[, 'runNum'], min),
+                         end = tapply(tmp[, 'end'], tmp[, 'runNum'], max),
+                         idx = tapply(tmp[, 'idx'], tmp[, 'runNum'], min),
+                         case = tapply(tmp[, j], tmp[, 'runNum'], min),
+                         runNum = tapply(tmp[, 'runNum'], tmp[, 'runNum'], min),
+                         stringsAsFactors = FALSE)
+      tmp2 <- tmp2[order(tmp2[, 'runNum']), , drop = FALSE]
+      ##get rid of idx and run num
+      tmp2$runNum <- NULL
+      tmp2$idx <- NULL
+      tmp2$Sample <- j
+      colnames(tmp2) <- c("Chromosome", "Start", "End","SegCNa", "Sample")
+      datalist[[i]] <- tmp2
+      tmp <- NULL
+      tmp2 <- NULL
+    }
+    concensus_seg = do.call(rbind, datalist)
+    return(concensus_seg)
+    write.table(concensus_seg, file = paste(outdir,"earlystableregions_segmentationfile.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
+  }
+}
+  
\ No newline at end of file