LymphomaFLserial / Git / [4ff317] /EarlyStableCN/find

Models:
WandaB/
LymphomaFLserial
Downloads: 1
[4ff317]: / EarlyStableCN / find_earlystable.R
History
Download this file
366 lines (352 with data), 15.2 kB

################################################
#  File Name:find_earlystable.r
#  Author: jillianwise
#  Mail: jwise7@mgh.harvard.edu
#  Created Time: Mon 22 August 2022 9:25 AM EST
#################################################
#' Given a list of CN states at all avaliable site breaks determine which CN States are stable throughout sampling/time
#'
#' Given a data frame of CNV segments in which all breaks are reported for all cases (see determine_allCNbreaks.R) and a sample information file- output the early stable CN regions for each case
#' allCNbreaks file
#' @chr column of a dataframe that lists the numeric (please reformat X and Y to 23 and 24) chromosome number of the alignment
#' @start column of a dataframe representing the start site of a segment
#' @end column of a dataframe representing the end site of a segment
#' @idx determined from determine_allCNbreaks.R this gives a unique id to each break region for each sample
#' The following columns should be the SegCN adjusted for ploidy values for every sample as a column (log2(df$SegCN-round(df$ploidy-2))-1)
#' The second file should represent a sample manifest sheet
#' @sample each samples name, matching those used to find all CN breaks
#' @case the case name used for identifying multiple related samples
#' @ploidy the estimated ploidy from any number of CN tools (ASCAT, facets, etc)
#' Given an output directory
#' Given the choice of modeling for Stable regions: 
#' plusminus allows for CN states to range between +- a threshold typically set at 0.3 while allowing for a copy state abberation (+-0.3) at initial biopsy to further increase or decrease.
#' mahalanobis uses a calculated distribution for each sample and determines the distance of each point from that distribution
#' linearpc1 calculates distances from the first principal component
#' Given a threshold (thresh) for plusminus modeling (default=0.3)
#' Given an option for returning a merged/unionized case segmentation file representing the early stable copy states


find_earlystable <- function(df,names,outdir,change="plusminus",thresh=0.3,return_segmentfile=FALSE) {
  require(dplyr)
  df2 <- subset(df, select = c("chr", "pos", "end", "idx"))
  df2[,unique(names$case)] <- NA
  if(change != "plusminus"){
    ####I report ploidy adjust CN as (log2(df$SegCN-round(df$ploidy-2))-1)
    ####To run Mahalanobis and PC1 We need total CN
    tmp <- df[,-c(1:4)]
    tmp <- (2^(tmp+1))
    tmp3 <- tmp 
    tmp3[,unique(names$sample)] <- NA
    for (i in colnames(tmp)){
      tmp2 <- tmp[,i]
      tmp2 <- tmp2+(names[names$sample==i,3]-2)
      tmp3[,i] <- tmp2
    }
    tmp3 <- cbind(df[,c(1:4)],tmp3)
    df <- tmp3
  }
  
  ######Check overlap between names files and CN file
  print("Checking Sample Intersection")
  rem <- setdiff(names$sample,colnames(df))
  if (length(rem)!=0){
    print(paste("removing ",rem, " from sample file",sep=""))
    names <- dplyr::filter(names, names$sample != rem)
  }
  
  ########################
  ##Check for any columns with all NAs
  len <- dim(df)[1]
  df <- df[,colSums(is.na(df)) != len]
  ########################
  
  #########################
  ###Remove any unpaired samples
  print("Checking for Unpaired Samples")
  tmpnames <- dplyr::filter(names, names$sample %in% colnames(df))
  tmp <- tmpnames %>% dplyr::group_by(case) %>% dplyr::count()
  rem2 <- as.data.frame(tmp[tmp$n < 2,1])
  print(paste("Sample", rem2$case," has no Pairs, Removing"))
  rem3 <- as.data.frame(names[names$case==rem2$case,1])
  colnames(rem3) <- c("sample")
  drop <- c(rem3$sample)
  df <- df[,!(names(df) %in% drop)]
  
  #####################
  ##Update names after sample removal
  names <- dplyr::filter(names, names$sample %in% colnames(df))
  
  ###For each case (based on the first biopsy) make a loss dataframe and gain (If one uses absolute value a switch to gain or loss would be counted)
  if(change == "plusminus"){
    if (is.na(plot)) {
      plot = F
    } else {
      fname = plot
      plot = T
    }
  for (i in unique(names$case)){
    print(i)
    namestmp <- dplyr::filter(names, names$case == i)  
    tmp <- unique(namestmp$sample)
    tmp <- c(tmp)
    tmpdf2 <- df %>% dplyr::select(tmp)
    tmpdf3 <- tmpdf2 %>% dplyr::filter(tmpdf2[,1] > thresh) #This dataframe is all gains above .3 in first biopsy
    tmpdf4 <- tmpdf2 %>% dplyr::filter(tmpdf2[,1] < -(thresh)) #This dataframe is all losses below  -.3 in first biopsy
    greatergain <- function(x) all((x >= x[1]))
    greaterloss <- function(x) all((x <= x[1]))
    plusminus <- function(x) all((abs(x - x[1]) < thresh))
    tmpdf2[, "Consensus"] <- apply(tmpdf2, 1, plusminus)
    tmpdf3[, "GreaterGains"] <- apply(tmpdf3, 1, greatergain)
    tmpdf4[, "GreaterLoss"] <- apply(tmpdf4, 1, greaterloss)
    ###add back in the idx
    tmpdf2$idx <- rownames(tmpdf2)
    tmpdf5 <- tmpdf2 %>% dplyr::filter(tmpdf2[idx,1] > thresh) %>% select(idx)
    tmpdf3$idx <-tmpdf5$idx
    tmpdf6 <- tmpdf2 %>% dplyr::filter(tmpdf2[idx,1] < -(thresh)) %>% select(idx)
    tmpdf4$idx <-tmpdf6$idx
    ####plot
    pldf <- full_join(tmpdf2,tmpdf3)
    pldf <- full_join(pldf,tmpdf4)
    pldf$color <- ifelse(pldf$Consensus==TRUE,"black",ifelse(pldf$GreaterGains==TRUE & pldf[,1] > thresh ,"black",ifelse(pldf$GreaterLoss==TRUE & pldf[,1] > -(thresh),"black","red")))
    pldf$color <- ifelse(is.na(pldf$color),"red",pldf$color)
    if (plot) {
      pdf(paste(outdir,i,"plusmins.pdf"))
    }
    if (plot) {
      plot(pldf[,1], pldf[,2],xlim = c(-2, 4),ylim = c(-2, 4),ann=FALSE)
      abline(h=0); abline(v=0)
      abline(coef=c(thresh,1), col="green", lty=2)
      abline(coef=c(-(thresh),1), col="green", lty=2)
      points(pldf[,1], pldf[,2], col=pldf[,"color"])
      title(main=paste("Case: ", i, sep=""),xlab=paste(colnames(pldf)[1]," Copy State", sep=""),ylab=paste(colnames(pldf)[2]," Copy State", sep=""))
      
    }
    if (plot) {
      dev.off()
    }
    df2[,i] <- ifelse(tmpdf2$Consensus == TRUE, tmpdf2[,1],NA)
    for (j in tmpdf3$idx){
      df2[j,i] <- ifelse(tmpdf3$GreaterGains[tmpdf3$idx==j] == TRUE, tmpdf3[as.numeric(which(tmpdf3$idx==j)),1],df2[j,i])}
    for (k in tmpdf4$idx){
      df2[k,i] <- ifelse(tmpdf4$GreaterLoss[tmpdf4$idx==k] == TRUE,  tmpdf4[as.numeric(which(tmpdf4$idx==k)),1],df2[k,i])}
    tmpdf2 <- NULL
    tmpdf3 <- NULL
    tmpdf4 <- NULL
    tmp <- NULL
  }
  write.table(df2, file = paste(outdir,"earlystableregions.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
  return(df2)
  }
  
  if(change == "mahalanobis"){
    require(tidyr)
    require(stats)
    require(rlang)
    require(stringr)
    if (is.na(plot)) {
      plot = F
    } else {
      fname = plot
      plot = T
    }
    ###############
    #####get_selected_vars and mahalanobis_Distance is written by Alboukadel Kassambra and from the rstatix package (version 0.7.0 )
    ###########
    get_selected_vars <- function(x, ..., vars = NULL){
      if(is_grouped_df(x))
        x <- x %>% dplyr::ungroup()
      dot.vars <- rlang::quos(...)
      if(length(vars) > 0){
        return(vars)
      }
      if (length(dot.vars) == 0) selected <- colnames(x)
      else selected <- tidyselect::vars_select(names(x), !!! dot.vars)
      selected %>% as.character()
    }
    mahalanobis_distance <- function(data, ...){
      if(is_grouped_df(data)){
        results <- data %>%
          doo(~mahalanobis_distance(.))
      }
      data <- data %>% select_if(is.numeric)
      data <- data %>% drop_na()
      vars <- data %>% get_selected_vars(...)
      n.vars <- length(vars)
      threshold <- stats::qchisq(0.999, n.vars)
      .data <- data %>% select(!!!syms(vars)) %>% as.matrix()
      distance <- stats::mahalanobis(.data,center = colMeans(.data),cov = cov(.data))
      results <- data %>% mutate(mahal.dist = round(distance, 3),is.outlier = .data$mahal.dist > threshold)
      results
    }
    for (i in unique(names$case)){
      tmpname <- dplyr::filter(names, names$case == i)  
      tmpname <- unique(tmpname$sample)
      tmp <- df[,tmpname]
      ##Adding in idx function back to dataframe
      tmp$idx <- df2$idx
      tmp <-mahalanobis_distance(tmp, -idx) 
      tmp <- tmp %>% dplyr::filter(is.outlier == TRUE)
      write.table(tmp, paste(outdir,as.character(i),'mahalanobisremovedegments.txt',sep=''), append = FALSE, quote = FALSE, sep = "\t",
                  eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE) 
      tmp <- NULL
      tmpname <- NULL
    }
    dfc <- df2[1:4]
    for (i in unique(names$case)){
      tmpname <- dplyr::filter(names, names$case == i)  
      tmpname <- unique(tmpname$sample)
      tmp <- df[,tmpname]
      ##Adding in idx function back to dataframe
      tmp$idx <- df2$idx
      tmp <- mahalanobis_distance(tmp, -idx) 
      tmp$color <- ifelse(tmp$is.outlier == "TRUE", "red", "black")
      if (plot) {
        pdf(paste(outdir,i,"mahal.pdf"))
      }
      if (plot) {
        plot(tmp[,1], tmp[,2],xlim = c(-2, 4),ylim = c(-2, 4),ann=FALSE)
        abline(h=0); abline(v=0)
        points(tmp[,1], tmp[,2], col=tmp[,"color"])
        title(main=paste("Case: ", i, sep=""),xlab=paste(colnames(tmp)[1]," Copy State", sep=""),ylab=paste(colnames(tmp)[2]," Copy State", sep=""))
      }
      if (plot) {
        dev.off()
      }
      tmp <- tmp %>% dplyr::filter(is.outlier == FALSE)
      tmp <- as.data.frame(tmp)
      for (j in 1:length(dfc$idx)){
        dfc[j, i] <- ifelse(any(tmp$idx == j) != 0, tmp[which(tmp$idx == j),1], NA)
      }
      tmp <- NULL
      tmpname <- NULL
    }
    
    write.table(dfc, file = paste(outdir,"earlystableregions_mahal.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
    return(dfc)
  }
  
  if(change == "linearpc1"){
    pcdist = function(x, outlier=0.9, plot=F, id=NA) {
      xc = as.matrix(x)
      for (j in 1:ncol(x)) {
        xc[,j] = x[,j] - mean(x[,j], na.rm=T)
      }
      udv = svd(xc)
      d = rep(NA, nrow(xc))
      for (i in 1:nrow(xc)) {
        d[i] = sqrt(sum((xc[i,] - udv$v[,1]*sum(xc[i,]*udv$v[,1]))^2))
      }
      prop.var = udv$d[1]^2/sum(udv$d^2)
      if (plot) {
        plot(xc[,1], xc[,2])
        abline(h=0); abline(v=0)
        dydx = udv$v[2,1]/udv$v[1,1]
        abline(coef=c(0,dydx), col="green")
        thresh = quantile(d, outlier)
        ok = d > thresh
        d0 = thresh/(cos(atan(dydx)))
        print(d0)
        print(dydx)
        abline(coef=c(d0,dydx), col="green", lty=2)
        abline(coef=c(-d0,dydx), col="green", lty=2)
        points(xc[ok,1], xc[ok,2], col="red")
        title(paste("Case: ", id, " (n=", ncol(x),"; PC1=", round(prop.var,2), ")", sep=""))
      }
      list(d=d, prop.var=prop.var, thresh=thresh)
    }
    
    # Distance function wrapper
    run_pcdist = function(df, groups, skip=c(1,2,3,4), outlier=0.9, plot=NA) {
      if (is.na(plot)) {
        plot = F
      } else {
        fname = plot
        plot = T
      }
      annot = df[,skip,drop=F]
      data = df[,-skip]
      groups = as.character(groups)
      ugroup = unique(groups)
      y = as.data.frame(matrix(NA, nrow(data), length(ugroup)))
      z = data.frame(matrix(NA,1,length(ugroup)))
      out = data.frame(matrix(NA, nrow(data), length(ugroup)))
      names(z) = ugroup
      names(y) = ugroup
      names(out) = ugroup
      if (plot) {
        pdf(fname)
      }
      for (e in ugroup) {
        ok = apply(data[,which(groups==e)], 1, function(x) all(!is.na(x)))
        res = pcdist(data[ok,which(groups==e)], outlier=outlier, plot=plot, id=e)
        y[ok,e] = res$d
        z[e] = res$prop.var
        out[ok,e] = res$d > res$thresh
      }
      if (plot) {
        dev.off()
      }
      res = cbind(annot,y)
      list(table=res, prop.var=z, outlier=out)
    }
    groups=names$case
    res = run_pcdist(df, groups, plot=paste(outdir,"pcdis_Figure1.pdf",sep=""))
    write.table(res, file = paste(outdir,"earlystableregions_predictions_pcdist.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
    #######Here I want to resturn a table of early stable CN regions based on these results 
    res_f <- as.data.frame(df2$idx)
    colnames(res_f) <- c("idx")
    print("combining results")
    for (k in unique(groups)){
      tmpname <- dplyr::filter(names, names$case == k)  
      tmpname <- unique(tmpname$sample)
      tmp <- df[,tmpname]
      length_f <- length(tmp)
      ##Adding in idx function back to dataframe
      #tmp$idx <- df2$idx
      ###filter outlier
      o <- res$outlier[,k]
      for (j in 1:length(o)){
        for (l in 1:length_f){
      tmp[j,l] <- ifelse(o[j] == TRUE,NA,tmp[j,l])
      }}
      res_f <- cbind(tmp,res_f)
      print(k)
      }
    res_f <- left_join(df2[,c(1:4)],res_f)
    write.table(res_f, file = paste(outdir,"earlystableregions_pcdist.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
    return(res_f)
  }
  
  if(return_segmentfile == TRUE){
    #####next lets turn it back into some resemblance of segment files
    ###Each row is a segment per sample
    datalist = list()
    for (i in 5:length(df2)){
      tmp <- df2[,c(1:4,i)]
      j=paste(colnames(tmp[5]))
      tmp <- tmp %>% dplyr::filter(!is.na(tmp[j]))
      #####fill in first chr and start
      ## Find the changes in chr or P1.
      tmp$diff <- c(NA, (tmp[2:nrow(tmp), 'chr'] != tmp[1:(nrow(tmp) - 1), 'chr']) |
                      (tmp[2:nrow(tmp), j] != tmp[1:(nrow(tmp) - 1), j]))
      ## Assign a run number to the rows that increments only with the changes found
      ## above.
      tmp$runNum <- NA
      runNum <- 1
      tmp[1, 'runNum'] <- runNum
      for (ii in 2:nrow(tmp)) {
        if (tmp[ii, 'diff']) runNum <- runNum + 1
        tmp[ii, 'runNum'] <- runNum
      }
      ##get rid of factors
      tmp$chr <- as.numeric(tmp$chr)
      ## Collect data from each run number to a single row.
      tmp2 <- data.frame(chr = tapply(tmp[, 'chr'], tmp[, 'runNum'], min),
                         pos = tapply(tmp[, 'pos'], tmp[, 'runNum'], min),
                         end = tapply(tmp[, 'end'], tmp[, 'runNum'], max),
                         idx = tapply(tmp[, 'idx'], tmp[, 'runNum'], min),
                         case = tapply(tmp[, j], tmp[, 'runNum'], min),
                         runNum = tapply(tmp[, 'runNum'], tmp[, 'runNum'], min),
                         stringsAsFactors = FALSE)
      tmp2 <- tmp2[order(tmp2[, 'runNum']), , drop = FALSE]
      ##get rid of idx and run num
      tmp2$runNum <- NULL
      tmp2$idx <- NULL
      tmp2$Sample <- j
      colnames(tmp2) <- c("Chromosome", "Start", "End","SegCNa", "Sample")
      datalist[[i]] <- tmp2
      tmp <- NULL
      tmp2 <- NULL
    }
    concensus_seg = do.call(rbind, datalist)
    return(concensus_seg)
    write.table(concensus_seg, file = paste(outdir,"earlystableregions_segmentationfile.txt",sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", col.names = TRUE, row.names = FALSE)
  }
}