deepStats / Git / [3186be] /bin/dsComputeBEDDensity

Models:
MarcoTheBlack/
deepStats
Downloads: 1
[3186be]: / bin / dsComputeBEDDensity
History
Download this file
154 lines (120 with data), 5.2 kB

#!/usr/bin/env Rscript
# BED TO BEDGRAPH GIVEN A WINDOW SIZE
suppressMessages( library( 'argparse' ) )
suppressMessages( library( 'vroom' ) )
suppressMessages( library( 'IRanges' ) )

options(scipen=999)

#### ARGUMENTS ####

parser <- ArgumentParser(description="From .BED files and a chromosome sizes file, compute the BED features density along the genome in bedGraph format using a given window size.",
                         usage = "dsComputeBEDDensity --input file1.bed file2.bed --chromSize chrom.sizes --windowSize 10000 --output file1 file2")

parser$add_argument("--input","-i", type="character",action="store", nargs='+', dest="input", metavar="character",
                    help="BED files from which features density will be calculated." )

parser$add_argument("--chromSize","-c", type="character", action="store",
                    help="A 2 columns tab-delimited file containing chromosome sizes, with one chromosome per line.", metavar="character")

parser$add_argument("--windowSize", "-w",  type="integer", action="store", metavar="number",default=1000,
                    help="Size of the window used to binify the genome and calculate bed files density. Default: 1000." )

parser$add_argument("--output","-o", type="character",action="store", nargs='+', dest="output", metavar="character",
                    help="bedGraph file(s) output prefix name(s) ('.bedGraph' is automatically added at the end of the given prefix, one bedGraph per input file)." )

args <- parser$parse_args()

#### SANITY CHECK ####

if ( is.null( args$input ) ) {
  parser$print_help()
  stop( "An input file must be supplied.", call.=FALSE )
}

if ( is.null( args$output ) ) {
  parser$print_help()
  stop( "An output prefix must be supplied.", call.=FALSE )
}

if ( is.null( args$chromSize ) ) {
  parser$print_help()
  stop( "A file containing the size of chromosomes must be supplied.", call.=FALSE )
}

#### SET WD AND GET ABS PATHS ####

args$input <- normalizePath( args$input )
args$output <- paste( normalizePath( dirname( args$output ) ),"/",basename( args$output ),sep="" )
args$chromSize <- normalizePath( args$chromSize )
setwd( normalizePath( dirname( args$output ) ) )

#### LOAD DATA ####

beds <- list()
for (file in 1:length(args$input)){
  cat( as.character( Sys.time() ),"Loading input file ",args$input[file],"\n")
  beds[[file]]<- vroom(args$input[[file]], col_names = F)[,1:3]
  cat("\n")
}

cat( as.character( Sys.time() ),"Loading chromosome sizes from ",args$chromSize,"\n")
chromSize <- as.data.frame(vroom(args$chromSize, col_names = F))
cat("\n")

#### RETAIN USEFUL CHR ####

chrs <- list()
retain_chr <- function(df){
  out <- as.data.frame(unique(df[,1]))
  return(out)
}
chrs <- lapply(beds,retain_chr)

#### BINIFY THE GENOME FOR USEFUL CHROMS ####

cat( as.character( Sys.time() ), "Binifying the genome to given window size",args$windowSize,"\n")

seqlast <- function (from, to, by)
{
  vec <- do.call(what = seq, args = list(from, to, by))
  if ( tail(vec, 1) != to ) {
    return(c(vec, to))
  } else {
    return(vec)
  }
}

bedGraph <- list()
count=0
for (bedfile in 1:length(beds)){
  bedGraph[[bedfile]] <- data.frame()
  count=count+1
  for (chrom in 1:nrow(chrs[[count]])){
    chromosome <- chrs[[count]][chrom,1]
    
    a = try(size <- chromSize[which(chromSize[,1] == chromosome),2], TRUE)
    if (length(a) == 0 ){
      stop("A chromosome is in the bed file which is not in the chromosome sizes file")
    }

    size <- chromSize[which(chromSize[,1] == chromosome),2]
    if (size <= args$windowSize){
      binsStart <- 0
      binsEnd   <- size
    } else {
      binsStart <- seqlast(from = 0, to = size - args$windowSize, by = args$windowSize)
      binsEnd   <- seqlast(from = args$windowSize, to = size, by = args$windowSize)
    }
      tmp_length <- length(binsStart)
      tmp <- data.frame(chrom=character(),start=integer(),end=integer(),stringsAsFactors=FALSE)
      tmp[1:tmp_length,1:3] <- 0
      tmp[,1] <- chromosome
      tmp[,2] <- binsStart
      tmp[,3] <- binsEnd
      bedGraph[[bedfile]] <- rbind(tmp,bedGraph[[bedfile]])
  }
}

#### ORDER BEDGRAPHS ####

for (bedfile in 1:length(beds)){
  bedGraph[[bedfile]] <- bedGraph[[bedfile]][
    order(
      bedGraph[[bedfile]][,1],
      bedGraph[[bedfile]][,2],
      bedGraph[[bedfile]][,3]
    )
  ,]
}

#### CALCULATE FEATURES DENSITY AND APPEND TO BEDGRAPHS ####

cat( as.character( Sys.time() ),"Features density computation... ")

featuresRanges <- list()
genomeRanges   <- list()
overlaps       <- list()
for (i in 1:length(beds)){
  featuresRanges[[i]] <- split(IRanges(beds[[i]]$X2, beds[[i]]$X3), beds[[i]]$X1)
  genomeRanges[[i]]   <- split(IRanges(bedGraph[[i]]$start, bedGraph[[i]]$end), bedGraph[[i]]$chrom)
  overlaps[[i]] <- as.data.frame(unlist(countOverlaps(genomeRanges[[i]],featuresRanges[[i]])))[,1]
  bedGraph[[i]]["counts"] <- overlaps[[i]]
}

cat("Done!\n")

#### EXPORT BEDGRAPHS ####

for (i in 1:length(bedGraph)){
  outname <- paste0(args$output[i],".bedGraph")
  cat( as.character( Sys.time() ), "Exporting results to ",outname,"\n")
  write.table(bedGraph[[i]],file = outname,quote = F,col.names = F,row.names = F,sep = "\t")
}