[d960e3]: / bin / dsComputeGCCoverage

Download this file

106 lines (89 with data), 4.0 kB

#!/usr/bin/env Rscript
# DETERMINES THE GC CONTENT ALONG THE GENOME: FROM FASTA TO BEDGRAPH
suppressMessages( library( 'argparse' ) )
suppressMessages( library( 'seqinr' ) )
suppressMessages( library( 'stringr' ) )
suppressMessages( library( 'Rsamtools' ) )

options(scipen=999)

#### ARGUMENTS ####

parser <- ArgumentParser(description="From .fasta files, compute their GC content along the genome in bedGraph format using a given window size.",
                         usage="dsComputeGCCoverage --input genome1.fa genome2.fa --windowSize 100 --output genome1 genome2")

parser$add_argument("--input","-i", type="character",action="store", nargs='+', dest="input", metavar="character",
                    help="Fasta files from which you want the GC content to be calculated." )

parser$add_argument("--windowSize", "-w",  type="integer", action="store", metavar="number",default=1000,
                    help="Size of the window used to binify the genome and calculate the GC content. Default: 1000." )

parser$add_argument("--output","-o", type="character",action="store", nargs='+', dest="output", metavar="character",
                    help="bedGraph file(s) output prefix name(s) ('.bedGraph' is automatically added at the end of the given prefix, one bedGraph per input file)." )

args <- parser$parse_args()

#### SANITY CHECK ####

if ( is.null( args$input ) ) {
  parser$print_help()
  stop( "An input file must be supplied.", call.=FALSE )
}

if ( is.null( args$output ) ) {
  parser$print_help()
  stop( "An output prefix must be supplied.", call.=FALSE )
}

#### SET WD AND GET ABS PATHS ####

args$input <- normalizePath( args$input )
args$output <- paste( normalizePath( dirname( args$output ) ),"/",basename( args$output ),sep="" )
setwd( normalizePath( dirname( args$output ) ) )

#### LOAD DATA ####

seqlast <- function (from, to, by)
{
  vec <- do.call(what = base::seq, args = list(from, to, by))
  if ( tail(vec, 1) != to ) {
    return(c(vec, to))
  } else {
    return(vec)
  }
}

bedGraphsOutput <- list()
for (fa in 1:length(args$input)){
  cat( as.character( Sys.time() ), "Loading fasta file",args$input[fa],"\n")
  fasta <- FaFile(args$input[[fa]])
  indexFa(fasta)
  fasta <- FaFile(args$input[[fa]])
  fasta_index <- scanFaIndex(fasta)
  chrsNames <- as.character(fasta_index@seqnames@values)
  chrsLengths <- as.numeric(fasta_index@seqinfo@seqlengths)
  bedGraph <- data.frame(chrom=character(),start=integer(),end=integer(),stringsAsFactors=FALSE)
  cat( as.character( Sys.time() ), "Making bedGraph structure\n")
  for (chr in 1:length(chrsNames)){
    size = chrsLengths[chr]
    if (size <= args$windowSize){
      binsStart <- 1
      binsEnd   <- size
    } else {
      binsStart <- seqlast(from = 1, to = size - args$windowSize, by = args$windowSize)
      binsEnd   <- seqlast(from = args$windowSize, to = size, by = args$windowSize)
    }
    tmp_length <- length(binsStart)
    tmp <- data.frame(chrom=character(),start=integer(),end=integer(),stringsAsFactors=FALSE)
    tmp[1:tmp_length,1:3] <- 0
    tmp[,1] <- chrsNames[chr]
    tmp[,2] <- binsStart
    tmp[,3] <- binsEnd
    bedGraph <- rbind(bedGraph,tmp)
  }
  bedGraphRanges <- makeGRangesFromDataFrame(df = bedGraph)
  cat( as.character( Sys.time() ), "Calculating GC content for each bin\n")
  GCcontent <- vector()
  for (seq in 1:length(bedGraphRanges)){
  dna <- scanFa(fasta,bedGraphRanges[seq,])
  gcvalue <- round(str_count(dna,"[GC]")/str_count(dna,"[ATGC]"),4)
  if (gcvalue == "NaN"){
    gcvalue <- 0
  }
  bedGraph[seq,"GCContent"] <- gcvalue
  }
  bedGraph[,2] <- bedGraph[,2] - 1
  bedGraph[,3] <- bedGraph[,3]
  bedGraphsOutput[[fa]] <- bedGraph
  cat( as.character( Sys.time() ), "Done\n")
}

for (fa in 1:length(args$input)){
  outname <- paste0(args$output[fa],".bedGraph")
  cat( as.character( Sys.time() ), "Exporting results to ",outname,"\n")
  write.table(bedGraphsOutput[[fa]],file = outname,quote = F,col.names = F,row.names = F,sep = "\t")
}