synthetic-data-genomics / Git / [befbfc] /research_paper

Models:
MarcoTheBlack/
synthetic-data-genomics
Downloads: 1
[befbfc]: / research_paper_code / src / read.data.R
History
Download this file
100 lines (90 with data), 4.5 kB

# SUMMARY
# -------
# This file contains several functions for reading the QTL experiment
# data from text files. Here is an overview of the functions defined
# in this file:
#
#   read.pheno(file)
#   read.map(file)
#   read.geno.dosage(file,n)
#
# FUNCTION DEFINITIONS
# ----------------------------------------------------------------------
# Loads the phenotype data stored in a CSV file.
#
# The file is read with read.csv, with quoting disabled and lines
# beginning with "#" treated as comments. The columns listed below must
# be present, and are converted as follows:
#
#   id                          character
#   round                       factor with levels SW1,...,SW25 (any
#                               other value becomes NA)
#   FCbox, PPIbox, methcage,
#   methcycle, discard, mixup,
#   earpunch, abnormalbone,
#   experimenters               factors, levels taken from the data
#   fastglucose,
#   D{1,2,3}totaldist{0to15,15to30,0to30},
#   D{1,2,3}TOTDIST{5,10,...,30}  double precision
#
# All remaining columns are returned as read.csv produced them. The
# return value is a data frame. An error naming the absent columns is
# raised when any of the required columns is missing from the file.
read.pheno <- function (file) {

  # Columns converted to factors using whatever levels occur in the data.
  factor.cols <- c("FCbox","PPIbox","methcage","methcycle","discard",
                   "mixup","earpunch","abnormalbone","experimenters")

  # Columns converted to double precision. For each of D1, D2 and D3
  # there are three "totaldist" columns and six "TOTDIST" columns; the
  # suffix vectors are recycled across the three prefixes.
  day         <- paste0("D",1:3)
  double.cols <- c("fastglucose",
                   paste0(rep(day,each = 3),"totaldist",
                          c("0to15","15to30","0to30")),
                   paste0(rep(day,each = 6),"TOTDIST",seq(5,30,5)))

  # Read in the phenotype data from the CSV file.
  pheno <- read.csv(file,quote = "",header = TRUE,check.names = FALSE,
                    stringsAsFactors = FALSE,comment.char = "#")

  # Fail early, and by name, if a column we are about to convert is not
  # there. (Looking up a missing "round" column by bare name would
  # otherwise find the base function of the same name.)
  required <- c("id","round",factor.cols,double.cols)
  absent   <- setdiff(required,names(pheno))
  if (length(absent) > 0) {
    stop("Phenotype file is missing required column(s): ",
         paste(absent,collapse = ", "),call. = FALSE)
  }

  # Convert the columns in place, so that column order is unchanged.
  pheno[["id"]]      <- as.character(pheno[["id"]])
  pheno[["round"]]   <- factor(pheno[["round"]],paste0("SW",1:25))
  pheno[factor.cols] <- lapply(pheno[factor.cols],factor)
  pheno[double.cols] <- lapply(pheno[double.cols],as.double)

  # Earlier versions of this function did the conversions with
  # transform(), which passes its result through data.frame() and so
  # rewrites any non-syntactic or duplicated column names (see
  # make.names), despite check.names = FALSE above. The same pass is
  # applied here so that the column names seen by callers are unchanged.
  return(data.frame(pheno))
}

# ----------------------------------------------------------------------
# Returns a data frame containing the marker data stored in a
# space-delimited text file with a header line; lines beginning with
# "#" are treated as comments. The file must contain columns "chr",
# "ref" and "alt". These are converted to factors manually to make
# sure that the chromosomes and bases are ordered properly in the
# factors:
#
#   chr       levels given by "chromosomes", in that order
#   ref, alt  levels A, T, G, C
#
# Values not among the levels become NA. Input "chromosomes" defaults
# to 1:19; pass, for example, c(1:19,"X") to retain other chromosome
# labels. An error naming the absent columns is raised when "chr",
# "ref" or "alt" is missing.
read.map <- function (file, chromosomes = 1:19) {
  bases <- c("A","T","G","C")

  # Force the allele columns to be read as character. Left to automatic
  # type conversion, a column containing only "T" is read as the logical
  # value TRUE, and every allele in it would then become NA below.
  map <- read.table(file,sep = " ",header = TRUE,comment.char = "#",
                    stringsAsFactors = FALSE,
                    colClasses = c(ref = "character",alt = "character"))

  # Report missing columns by name.
  absent <- setdiff(c("chr","ref","alt"),names(map))
  if (length(absent) > 0) {
    stop("Marker file is missing required column(s): ",
         paste(absent,collapse = ", "),call. = FALSE)
  }

  # Convert in place, so that column order is unchanged.
  map[["chr"]] <- factor(map[["chr"]],chromosomes)
  map[["ref"]] <- factor(map[["ref"]],bases)
  map[["alt"]] <- factor(map[["alt"]],bases)
  return(map)
}

# ----------------------------------------------------------------------
# Reads genotype "dosages" (expected allele counts) from a
# space-delimited text file with a header line, using fread. The first
# two columns must be named "id" and "discard" and are read as
# character; the next n columns are read as double precision, where n
# is the number of SNPs (columns) in the file.
#
# Returns a list object containing (1) a factor "discard" built from
# the column of that name, described here as specifying samples that
# are from potentially mislabeled flow samples, and (2) "geno", a
# matrix of genotype dosages with one row per sample and one column
# per SNP, with row names taken from the "id" column.
#
# An error is raised if the first two columns are not "id" and
# "discard"; otherwise the row names and the "discard" factor would
# silently come out empty.
read.geno.dosage <- function (file, n) {

  # Ask fread for a plain data frame directly rather than overwriting
  # the class attribute of a data.table afterwards.
  out <- fread(file,sep = " ",header = TRUE,showProgress = FALSE,
               colClasses = c("character","character",rep("double",n)),
               data.table = FALSE)

  # The first two columns are dropped by position below, so they must
  # be the ones looked up by name.
  absent <- setdiff(c("id","discard"),head(names(out),2))
  if (length(absent) > 0) {
    stop("The first two columns of the genotype file must be \"id\" and ",
         "\"discard\"; missing: ",paste(absent,collapse = ", "),
         call. = FALSE)
  }

  # Setting the row names on the data frame (rather than on the matrix)
  # retains the check that sample ids are unique.
  rownames(out) <- out[["id"]]
  return(list(discard = factor(out[["discard"]]),
              geno    = as.matrix(out[-(1:2)])))
}