# research_paper_code/src/read.data.R
1
# SUMMARY
2
# -------
3
# This file contains several functions for reading the QTL experiment
4
# data from text files. Here is an overview of the functions defined
5
# in this file:
6
#
7
#   read.pheno(file)
8
#   read.map(file)
9
#   read.geno.dosage(file,n)
10
#
11
# FUNCTION DEFINITIONS
12
# ----------------------------------------------------------------------
13
# Loads the phenotype data stored in a CSV file.
#
# Input:
#   file  The CSV file to read; passed as is to read.csv. The first
#         line is the header, quoting is disabled, and any text
#         following a "#" is ignored as a comment.
#
# Output:
#   A data frame containing every column of the file, under the column
#   names written in the header, in which
#     - "id" is a character vector;
#     - "round" is a factor with levels SW1, SW2, ..., SW25 (any other
#       value becomes NA);
#     - FCbox, PPIbox, methcage, methcycle, discard, mixup, earpunch,
#       abnormalbone and experimenters are factors;
#     - fastglucose and the D1/D2/D3 "totaldist" and "TOTDIST" columns
#       are double precision.
#
# An error is raised if any of the columns listed above is absent.
read.pheno <- function (file) {

  # Read in the phenotype data from the CSV file.
  pheno <- read.csv(file,quote = "",header = TRUE,check.names = FALSE,
                    stringsAsFactors = FALSE,comment.char = "#")

  # These columns are converted to factors whose levels are the values
  # observed in the data.
  factor.cols <- c("FCbox","PPIbox","methcage","methcycle","discard",
                   "mixup","earpunch","abnormalbone","experimenters")

  # These columns are converted to double precision: fastglucose, then
  # for each of D1, D2, D3 the three "totaldist" interval columns and
  # the six "TOTDIST" columns (5, 10, ..., 30).
  days        <- c("D1","D2","D3")
  double.cols <- c("fastglucose",
                   paste0(rep(days,each = 3),"totaldist",
                          c("0to15","15to30","0to30")),
                   paste0(rep(days,each = 6),"TOTDIST",seq(5,30,by = 5)))

  # Stop early, naming every absent column, instead of failing later
  # with an error that does not say which column is the problem.
  required.cols <- c("id","round",factor.cols,double.cols)
  missing.cols  <- setdiff(required.cols,names(pheno))
  if (length(missing.cols) > 0) {
    stop("Phenotype file is missing column(s): ",
         paste(missing.cols,collapse = ", "),call. = FALSE)
  }

  # Convert the columns in place. Direct assignment is used instead of
  # transform() because transform() rebuilds the table with
  # data.frame(), which re-checks the column names and therefore
  # undoes check.names = FALSE for any non-syntactic column name.
  pheno[["id"]]      <- as.character(pheno[["id"]])
  pheno[["round"]]   <- factor(pheno[["round"]],paste0("SW",1:25))
  pheno[factor.cols] <- lapply(pheno[factor.cols],factor)
  pheno[double.cols] <- lapply(pheno[double.cols],as.double)

  # Return the phenotype data table.
  return(pheno)
}
71
72
# ----------------------------------------------------------------------
73
# Reads the marker map from a space-delimited text file with a header
# line, ignoring any text that follows a "#". The result is a data
# frame in which three columns are converted to factors with
# explicitly specified levels, so that the ordering of the levels
# does not depend on the order in which values appear in the file:
#
#   chr       levels 1, 2, ..., 19
#   ref, alt  levels A, T, G, C
#
# Values outside these levels become NA.
read.map <- function (file) {

  # Load the table, keeping text columns as character.
  markers <- read.table(file = file,header = TRUE,sep = " ",
                        comment.char = "#",stringsAsFactors = FALSE)

  # Recode the chromosome and allele columns using fixed level sets.
  allele.levels <- c("A","T","G","C")
  markers <- transform(markers,
                       chr = factor(chr,levels = 1:19),
                       ref = factor(ref,levels = allele.levels),
                       alt = factor(alt,levels = allele.levels))
  return(markers)
}
86
87
# ----------------------------------------------------------------------
88
# Reads genotype "dosages" (expected allele counts) from a
# space-delimited text file with a header line.
#
# Input:
#   file  The file to read; passed as is to fread.
#   n     The number of dosage columns in the file; that is, the
#         number of columns after the first two. The first two columns
#         are read as character and the next n as double.
#
# Output:
#   A list with two elements:
#     discard  the "discard" column of the file, as a factor
#              (presumably flags samples from potentially mislabeled
#              flow samples -- confirm against the data description);
#     geno     a matrix with one row per line of data and n columns,
#              holding every column except the first two. The rows
#              are named by the "id" column.
read.geno.dosage <- function (file, n) {

  # The two leading columns are text; everything after is a dosage.
  column.types <- c(rep("character",2),rep("double",n))
  dat <- fread(file,sep = " ",header = TRUE,showProgress = FALSE,
               colClasses = column.types)

  # Treat the table returned by fread as an ordinary data frame, and
  # label its rows by id so that the matrix below inherits those names.
  class(dat)    <- "data.frame"
  rownames(dat) <- dat$id

  # Split off the discard labels and the dosage matrix.
  labels  <- factor(dat$discard)
  dosages <- as.matrix(dat[-(1:2)])
  return(list(discard = labels,geno = dosages))
}