a b/research_paper_code/src/data.manip.R
1
# SUMMARY
2
# -------
3
# Some functions for manipulating QTL experiment data. Here is an
4
# overview of the functions defined in this file:
5
#
6
#    prepare.pheno(pheno)
7
#    remove.outliers(pheno,phenotype,covariates,outliers,verbose)
8
#    genotypes2counts(geno,A,B)
9
#    mapgeno(gp)
10
#    data2qtl(pheno,gp,map)
11
#    merge.gwscan.qtl(files)
12
#    get.thresholds(perms,alpha)
13
#
14
# FUNCTION DEFINITIONS
15
# ----------------------------------------------------------------------
16
# This function prepares the phenotype data for QTL mapping.
17
prepare.pheno <- function (pheno) {
18
19
  # Remove rows marked as "discard" and as possible sample mixups.
20
  pheno <- subset(pheno,discard == "no" & mixup == "no")
21
22
  # Create a new binary trait that is equal to 1 when the mouse has an
23
  # "abnormal" bone-mineral density, and 0 otherwise.
24
  pheno <- cbind(pheno,
25
                 data.frame(abBMD = factor(as.numeric(pheno$BMD > 90))))
26
27
  # Create some binary traits ("pretrainbin", "d1tonebin" and
28
  # "altcontextbin") for fear conditioning metrics that show a
29
  # "bimodal" freezing distribution. Each of these phenotypes is equal
30
  # to 1 when we observe some freezing, and 0 otherwise.
31
  pheno <-
32
    cbind(pheno,with(pheno,
33
        data.frame(pretrainbin   = factor(as.numeric(PreTrainD1     > 0.01)),
34
                   d1tonebin     = factor(as.numeric(AvToneD1       > 0.01)),
35
                   altcontextbin = factor(as.numeric(AvAltContextD3 > 0.01)))))
36
37
  # Create a binary trait from the p120b1 trait that indicates that
38
  # the mouse does not respond to the pulse, possibly indicating
39
  # deafness. I choose mice with p120b1 values on the "long tail" of
40
  # the distribution for this phenotype.
41
  pheno <- cbind(pheno,
42
                 data.frame(deaf = factor(as.numeric(pheno$p120b1 < 50))))
43
44
  # The deaf mice effectively add "noise" to the PPI phenotypes, in
45
  # part because the ratio has a denominator that would be close to
46
  # zero. So I remove these samples from the analysis.
47
  PPI.phenotypes <-
48
      c("p120b1","p120b2","p120b3","p120b4","pp3b1","pp3b2","pp3avg",
49
        "pp6b1","pp6b2","pp6avg","pp12b1","pp12b2","pp12avg","pp3PPIb1",
50
        "pp3PPIb2","pp3PPIavg","pp6PPIb1","pp6PPIb2","pp6PPIavg",
51
        "pp12PPIb1","pp12PPIb2","pp12PPIavg","PPIavg","startle")
52
  rows <- which(pheno$p120b1 < 10)
53
  pheno[rows,PPI.phenotypes] <- NA
54
  
55
  # Transform bone-mineral densities, which are ratios, to the
56
  # log-scale (in base 10), so that the distribution of this phenotype
57
  # looks roughly normal. (Although there is a long tail of high BMD
58
  # measurements that we will have to discard.)
59
  pheno <- transform(pheno,BMD = log10(BMD))
60
61
  # Transform the fear conditioning phenotypes, which are proportions
62
  # (numbers between 0 and 1) to the log-odds scale using the logit
63
  # function (in base 10).
64
  f     <- function (x) logit10(project.onto.interval(x,0.01,0.99))
65
  pheno <- transform(pheno,
66
                     PreTrainD1     = f(PreTrainD1),
67
                     AvToneD1       = f(AvToneD1),
68
                     AvContextD2    = f(AvContextD2),
69
                     AvAltContextD3 = f(AvAltContextD3),
70
                     AvToneD3       = f(AvToneD3),
71
                     D3.180         = f(D3.180),
72
                     D3.240         = f(D3.240),
73
                     D3.300         = f(D3.300),
74
                     D3.360         = f(D3.360))
75
76
  # Transform some of the prepulse inhibition (PPI) phenotypes, which
77
  # are proportions (numbers between 0 and 1, except for a few
78
  # negative values), to the log-odds scale.
79
  pheno <- transform(pheno,
80
                     pp3PPIavg  = f(pp3PPIavg),
81
                     pp6PPIavg  = f(pp6PPIavg),
82
                     pp12PPIavg = f(pp12PPIavg),
83
                     PPIavg     = f(PPIavg))
84
85
  # Transform the "center time" phenotypes so that they are
86
  # proportions, then transform the proportions to the (base 10) logit
87
  # scale.
88
  pheno <- transform(pheno,
89
                     D1ctrtime0to15 = f(D1ctrtime0to15/900),
90
                     D2ctrtime0to15 = f(D2ctrtime0to15/900),
91
                     D3ctrtime0to15 = f(D3ctrtime0to15/900),
92
                     
93
                     D1ctrtime0to30 = f(D1ctrtime0to30/1800),
94
                     D2ctrtime0to30 = f(D2ctrtime0to30/1800),
95
                     D3ctrtime0to30 = f(D3ctrtime0to30/1800))
96
97
  # Transform the "vertical activity" phenotypes so that they are
98
  # proportions, then transform the proportions to the (base 10) logit
99
  # scale.
100
  pheno <- transform(pheno,
101
                     D1vact0to15 = f(D1vact0to15/900),
102
                     D2vact0to15 = f(D2vact0to15/900),
103
                     D3vact0to15 = f(D3vact0to15/900),
104
105
                     D1vact0to30 = f(D1vact0to30/1800),
106
                     D2vact0to30 = f(D2vact0to30/1800),
107
                     D3vact0to30 = f(D3vact0to30/1800))
108
                     
109
  # Create a new phenotype defined to be the first principal component
110
  # of the muscle weights.
111
  mw.pheno           <- pheno[c("TA","EDL","soleus","plantaris","gastroc")]
112
  rows               <- which(rowSums(is.na(mw.pheno)) == 0)
113
  pheno              <- cbind(pheno,data.frame(mwpc = NA))
114
  pheno[rows,"mwpc"] <- prcomp(mw.pheno[rows,],center=TRUE,scale=TRUE)$x[,1]
115
116
  # Return the updated phenotype data table.
117
  return(pheno)
118
}
119
120
# ----------------------------------------------------------------------
121
# Remove outliers from the phenotype data for the given phenotype,
122
# optionally conditioning on covariates. If covariates are specified,
123
# outliers are determined according to the residual of the phenotype
124
# after regressing on the covariates.
125
remove.outliers <- function (pheno, phenotype, covariates, outliers,
126
                             verbose = TRUE) {
127
128
  # Specify the function for removing the outliers.
129
  is.outlier <- function (x) {
130
    y           <- outliers(x)
131
    y[is.na(y)] <- FALSE
132
    return(y)
133
  }
134
  
135
  # If we are conditioning on one or more covariates, get the
136
  # residuals of the phenotype conditioned on these covariates.
137
  if (length(covariates) > 0) {
138
    f <- formula(paste(phenotype,"~",paste(covariates,collapse="+")))
139
    r <- resid(lm(f,pheno,na.action = na.exclude))
140
  } else
141
    r <- pheno[[phenotype]]
142
143
  # If requested, report how many outlying data points are removed.
144
  if (verbose) {
145
    n <- sum(is.outlier(r))
146
    if (n == 0)
147
      cat("No outliers for",phenotype)
148
    else
149
      cat(n,"outliers for",phenotype)
150
    if (length(covariates) == 0)
151
      cat(" are removed.\n")
152
    else
153
      cat(" conditioned on",paste(covariates,collapse=" + "),"are removed.\n")
154
  }
155
    
156
  # Remove the outlying data points.
157
  pheno[is.outlier(r),phenotype] <- NA
158
159
  # Return the updated phenotype data table.
160
  return(pheno)
161
}
162
163
# ----------------------------------------------------------------------
164
# This function takes three inputs: a vector of strings representing
165
# genotypes, a character giving the reference allele, and another
166
# character giving the alternative allele. The return value is a
167
# vector of allele counts; precisely, the number of times the
168
# alternative (B) allele appears in the genotype.
169
genotypes2counts <- function (geno, A, B) {
170
171
  # Initialize the allele counts to be missing.
172
  g <- rep(NA,length(geno))
173
    
174
  # Set the allele counts according to the genotypes.
175
  g[geno == paste0(A,A)] <- 0
176
  g[geno == paste0(A,B)] <- 1
177
  g[geno == paste0(B,A)] <- 1
178
  g[geno == paste0(B,B)] <- 2
179
180
  # Return the allele counts.
181
  return(g)
182
}
183
184
# ----------------------------------------------------------------------
185
# Given the genotype probabilities, output the genotypes (allele
186
# counts) with the highest probabilities; i.e. the maximum a
187
# posteriori estimates of the genotypes.
188
mapgeno <- function (gp) {
189
190
  # Get the largest probability for each genotype.
191
  r <- pmax(gp$p0,gp$p1,gp$p2)
192
193
  # Initialize the output.
194
  geno   <- r
195
  geno[] <- 0
196
  
197
  # Get the genotypes corresponding to the largest probabilities.
198
  geno[gp$p1 == r] <- 1
199
  geno[gp$p2 == r] <- 2
200
201
  # Return the matrix of genotypes.
202
  return(geno)
203
}
204
205
# ----------------------------------------------------------------------
206
# Convert the QTL experiment data to the format used in R/qtl. The
207
# return value is a 'cross' object that keeps track of all the data in
208
# a single QTL experiment; for more details, see the help for function
209
# 'read.cross' in R/qtl. The alleles are labeled as A and B, where A
210
# is the reference allele, and B is the alternative allele. The qtl
211
# library requires a paternal grandmother ("pgm") phenotype, so a
212
# column in the table for this phenotype is included if it is missing,
213
# and the entries of this column are set to zero. IMPORTANT NOTE: this
214
# function does not work for markers on the X chromosome.
215
data2qtl <- function (pheno, gp, map) {
216
217
  # Get the number of markers (p) and the number of samples (n).
218
  p <- nrow(map)
219
  n <- nrow(pheno)
220
221
  # Get the set of chromosomes.
222
  chromosomes <- as.character(unique(map$chr))
223
  
224
  # Create a new matrix in which the genotypes are encoded as follows:
225
  # NA = missing, 1 = AA, 2 = AB, 3 = BB. Here, A refers to the
226
  # reference allele, and B refers to the alternative allele (this
227
  # matches the encoding for the F2 intercross; see the help for
228
  # 'read.cross' in the qtl library for more details). Any genotypes
229
  # for which we are somewhat uncertain (probability less than 0.95),
230
  # I set these genotypes to missing (NA). 
231
  cutoff <- 0.95
232
  geno   <- matrix(NA,nrow = n,ncol = p,dimnames = list(pheno$id,map$snp))
233
  geno[gp$p0 > cutoff] <- 1
234
  geno[gp$p1 > cutoff] <- 2
235
  geno[gp$p2 > cutoff] <- 3
236
237
  # Convert the genotype probabilities to an n x p x 3 array.
238
  prob <- array(NA,c(n,p,3),list(pheno$id,map$snp,c("AA","AB","BB")))
239
  prob[,,1] <- gp$p0
240
  prob[,,2] <- gp$p1
241
  prob[,,3] <- gp$p2
242
  rm(gp)
243
  
244
  # Add the paternal grandmother ("pgm") phenotype if it does not
245
  # already exist.
246
  if (!is.element("pgm",names(pheno)))
247
    pheno <- cbind(pheno,pgm = 0)
248
  
249
  # Initialize the cross object, setting the genotypes to an empty list.
250
  cross        <- list(geno = list(),pheno = pheno)
251
  class(cross) <- c("f2","cross")
252
253
  # Set the alleles.
254
  attributes(cross)$alleles <- c("A","B")
255
256
  # Split up the markers by chromosome.
257
  for (i in chromosomes) {
258
259
    # Get the markers on the current chromosome.
260
    markers <- which(map$chr == i)
261
262
    # Get the genotype and map data corresponding to these markers.
263
    # Note that I need to coerce the genotype data to be a matrix in
264
    # the exceptional case when there is only a single marker
265
    # genotyped on the chromosome.
266
    m <- map[markers,]
267
    d <- list(data = as.matrix(geno[,markers]),
268
              prob = prob[,markers,],
269
              map  = m$pos)    
270
    attributes(d$prob)$map <- d$map
271
    names(d$map)           <- m$snp
272
    class(d)               <- "A"
273
    
274
    # Store the genotypes for markers on the chromosome.
275
    cross$geno[[i]] <- d    
276
  }
277
278
  return(cross)
279
}
280
281
# ----------------------------------------------------------------------
282
# Merge the results of R/qtl genome-wide scans for multiple phenotypes into
283
# a single data table.
284
merge.gwscan.qtl <- function (files) {
285
286
  # First load the marker data, and initialize the data table
287
  # containing the QTL mapping results.
288
  load(files[[1]])
289
  class(gwscan.qtl)       <- "data.frame"
290
  gwscan.merged           <- gwscan.qtl[c("chr","pos")]
291
  rownames(gwscan.merged) <- map$snp
292
293
  # Initialize other data structures containing the merged mapping
294
  # results.
295
  phenotypes.merged <- list()
296
  covariates.merged <- list()
297
  perms.merged      <- list()
298
299
  # Repeat for each file containing QTL mapping results.
300
  for (analysis in names(files)) {
301
302
    # Load the QTL mapping results.
303
    load(files[[analysis]])
304
305
    # Get the phenotype and covariates.
306
    phenotypes.merged[analysis] <- list(phenotype)
307
    covariates.merged[analysis] <- list(covariates)
308
    
309
    # Get the QTL mapping results.
310
    class(gwscan.qtl)    <- "data.frame"
311
    names(gwscan.qtl)[3] <- analysis
312
    gwscan.merged        <- cbind(gwscan.merged,gwscan.qtl[analysis])
313
314
    # Get the permutations.
315
    class(perms.qtl)         <- "matrix"
316
    perms.merged[[analysis]] <- perms.qtl
317
  }
318
319
  # Convert the permutations to a matrix, in which matrix columns
320
  # correspond to phenotypes.
321
  perms.merged           <- do.call(cbind,perms.merged)
322
  colnames(perms.merged) <- names(files)
323
  
324
  # Output the merged data.
325
  phenotypes.merged    <- unlist(phenotypes.merged)
326
  class(gwscan.merged) <- c("scanone","data.frame")
327
  class(perms.merged)  <- c("scanoneperm","matrix")
328
  return(list(phenotypes = phenotypes.merged,
329
              covariates = covariates.merged,
330
              gwscan.qtl = gwscan.merged,
331
              perms.qtl  = perms.merged))
332
}
333
334
# ----------------------------------------------------------------------
335
# Get the genome-wide LOD thresholds estimated in R/qtl using a
336
# permutation-based test.
337
get.thresholds <- function (perms, alpha) {
338
  thresholds        <- summary(perms,alpha)
339
  cols              <- colnames(thresholds)
340
  thresholds        <- as.vector(thresholds)
341
  names(thresholds) <- cols
342
  return(thresholds)
343
}