[befbfc]: / research_paper_code / src / data.manip.R

Download this file

344 lines (290 with data), 13.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
# SUMMARY
# -------
# Some functions for manipulating QTL experiment data. Here is an
# overview of the functions defined in this file:
#
# prepare.pheno(pheno)
# remove.outliers(pheno,phenotype,covariates,outliers,verbose)
# genotypes2counts(geno,A,B)
# mapgeno(gp)
# data2qtl(pheno,gp,map)
# merge.gwscan.qtl(files)
# get.thresholds(perms,alpha)
#
# FUNCTION DEFINITIONS
# ----------------------------------------------------------------------
# This function prepares the phenotype data for QTL mapping.
prepare.pheno <- function (pheno) {
# Remove rows marked as "discard" and as possible sample mixups.
pheno <- subset(pheno,discard == "no" & mixup == "no")
# Create a new binary trait that is equal to 1 when the mouse has an
# "abnormal" bone-mineral density, and 0 otherwise.
pheno <- cbind(pheno,
data.frame(abBMD = factor(as.numeric(pheno$BMD > 90))))
# Create some binary traits ("pretrainbin", "d1tonebin" and
# "altcontextbin") for fear conditioning metrics that show a
# "bimodal" freezing distribution. Each of these phenotypes is equal
# to 1 when we observe some freezing, and 0 otherwise.
pheno <-
cbind(pheno,with(pheno,
data.frame(pretrainbin = factor(as.numeric(PreTrainD1 > 0.01)),
d1tonebin = factor(as.numeric(AvToneD1 > 0.01)),
altcontextbin = factor(as.numeric(AvAltContextD3 > 0.01)))))
# Create a binary trait from the p120b1 trait that indicates that
# the mouse does not respond to the pulse, possibly indicating
# deafness. I choose mice with p120b1 values on the "long tail" of
# the distribution for this phenotype.
pheno <- cbind(pheno,
data.frame(deaf = factor(as.numeric(pheno$p120b1 < 50))))
# The deaf mice effectively add "noise" to the PPI phenotypes, in
# part because the ratio has a denominator that would be close to
# zero. So I remove these samples from the analysis.
PPI.phenotypes <-
c("p120b1","p120b2","p120b3","p120b4","pp3b1","pp3b2","pp3avg",
"pp6b1","pp6b2","pp6avg","pp12b1","pp12b2","pp12avg","pp3PPIb1",
"pp3PPIb2","pp3PPIavg","pp6PPIb1","pp6PPIb2","pp6PPIavg",
"pp12PPIb1","pp12PPIb2","pp12PPIavg","PPIavg","startle")
rows <- which(pheno$p120b1 < 10)
pheno[rows,PPI.phenotypes] <- NA
# Transform bone-mineral densities, which are ratios, to the
# log-scale (in base 10), so that the distribution of this phenotype
# looks roughly normal. (Although there is a long tail of high BMD
# measurements that we will have to discard.)
pheno <- transform(pheno,BMD = log10(BMD))
# Transform the fear conditioning phenotypes, which are proportions
# (numbers between 0 and 1) to the log-odds scale using the logit
# function (in base 10).
f <- function (x) logit10(project.onto.interval(x,0.01,0.99))
pheno <- transform(pheno,
PreTrainD1 = f(PreTrainD1),
AvToneD1 = f(AvToneD1),
AvContextD2 = f(AvContextD2),
AvAltContextD3 = f(AvAltContextD3),
AvToneD3 = f(AvToneD3),
D3.180 = f(D3.180),
D3.240 = f(D3.240),
D3.300 = f(D3.300),
D3.360 = f(D3.360))
# Transform some of the prepulse inhibition (PPI) phenotypes, which
# are proportions (numbers between 0 and 1, except for a few
# negative values), to the log-odds scale.
pheno <- transform(pheno,
pp3PPIavg = f(pp3PPIavg),
pp6PPIavg = f(pp6PPIavg),
pp12PPIavg = f(pp12PPIavg),
PPIavg = f(PPIavg))
# Transform the "center time" phenotypes so that they are
# proportions, then transform the proportions to the (base 10) logit
# scale.
pheno <- transform(pheno,
D1ctrtime0to15 = f(D1ctrtime0to15/900),
D2ctrtime0to15 = f(D2ctrtime0to15/900),
D3ctrtime0to15 = f(D3ctrtime0to15/900),
D1ctrtime0to30 = f(D1ctrtime0to30/1800),
D2ctrtime0to30 = f(D2ctrtime0to30/1800),
D3ctrtime0to30 = f(D3ctrtime0to30/1800))
# Transform the "vertical activity" phenotypes so that they are
# proportions, then transform the proportions to the (base 10) logit
# scale.
pheno <- transform(pheno,
D1vact0to15 = f(D1vact0to15/900),
D2vact0to15 = f(D2vact0to15/900),
D3vact0to15 = f(D3vact0to15/900),
D1vact0to30 = f(D1vact0to30/1800),
D2vact0to30 = f(D2vact0to30/1800),
D3vact0to30 = f(D3vact0to30/1800))
# Create a new phenotype defined to be the first principal component
# of the muscle weights.
mw.pheno <- pheno[c("TA","EDL","soleus","plantaris","gastroc")]
rows <- which(rowSums(is.na(mw.pheno)) == 0)
pheno <- cbind(pheno,data.frame(mwpc = NA))
pheno[rows,"mwpc"] <- prcomp(mw.pheno[rows,],center=TRUE,scale=TRUE)$x[,1]
# Return the updated phenotype data table.
return(pheno)
}
# ----------------------------------------------------------------------
# Remove outliers from the phenotype data for the given phenotype,
# optionally conditioning on covariates. If covariates are specified,
# outliers are determined according to the residual of the phenotype
# after regressing on the covariates.
remove.outliers <- function (pheno, phenotype, covariates, outliers,
verbose = TRUE) {
# Specify the function for removing the outliers.
is.outlier <- function (x) {
y <- outliers(x)
y[is.na(y)] <- FALSE
return(y)
}
# If we are conditioning on one or more covariates, get the
# residuals of the phenotype conditioned on these covariates.
if (length(covariates) > 0) {
f <- formula(paste(phenotype,"~",paste(covariates,collapse="+")))
r <- resid(lm(f,pheno,na.action = na.exclude))
} else
r <- pheno[[phenotype]]
# If requested, report how many outlying data points are removed.
if (verbose) {
n <- sum(is.outlier(r))
if (n == 0)
cat("No outliers for",phenotype)
else
cat(n,"outliers for",phenotype)
if (length(covariates) == 0)
cat(" are removed.\n")
else
cat(" conditioned on",paste(covariates,collapse=" + "),"are removed.\n")
}
# Remove the outlying data points.
pheno[is.outlier(r),phenotype] <- NA
# Return the updated phenotype data table.
return(pheno)
}
# ----------------------------------------------------------------------
# This function takes three inputs: a vector of strings representing
# genotypes, a character giving the reference allele, and another
# character giving the alternative allele. The return value is a
# vector of allele counts; precisely, the number of times the
# alternative (B) allele appears in the genotype.
genotypes2counts <- function (geno, A, B) {
# Initialize the allele counts to be missing.
g <- rep(NA,length(geno))
# Set the allele counts according to the genotypes.
g[geno == paste0(A,A)] <- 0
g[geno == paste0(A,B)] <- 1
g[geno == paste0(B,A)] <- 1
g[geno == paste0(B,B)] <- 2
# Return the allele counts.
return(g)
}
# ----------------------------------------------------------------------
# Given the genotype probabilities, output the genotypes (allele
# counts) with the highest probabilities; i.e. the maximum a
# posteriori estimates of the genotypes.
mapgeno <- function (gp) {
# Get the largest probability for each genotype.
r <- pmax(gp$p0,gp$p1,gp$p2)
# Initialize the output.
geno <- r
geno[] <- 0
# Get the genotypes corresponding to the largest probabilities.
geno[gp$p1 == r] <- 1
geno[gp$p2 == r] <- 2
# Return the matrix of genotypes.
return(geno)
}
# ----------------------------------------------------------------------
# Convert the QTL experiment data to the format used in R/qtl. The
# return value is a 'cross' object that keeps track of all the data in
# a single QTL experiment; for more details, see the help for function
# 'read.cross' in R/qtl. The alleles are labeled as A and B, where A
# is the reference allele, and B is the alternative allele. The qtl
# library requires a paternal grandmother ("pgm") phenotype, so a
# column in the table for this phenotype is included if it is missing,
# and the entries of this column are set to zero. IMPORTANT NOTE: this
# function does not work for markers on the X chromosome.
data2qtl <- function (pheno, gp, map) {
# Get the number of markers (p) and the number of samples (n).
p <- nrow(map)
n <- nrow(pheno)
# Get the set of chromosomes.
chromosomes <- as.character(unique(map$chr))
# Create a new matrix in which the genotypes are encoded as follows:
# NA = missing, 1 = AA, 2 = AB, 3 = BB. Here, A refers to the
# reference allele, and B refers to the alternative allele (this
# matches the encoding for the F2 intercross; see the help for
# 'read.cross' in the qtl library for more details). Any genotypes
# for which we are somewhat uncertain (probability less than 0.95),
# I set these genotypes to missing (NA).
cutoff <- 0.95
geno <- matrix(NA,nrow = n,ncol = p,dimnames = list(pheno$id,map$snp))
geno[gp$p0 > cutoff] <- 1
geno[gp$p1 > cutoff] <- 2
geno[gp$p2 > cutoff] <- 3
# Convert the genotype probabilities to an n x p x 3 array.
prob <- array(NA,c(n,p,3),list(pheno$id,map$snp,c("AA","AB","BB")))
prob[,,1] <- gp$p0
prob[,,2] <- gp$p1
prob[,,3] <- gp$p2
rm(gp)
# Add the paternal grandmother ("pgm") phenotype if it does not
# already exist.
if (!is.element("pgm",names(pheno)))
pheno <- cbind(pheno,pgm = 0)
# Initialize the cross object, setting the genotypes to an empty list.
cross <- list(geno = list(),pheno = pheno)
class(cross) <- c("f2","cross")
# Set the alleles.
attributes(cross)$alleles <- c("A","B")
# Split up the markers by chromosome.
for (i in chromosomes) {
# Get the markers on the current chromosome.
markers <- which(map$chr == i)
# Get the genotype and map data corresponding to these markers.
# Note that I need to coerce the genotype data to be a matrix in
# the exceptional case when there is only a single marker
# genotyped on the chromosome.
m <- map[markers,]
d <- list(data = as.matrix(geno[,markers]),
prob = prob[,markers,],
map = m$pos)
attributes(d$prob)$map <- d$map
names(d$map) <- m$snp
class(d) <- "A"
# Store the genotypes for markers on the chromosome.
cross$geno[[i]] <- d
}
return(cross)
}
# ----------------------------------------------------------------------
# Merge the results of R/qtl genome-wide scans for multiple phenotypes into
# a single data table.
merge.gwscan.qtl <- function (files) {
# First load the marker data, and initialize the data table
# containing the QTL mapping results.
load(files[[1]])
class(gwscan.qtl) <- "data.frame"
gwscan.merged <- gwscan.qtl[c("chr","pos")]
rownames(gwscan.merged) <- map$snp
# Initialize other data structures containing the merged mapping
# results.
phenotypes.merged <- list()
covariates.merged <- list()
perms.merged <- list()
# Repeat for each file containing QTL mapping results.
for (analysis in names(files)) {
# Load the QTL mapping results.
load(files[[analysis]])
# Get the phenotype and covariates.
phenotypes.merged[analysis] <- list(phenotype)
covariates.merged[analysis] <- list(covariates)
# Get the QTL mapping results.
class(gwscan.qtl) <- "data.frame"
names(gwscan.qtl)[3] <- analysis
gwscan.merged <- cbind(gwscan.merged,gwscan.qtl[analysis])
# Get the permutations.
class(perms.qtl) <- "matrix"
perms.merged[[analysis]] <- perms.qtl
}
# Convert the permutations to a matrix, in which matrix columns
# correspond to phenotypes.
perms.merged <- do.call(cbind,perms.merged)
colnames(perms.merged) <- names(files)
# Output the merged data.
phenotypes.merged <- unlist(phenotypes.merged)
class(gwscan.merged) <- c("scanone","data.frame")
class(perms.merged) <- c("scanoneperm","matrix")
return(list(phenotypes = phenotypes.merged,
covariates = covariates.merged,
gwscan.qtl = gwscan.merged,
perms.qtl = perms.merged))
}
# ----------------------------------------------------------------------
# Get the genome-wide LOD thresholds estimated in R/qtl using a
# permutation-based test.
get.thresholds <- function (perms, alpha) {
thresholds <- summary(perms,alpha)
cols <- colnames(thresholds)
thresholds <- as.vector(thresholds)
names(thresholds) <- cols
return(thresholds)
}