|
a |
|
b/research_paper_code/src/data.manip.R |
|
|
1 |
# SUMMARY |
|
|
2 |
# ------- |
|
|
3 |
# Some functions for manipulating QTL experiment data. Here is an |
|
|
4 |
# overview of the functions defined in this file: |
|
|
5 |
# |
|
|
6 |
# prepare.pheno(pheno) |
|
|
7 |
# remove.outliers(pheno,phenotype,covariates,outliers,verbose) |
|
|
8 |
# genotypes2counts(geno,A,B) |
|
|
9 |
# mapgeno(gp) |
|
|
10 |
# data2qtl(pheno,gp,map) |
|
|
11 |
# merge.gwscan.qtl(files) |
|
|
12 |
# get.thresholds(perms,alpha) |
|
|
13 |
# |
|
|
14 |
# FUNCTION DEFINITIONS |
|
|
15 |
# ---------------------------------------------------------------------- |
|
|
16 |
# This function prepares the phenotype data for QTL mapping. |
|
|
17 |
prepare.pheno <- function (pheno) { |
|
|
18 |
|
|
|
19 |
# Remove rows marked as "discard" and as possible sample mixups. |
|
|
20 |
pheno <- subset(pheno,discard == "no" & mixup == "no") |
|
|
21 |
|
|
|
22 |
# Create a new binary trait that is equal to 1 when the mouse has an |
|
|
23 |
# "abnormal" bone-mineral density, and 0 otherwise. |
|
|
24 |
pheno <- cbind(pheno, |
|
|
25 |
data.frame(abBMD = factor(as.numeric(pheno$BMD > 90)))) |
|
|
26 |
|
|
|
27 |
# Create some binary traits ("pretrainbin", "d1tonebin" and |
|
|
28 |
# "altcontextbin") for fear conditioning metrics that show a |
|
|
29 |
# "bimodal" freezing distribution. Each of these phenotypes is equal |
|
|
30 |
# to 1 when we observe some freezing, and 0 otherwise. |
|
|
31 |
pheno <- |
|
|
32 |
cbind(pheno,with(pheno, |
|
|
33 |
data.frame(pretrainbin = factor(as.numeric(PreTrainD1 > 0.01)), |
|
|
34 |
d1tonebin = factor(as.numeric(AvToneD1 > 0.01)), |
|
|
35 |
altcontextbin = factor(as.numeric(AvAltContextD3 > 0.01))))) |
|
|
36 |
|
|
|
37 |
# Create a binary trait from the p120b1 trait that indicates that |
|
|
38 |
# the mouse does not respond to the pulse, possibly indicating |
|
|
39 |
# deafness. I choose mice with p120b1 values on the "long tail" of |
|
|
40 |
# the distribution for this phenotype. |
|
|
41 |
pheno <- cbind(pheno, |
|
|
42 |
data.frame(deaf = factor(as.numeric(pheno$p120b1 < 50)))) |
|
|
43 |
|
|
|
44 |
# The deaf mice effectively add "noise" to the PPI phenotypes, in |
|
|
45 |
# part because the ratio has a denominator that would be close to |
|
|
46 |
# zero. So I remove these samples from the analysis. |
|
|
47 |
PPI.phenotypes <- |
|
|
48 |
c("p120b1","p120b2","p120b3","p120b4","pp3b1","pp3b2","pp3avg", |
|
|
49 |
"pp6b1","pp6b2","pp6avg","pp12b1","pp12b2","pp12avg","pp3PPIb1", |
|
|
50 |
"pp3PPIb2","pp3PPIavg","pp6PPIb1","pp6PPIb2","pp6PPIavg", |
|
|
51 |
"pp12PPIb1","pp12PPIb2","pp12PPIavg","PPIavg","startle") |
|
|
52 |
rows <- which(pheno$p120b1 < 10) |
|
|
53 |
pheno[rows,PPI.phenotypes] <- NA |
|
|
54 |
|
|
|
55 |
# Transform bone-mineral densities, which are ratios, to the |
|
|
56 |
# log-scale (in base 10), so that the distribution of this phenotype |
|
|
57 |
# looks roughly normal. (Although there is a long tail of high BMD |
|
|
58 |
# measurements that we will have to discard.) |
|
|
59 |
pheno <- transform(pheno,BMD = log10(BMD)) |
|
|
60 |
|
|
|
61 |
# Transform the fear conditioning phenotypes, which are proportions |
|
|
62 |
# (numbers between 0 and 1) to the log-odds scale using the logit |
|
|
63 |
# function (in base 10). |
|
|
64 |
f <- function (x) logit10(project.onto.interval(x,0.01,0.99)) |
|
|
65 |
pheno <- transform(pheno, |
|
|
66 |
PreTrainD1 = f(PreTrainD1), |
|
|
67 |
AvToneD1 = f(AvToneD1), |
|
|
68 |
AvContextD2 = f(AvContextD2), |
|
|
69 |
AvAltContextD3 = f(AvAltContextD3), |
|
|
70 |
AvToneD3 = f(AvToneD3), |
|
|
71 |
D3.180 = f(D3.180), |
|
|
72 |
D3.240 = f(D3.240), |
|
|
73 |
D3.300 = f(D3.300), |
|
|
74 |
D3.360 = f(D3.360)) |
|
|
75 |
|
|
|
76 |
# Transform some of the prepulse inhibition (PPI) phenotypes, which |
|
|
77 |
# are proportions (numbers between 0 and 1, except for a few |
|
|
78 |
# negative values), to the log-odds scale. |
|
|
79 |
pheno <- transform(pheno, |
|
|
80 |
pp3PPIavg = f(pp3PPIavg), |
|
|
81 |
pp6PPIavg = f(pp6PPIavg), |
|
|
82 |
pp12PPIavg = f(pp12PPIavg), |
|
|
83 |
PPIavg = f(PPIavg)) |
|
|
84 |
|
|
|
85 |
# Transform the "center time" phenotypes so that they are |
|
|
86 |
# proportions, then transform the proportions to the (base 10) logit |
|
|
87 |
# scale. |
|
|
88 |
pheno <- transform(pheno, |
|
|
89 |
D1ctrtime0to15 = f(D1ctrtime0to15/900), |
|
|
90 |
D2ctrtime0to15 = f(D2ctrtime0to15/900), |
|
|
91 |
D3ctrtime0to15 = f(D3ctrtime0to15/900), |
|
|
92 |
|
|
|
93 |
D1ctrtime0to30 = f(D1ctrtime0to30/1800), |
|
|
94 |
D2ctrtime0to30 = f(D2ctrtime0to30/1800), |
|
|
95 |
D3ctrtime0to30 = f(D3ctrtime0to30/1800)) |
|
|
96 |
|
|
|
97 |
# Transform the "vertical activity" phenotypes so that they are |
|
|
98 |
# proportions, then transform the proportions to the (base 10) logit |
|
|
99 |
# scale. |
|
|
100 |
pheno <- transform(pheno, |
|
|
101 |
D1vact0to15 = f(D1vact0to15/900), |
|
|
102 |
D2vact0to15 = f(D2vact0to15/900), |
|
|
103 |
D3vact0to15 = f(D3vact0to15/900), |
|
|
104 |
|
|
|
105 |
D1vact0to30 = f(D1vact0to30/1800), |
|
|
106 |
D2vact0to30 = f(D2vact0to30/1800), |
|
|
107 |
D3vact0to30 = f(D3vact0to30/1800)) |
|
|
108 |
|
|
|
109 |
# Create a new phenotype defined to be the first principal component |
|
|
110 |
# of the muscle weights. |
|
|
111 |
mw.pheno <- pheno[c("TA","EDL","soleus","plantaris","gastroc")] |
|
|
112 |
rows <- which(rowSums(is.na(mw.pheno)) == 0) |
|
|
113 |
pheno <- cbind(pheno,data.frame(mwpc = NA)) |
|
|
114 |
pheno[rows,"mwpc"] <- prcomp(mw.pheno[rows,],center=TRUE,scale=TRUE)$x[,1] |
|
|
115 |
|
|
|
116 |
# Return the updated phenotype data table. |
|
|
117 |
return(pheno) |
|
|
118 |
} |
|
|
119 |
|
|
|
120 |
# ---------------------------------------------------------------------- |
|
|
121 |
# Remove outliers from the phenotype data for the given phenotype, |
|
|
122 |
# optionally conditioning on covariates. If covariates are specified, |
|
|
123 |
# outliers are determined according to the residual of the phenotype |
|
|
124 |
# after regressing on the covariates. |
|
|
125 |
remove.outliers <- function (pheno, phenotype, covariates, outliers, |
|
|
126 |
verbose = TRUE) { |
|
|
127 |
|
|
|
128 |
# Specify the function for removing the outliers. |
|
|
129 |
is.outlier <- function (x) { |
|
|
130 |
y <- outliers(x) |
|
|
131 |
y[is.na(y)] <- FALSE |
|
|
132 |
return(y) |
|
|
133 |
} |
|
|
134 |
|
|
|
135 |
# If we are conditioning on one or more covariates, get the |
|
|
136 |
# residuals of the phenotype conditioned on these covariates. |
|
|
137 |
if (length(covariates) > 0) { |
|
|
138 |
f <- formula(paste(phenotype,"~",paste(covariates,collapse="+"))) |
|
|
139 |
r <- resid(lm(f,pheno,na.action = na.exclude)) |
|
|
140 |
} else |
|
|
141 |
r <- pheno[[phenotype]] |
|
|
142 |
|
|
|
143 |
# If requested, report how many outlying data points are removed. |
|
|
144 |
if (verbose) { |
|
|
145 |
n <- sum(is.outlier(r)) |
|
|
146 |
if (n == 0) |
|
|
147 |
cat("No outliers for",phenotype) |
|
|
148 |
else |
|
|
149 |
cat(n,"outliers for",phenotype) |
|
|
150 |
if (length(covariates) == 0) |
|
|
151 |
cat(" are removed.\n") |
|
|
152 |
else |
|
|
153 |
cat(" conditioned on",paste(covariates,collapse=" + "),"are removed.\n") |
|
|
154 |
} |
|
|
155 |
|
|
|
156 |
# Remove the outlying data points. |
|
|
157 |
pheno[is.outlier(r),phenotype] <- NA |
|
|
158 |
|
|
|
159 |
# Return the updated phenotype data table. |
|
|
160 |
return(pheno) |
|
|
161 |
} |
|
|
162 |
|
|
|
163 |
# ---------------------------------------------------------------------- |
|
|
164 |
# This function takes three inputs: a vector of strings representing |
|
|
165 |
# genotypes, a character giving the reference allele, and another |
|
|
166 |
# character giving the alternative allele. The return value is a |
|
|
167 |
# vector of allele counts; precisely, the number of times the |
|
|
168 |
# alternative (B) allele appears in the genotype. |
|
|
169 |
genotypes2counts <- function (geno, A, B) { |
|
|
170 |
|
|
|
171 |
# Initialize the allele counts to be missing. |
|
|
172 |
g <- rep(NA,length(geno)) |
|
|
173 |
|
|
|
174 |
# Set the allele counts according to the genotypes. |
|
|
175 |
g[geno == paste0(A,A)] <- 0 |
|
|
176 |
g[geno == paste0(A,B)] <- 1 |
|
|
177 |
g[geno == paste0(B,A)] <- 1 |
|
|
178 |
g[geno == paste0(B,B)] <- 2 |
|
|
179 |
|
|
|
180 |
# Return the allele counts. |
|
|
181 |
return(g) |
|
|
182 |
} |
|
|
183 |
|
|
|
184 |
# ---------------------------------------------------------------------- |
|
|
185 |
# Given the genotype probabilities, output the genotypes (allele |
|
|
186 |
# counts) with the highest probabilities; i.e. the maximum a |
|
|
187 |
# posteriori estimates of the genotypes. |
|
|
188 |
mapgeno <- function (gp) { |
|
|
189 |
|
|
|
190 |
# Get the largest probability for each genotype. |
|
|
191 |
r <- pmax(gp$p0,gp$p1,gp$p2) |
|
|
192 |
|
|
|
193 |
# Initialize the output. |
|
|
194 |
geno <- r |
|
|
195 |
geno[] <- 0 |
|
|
196 |
|
|
|
197 |
# Get the genotypes corresponding to the largest probabilities. |
|
|
198 |
geno[gp$p1 == r] <- 1 |
|
|
199 |
geno[gp$p2 == r] <- 2 |
|
|
200 |
|
|
|
201 |
# Return the matrix of genotypes. |
|
|
202 |
return(geno) |
|
|
203 |
} |
|
|
204 |
|
|
|
205 |
# ---------------------------------------------------------------------- |
|
|
206 |
# Convert the QTL experiment data to the format used in R/qtl. The |
|
|
207 |
# return value is a 'cross' object that keeps track of all the data in |
|
|
208 |
# a single QTL experiment; for more details, see the help for function |
|
|
209 |
# 'read.cross' in R/qtl. The alleles are labeled as A and B, where A |
|
|
210 |
# is the reference allele, and B is the alternative allele. The qtl |
|
|
211 |
# library requires a paternal grandmother ("pgm") phenotype, so a |
|
|
212 |
# column in the table for this phenotype is included if it is missing, |
|
|
213 |
# and the entries of this column are set to zero. IMPORTANT NOTE: this |
|
|
214 |
# function does not work for markers on the X chromosome. |
|
|
215 |
data2qtl <- function (pheno, gp, map) { |
|
|
216 |
|
|
|
217 |
# Get the number of markers (p) and the number of samples (n). |
|
|
218 |
p <- nrow(map) |
|
|
219 |
n <- nrow(pheno) |
|
|
220 |
|
|
|
221 |
# Get the set of chromosomes. |
|
|
222 |
chromosomes <- as.character(unique(map$chr)) |
|
|
223 |
|
|
|
224 |
# Create a new matrix in which the genotypes are encoded as follows: |
|
|
225 |
# NA = missing, 1 = AA, 2 = AB, 3 = BB. Here, A refers to the |
|
|
226 |
# reference allele, and B refers to the alternative allele (this |
|
|
227 |
# matches the encoding for the F2 intercross; see the help for |
|
|
228 |
# 'read.cross' in the qtl library for more details). Any genotypes |
|
|
229 |
# for which we are somewhat uncertain (probability less than 0.95), |
|
|
230 |
# I set these genotypes to missing (NA). |
|
|
231 |
cutoff <- 0.95 |
|
|
232 |
geno <- matrix(NA,nrow = n,ncol = p,dimnames = list(pheno$id,map$snp)) |
|
|
233 |
geno[gp$p0 > cutoff] <- 1 |
|
|
234 |
geno[gp$p1 > cutoff] <- 2 |
|
|
235 |
geno[gp$p2 > cutoff] <- 3 |
|
|
236 |
|
|
|
237 |
# Convert the genotype probabilities to an n x p x 3 array. |
|
|
238 |
prob <- array(NA,c(n,p,3),list(pheno$id,map$snp,c("AA","AB","BB"))) |
|
|
239 |
prob[,,1] <- gp$p0 |
|
|
240 |
prob[,,2] <- gp$p1 |
|
|
241 |
prob[,,3] <- gp$p2 |
|
|
242 |
rm(gp) |
|
|
243 |
|
|
|
244 |
# Add the paternal grandmother ("pgm") phenotype if it does not |
|
|
245 |
# already exist. |
|
|
246 |
if (!is.element("pgm",names(pheno))) |
|
|
247 |
pheno <- cbind(pheno,pgm = 0) |
|
|
248 |
|
|
|
249 |
# Initialize the cross object, setting the genotypes to an empty list. |
|
|
250 |
cross <- list(geno = list(),pheno = pheno) |
|
|
251 |
class(cross) <- c("f2","cross") |
|
|
252 |
|
|
|
253 |
# Set the alleles. |
|
|
254 |
attributes(cross)$alleles <- c("A","B") |
|
|
255 |
|
|
|
256 |
# Split up the markers by chromosome. |
|
|
257 |
for (i in chromosomes) { |
|
|
258 |
|
|
|
259 |
# Get the markers on the current chromosome. |
|
|
260 |
markers <- which(map$chr == i) |
|
|
261 |
|
|
|
262 |
# Get the genotype and map data corresponding to these markers. |
|
|
263 |
# Note that I need to coerce the genotype data to be a matrix in |
|
|
264 |
# the exceptional case when there is only a single marker |
|
|
265 |
# genotyped on the chromosome. |
|
|
266 |
m <- map[markers,] |
|
|
267 |
d <- list(data = as.matrix(geno[,markers]), |
|
|
268 |
prob = prob[,markers,], |
|
|
269 |
map = m$pos) |
|
|
270 |
attributes(d$prob)$map <- d$map |
|
|
271 |
names(d$map) <- m$snp |
|
|
272 |
class(d) <- "A" |
|
|
273 |
|
|
|
274 |
# Store the genotypes for markers on the chromosome. |
|
|
275 |
cross$geno[[i]] <- d |
|
|
276 |
} |
|
|
277 |
|
|
|
278 |
return(cross) |
|
|
279 |
} |
|
|
280 |
|
|
|
281 |
# ---------------------------------------------------------------------- |
|
|
282 |
# Merge the results of R/qtl genome-wide scans for multiple phenotypes into |
|
|
283 |
# a single data table. |
|
|
284 |
merge.gwscan.qtl <- function (files) { |
|
|
285 |
|
|
|
286 |
# First load the marker data, and initialize the data table |
|
|
287 |
# containing the QTL mapping results. |
|
|
288 |
load(files[[1]]) |
|
|
289 |
class(gwscan.qtl) <- "data.frame" |
|
|
290 |
gwscan.merged <- gwscan.qtl[c("chr","pos")] |
|
|
291 |
rownames(gwscan.merged) <- map$snp |
|
|
292 |
|
|
|
293 |
# Initialize other data structures containing the merged mapping |
|
|
294 |
# results. |
|
|
295 |
phenotypes.merged <- list() |
|
|
296 |
covariates.merged <- list() |
|
|
297 |
perms.merged <- list() |
|
|
298 |
|
|
|
299 |
# Repeat for each file containing QTL mapping results. |
|
|
300 |
for (analysis in names(files)) { |
|
|
301 |
|
|
|
302 |
# Load the QTL mapping results. |
|
|
303 |
load(files[[analysis]]) |
|
|
304 |
|
|
|
305 |
# Get the phenotype and covariates. |
|
|
306 |
phenotypes.merged[analysis] <- list(phenotype) |
|
|
307 |
covariates.merged[analysis] <- list(covariates) |
|
|
308 |
|
|
|
309 |
# Get the QTL mapping results. |
|
|
310 |
class(gwscan.qtl) <- "data.frame" |
|
|
311 |
names(gwscan.qtl)[3] <- analysis |
|
|
312 |
gwscan.merged <- cbind(gwscan.merged,gwscan.qtl[analysis]) |
|
|
313 |
|
|
|
314 |
# Get the permutations. |
|
|
315 |
class(perms.qtl) <- "matrix" |
|
|
316 |
perms.merged[[analysis]] <- perms.qtl |
|
|
317 |
} |
|
|
318 |
|
|
|
319 |
# Convert the permutations to a matrix, in which matrix columns |
|
|
320 |
# correspond to phenotypes. |
|
|
321 |
perms.merged <- do.call(cbind,perms.merged) |
|
|
322 |
colnames(perms.merged) <- names(files) |
|
|
323 |
|
|
|
324 |
# Output the merged data. |
|
|
325 |
phenotypes.merged <- unlist(phenotypes.merged) |
|
|
326 |
class(gwscan.merged) <- c("scanone","data.frame") |
|
|
327 |
class(perms.merged) <- c("scanoneperm","matrix") |
|
|
328 |
return(list(phenotypes = phenotypes.merged, |
|
|
329 |
covariates = covariates.merged, |
|
|
330 |
gwscan.qtl = gwscan.merged, |
|
|
331 |
perms.qtl = perms.merged)) |
|
|
332 |
} |
|
|
333 |
|
|
|
334 |
# ---------------------------------------------------------------------- |
|
|
335 |
# Get the genome-wide LOD thresholds estimated in R/qtl using a |
|
|
336 |
# permutation-based test. |
|
|
337 |
get.thresholds <- function (perms, alpha) { |
|
|
338 |
thresholds <- summary(perms,alpha) |
|
|
339 |
cols <- colnames(thresholds) |
|
|
340 |
thresholds <- as.vector(thresholds) |
|
|
341 |
names(thresholds) <- cols |
|
|
342 |
return(thresholds) |
|
|
343 |
} |