# SUMMARY
# -------
# Some functions for manipulating QTL experiment data. Here is an
# overview of the functions defined in this file:
#
# prepare.pheno(pheno)
# remove.outliers(pheno,phenotype,covariates,outliers,verbose)
# genotypes2counts(geno,A,B)
# mapgeno(gp)
# data2qtl(pheno,gp,map)
# merge.gwscan.qtl(files)
# get.thresholds(perms,alpha)
#
# FUNCTION DEFINITIONS
# ----------------------------------------------------------------------
# This function prepares the phenotype data for QTL mapping.
prepare.pheno <- function (pheno) {
# Remove rows marked as "discard" and as possible sample mixups.
pheno <- subset(pheno,discard == "no" & mixup == "no")
# Create a new binary trait that is equal to 1 when the mouse has an
# "abnormal" bone-mineral density, and 0 otherwise.
pheno <- cbind(pheno,
data.frame(abBMD = factor(as.numeric(pheno$BMD > 90))))
# Create some binary traits ("pretrainbin", "d1tonebin" and
# "altcontextbin") for fear conditioning metrics that show a
# "bimodal" freezing distribution. Each of these phenotypes is equal
# to 1 when we observe some freezing, and 0 otherwise.
pheno <-
cbind(pheno,with(pheno,
data.frame(pretrainbin = factor(as.numeric(PreTrainD1 > 0.01)),
d1tonebin = factor(as.numeric(AvToneD1 > 0.01)),
altcontextbin = factor(as.numeric(AvAltContextD3 > 0.01)))))
# Create a binary trait from the p120b1 trait that indicates that
# the mouse does not respond to the pulse, possibly indicating
# deafness. I choose mice with p120b1 values on the "long tail" of
# the distribution for this phenotype.
pheno <- cbind(pheno,
data.frame(deaf = factor(as.numeric(pheno$p120b1 < 50))))
# The deaf mice effectively add "noise" to the PPI phenotypes, in
# part because the ratio has a denominator that would be close to
# zero. So I remove these samples from the analysis.
PPI.phenotypes <-
c("p120b1","p120b2","p120b3","p120b4","pp3b1","pp3b2","pp3avg",
"pp6b1","pp6b2","pp6avg","pp12b1","pp12b2","pp12avg","pp3PPIb1",
"pp3PPIb2","pp3PPIavg","pp6PPIb1","pp6PPIb2","pp6PPIavg",
"pp12PPIb1","pp12PPIb2","pp12PPIavg","PPIavg","startle")
rows <- which(pheno$p120b1 < 10)
pheno[rows,PPI.phenotypes] <- NA
# Transform bone-mineral densities, which are ratios, to the
# log-scale (in base 10), so that the distribution of this phenotype
# looks roughly normal. (Although there is a long tail of high BMD
# measurements that we will have to discard.)
pheno <- transform(pheno,BMD = log10(BMD))
# Transform the fear conditioning phenotypes, which are proportions
# (numbers between 0 and 1) to the log-odds scale using the logit
# function (in base 10).
f <- function (x) logit10(project.onto.interval(x,0.01,0.99))
pheno <- transform(pheno,
PreTrainD1 = f(PreTrainD1),
AvToneD1 = f(AvToneD1),
AvContextD2 = f(AvContextD2),
AvAltContextD3 = f(AvAltContextD3),
AvToneD3 = f(AvToneD3),
D3.180 = f(D3.180),
D3.240 = f(D3.240),
D3.300 = f(D3.300),
D3.360 = f(D3.360))
# Transform some of the prepulse inhibition (PPI) phenotypes, which
# are proportions (numbers between 0 and 1, except for a few
# negative values), to the log-odds scale.
pheno <- transform(pheno,
pp3PPIavg = f(pp3PPIavg),
pp6PPIavg = f(pp6PPIavg),
pp12PPIavg = f(pp12PPIavg),
PPIavg = f(PPIavg))
# Transform the "center time" phenotypes so that they are
# proportions, then transform the proportions to the (base 10) logit
# scale.
pheno <- transform(pheno,
D1ctrtime0to15 = f(D1ctrtime0to15/900),
D2ctrtime0to15 = f(D2ctrtime0to15/900),
D3ctrtime0to15 = f(D3ctrtime0to15/900),
D1ctrtime0to30 = f(D1ctrtime0to30/1800),
D2ctrtime0to30 = f(D2ctrtime0to30/1800),
D3ctrtime0to30 = f(D3ctrtime0to30/1800))
# Transform the "vertical activity" phenotypes so that they are
# proportions, then transform the proportions to the (base 10) logit
# scale.
pheno <- transform(pheno,
D1vact0to15 = f(D1vact0to15/900),
D2vact0to15 = f(D2vact0to15/900),
D3vact0to15 = f(D3vact0to15/900),
D1vact0to30 = f(D1vact0to30/1800),
D2vact0to30 = f(D2vact0to30/1800),
D3vact0to30 = f(D3vact0to30/1800))
# Create a new phenotype defined to be the first principal component
# of the muscle weights.
mw.pheno <- pheno[c("TA","EDL","soleus","plantaris","gastroc")]
rows <- which(rowSums(is.na(mw.pheno)) == 0)
pheno <- cbind(pheno,data.frame(mwpc = NA))
pheno[rows,"mwpc"] <- prcomp(mw.pheno[rows,],center=TRUE,scale=TRUE)$x[,1]
# Return the updated phenotype data table.
return(pheno)
}
# ----------------------------------------------------------------------
# Remove outliers from the phenotype data for the given phenotype,
# optionally conditioning on covariates. If covariates are specified,
# outliers are determined according to the residual of the phenotype
# after regressing on the covariates.
remove.outliers <- function (pheno, phenotype, covariates, outliers,
verbose = TRUE) {
# Specify the function for removing the outliers.
is.outlier <- function (x) {
y <- outliers(x)
y[is.na(y)] <- FALSE
return(y)
}
# If we are conditioning on one or more covariates, get the
# residuals of the phenotype conditioned on these covariates.
if (length(covariates) > 0) {
f <- formula(paste(phenotype,"~",paste(covariates,collapse="+")))
r <- resid(lm(f,pheno,na.action = na.exclude))
} else
r <- pheno[[phenotype]]
# If requested, report how many outlying data points are removed.
if (verbose) {
n <- sum(is.outlier(r))
if (n == 0)
cat("No outliers for",phenotype)
else
cat(n,"outliers for",phenotype)
if (length(covariates) == 0)
cat(" are removed.\n")
else
cat(" conditioned on",paste(covariates,collapse=" + "),"are removed.\n")
}
# Remove the outlying data points.
pheno[is.outlier(r),phenotype] <- NA
# Return the updated phenotype data table.
return(pheno)
}
# ----------------------------------------------------------------------
# This function takes three inputs: a vector of strings representing
# genotypes, a character giving the reference allele, and another
# character giving the alternative allele. The return value is a
# vector of allele counts; precisely, the number of times the
# alternative (B) allele appears in the genotype.
genotypes2counts <- function (geno, A, B) {
# Initialize the allele counts to be missing.
g <- rep(NA,length(geno))
# Set the allele counts according to the genotypes.
g[geno == paste0(A,A)] <- 0
g[geno == paste0(A,B)] <- 1
g[geno == paste0(B,A)] <- 1
g[geno == paste0(B,B)] <- 2
# Return the allele counts.
return(g)
}
# ----------------------------------------------------------------------
# Given the genotype probabilities, output the genotypes (allele
# counts) with the highest probabilities; i.e. the maximum a
# posteriori estimates of the genotypes.
mapgeno <- function (gp) {
# Get the largest probability for each genotype.
r <- pmax(gp$p0,gp$p1,gp$p2)
# Initialize the output.
geno <- r
geno[] <- 0
# Get the genotypes corresponding to the largest probabilities.
geno[gp$p1 == r] <- 1
geno[gp$p2 == r] <- 2
# Return the matrix of genotypes.
return(geno)
}
# ----------------------------------------------------------------------
# Convert the QTL experiment data to the format used in R/qtl. The
# return value is a 'cross' object that keeps track of all the data in
# a single QTL experiment; for more details, see the help for function
# 'read.cross' in R/qtl. The alleles are labeled as A and B, where A
# is the reference allele, and B is the alternative allele. The qtl
# library requires a paternal grandmother ("pgm") phenotype, so a
# column in the table for this phenotype is included if it is missing,
# and the entries of this column are set to zero. IMPORTANT NOTE: this
# function does not work for markers on the X chromosome.
data2qtl <- function (pheno, gp, map) {
# Get the number of markers (p) and the number of samples (n).
p <- nrow(map)
n <- nrow(pheno)
# Get the set of chromosomes.
chromosomes <- as.character(unique(map$chr))
# Create a new matrix in which the genotypes are encoded as follows:
# NA = missing, 1 = AA, 2 = AB, 3 = BB. Here, A refers to the
# reference allele, and B refers to the alternative allele (this
# matches the encoding for the F2 intercross; see the help for
# 'read.cross' in the qtl library for more details). Any genotypes
# for which we are somewhat uncertain (probability less than 0.95),
# I set these genotypes to missing (NA).
cutoff <- 0.95
geno <- matrix(NA,nrow = n,ncol = p,dimnames = list(pheno$id,map$snp))
geno[gp$p0 > cutoff] <- 1
geno[gp$p1 > cutoff] <- 2
geno[gp$p2 > cutoff] <- 3
# Convert the genotype probabilities to an n x p x 3 array.
prob <- array(NA,c(n,p,3),list(pheno$id,map$snp,c("AA","AB","BB")))
prob[,,1] <- gp$p0
prob[,,2] <- gp$p1
prob[,,3] <- gp$p2
rm(gp)
# Add the paternal grandmother ("pgm") phenotype if it does not
# already exist.
if (!is.element("pgm",names(pheno)))
pheno <- cbind(pheno,pgm = 0)
# Initialize the cross object, setting the genotypes to an empty list.
cross <- list(geno = list(),pheno = pheno)
class(cross) <- c("f2","cross")
# Set the alleles.
attributes(cross)$alleles <- c("A","B")
# Split up the markers by chromosome.
for (i in chromosomes) {
# Get the markers on the current chromosome.
markers <- which(map$chr == i)
# Get the genotype and map data corresponding to these markers.
# Note that I need to coerce the genotype data to be a matrix in
# the exceptional case when there is only a single marker
# genotyped on the chromosome.
m <- map[markers,]
d <- list(data = as.matrix(geno[,markers]),
prob = prob[,markers,],
map = m$pos)
attributes(d$prob)$map <- d$map
names(d$map) <- m$snp
class(d) <- "A"
# Store the genotypes for markers on the chromosome.
cross$geno[[i]] <- d
}
return(cross)
}
# ----------------------------------------------------------------------
# Merge the results of R/qtl genome-wide scans for multiple phenotypes into
# a single data table.
merge.gwscan.qtl <- function (files) {
# First load the marker data, and initialize the data table
# containing the QTL mapping results.
load(files[[1]])
class(gwscan.qtl) <- "data.frame"
gwscan.merged <- gwscan.qtl[c("chr","pos")]
rownames(gwscan.merged) <- map$snp
# Initialize other data structures containing the merged mapping
# results.
phenotypes.merged <- list()
covariates.merged <- list()
perms.merged <- list()
# Repeat for each file containing QTL mapping results.
for (analysis in names(files)) {
# Load the QTL mapping results.
load(files[[analysis]])
# Get the phenotype and covariates.
phenotypes.merged[analysis] <- list(phenotype)
covariates.merged[analysis] <- list(covariates)
# Get the QTL mapping results.
class(gwscan.qtl) <- "data.frame"
names(gwscan.qtl)[3] <- analysis
gwscan.merged <- cbind(gwscan.merged,gwscan.qtl[analysis])
# Get the permutations.
class(perms.qtl) <- "matrix"
perms.merged[[analysis]] <- perms.qtl
}
# Convert the permutations to a matrix, in which matrix columns
# correspond to phenotypes.
perms.merged <- do.call(cbind,perms.merged)
colnames(perms.merged) <- names(files)
# Output the merged data.
phenotypes.merged <- unlist(phenotypes.merged)
class(gwscan.merged) <- c("scanone","data.frame")
class(perms.merged) <- c("scanoneperm","matrix")
return(list(phenotypes = phenotypes.merged,
covariates = covariates.merged,
gwscan.qtl = gwscan.merged,
perms.qtl = perms.merged))
}
# ----------------------------------------------------------------------
# Get the genome-wide LOD thresholds estimated in R/qtl using a
# permutation-based test.
get.thresholds <- function (perms, alpha) {
thresholds <- summary(perms,alpha)
cols <- colnames(thresholds)
thresholds <- as.vector(thresholds)
names(thresholds) <- cols
return(thresholds)
}