[d9f92b]: / R / gprob.R

Download this file

104 lines (102 with data), 3.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
# GPROB calculates genetic probability of multiple diseases
# Copyright (C) 2020 Rachel Knevel
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#' GPROB
#'
#' Many diseases may have similar presentations in the clinic. By
#' using genetic information, it may sometimes be possible to prioritize some
#' diagnoses over others. GPROB implements calculations of genetic probability
#' from genotypes and effect sizes for genetic variants associated with
#' diseases.
#'
#' @param prevalence A numeric vector with prevalence of each phenotype in the population.
#' @param or A matrix with odds ratios for each phenotype (column) and genetic variant (row).
#' @param geno A matrix with risk allele counts for each person (row) and genetic variant (column).
#' @return A list with the population-level probabilities for each person to
#' have each phenotype, and the conditional probabilities, assuming that each
#' person has one of the phenotypes.
#' @export
#' @importFrom Rdpack reprompt
#' @importFrom stats optimize
#' @references{
#' \insertRef{Knevel2020}{GPROB}
#' }
#' @examples
#' prevalence <- c("RA" = 0.001, "SLE" = 0.001)
#' or <- read.delim(
#' sep = "",
#' row.names = 1,
#' text = "
#' snp RA SLE
#' SNP1 1.0 0.4
#' SNP2 1.0 0.9
#' SNP3 1.0 1.3
#' SNP4 0.4 1.6
#' SNP5 0.9 1.0
#' SNP6 1.3 1.0
#' SNP7 1.6 1.0
#' ")
#' or <- as.matrix(or)
#' geno <- read.delim(
#' sep = "",
#' row.names = 1,
#' text = "
#' id SNP1 SNP2 SNP3 SNP4 SNP5 SNP6
#' 1 0 1 0 2 1 0
#' 2 0 0 1 0 2 2
#' 3 1 0 1 1 0 2
#' 4 1 1 0 2 0 0
#' 5 0 1 1 1 1 0
#' 6 0 0 1 3 0 2
#' 7 2 2 2 2 2 2
#' 8 1 2 0 2 1 1
#' 9 0 2 1 NA 1 2
#' 10 1 0 2 2 2 0
#' ")
#' geno <- as.matrix(geno)
#' ix <- apply(geno, 1, function(x) !any(is.na(x)))
#' geno <- geno[ix,]
#' ix <- apply(geno, 1, function(x) !any(x < 0 | x > 2))
#' geno <- geno[ix,]
#' or <- or[colnames(geno),]
#' GPROB(prevalence, or, geno)
GPROB <- function(prevalence, or, geno) {
# Risk scores for each individual.
risk <- geno %*% log(or)
# Population-level probability for a risk score.
prob <- function(alpha, risk) {
1 / ( 1 + exp(alpha - risk) )
}
# We find alpha by optimization to ensure that the mean
# population-level probability is equal to the prevalence.
alpha <- sapply(seq(ncol(risk)), function(i) {
o <- optimize(
f = function(alpha, risk, prevalence) {
( mean(prob(alpha, risk)) - prevalence ) ^ 2
},
interval = c(-100, 100),
risk = risk[,i],
prevalence = prevalence[i]
)
o$minimum
})
# Population-level disease probability.
p <- sapply(seq_along(alpha), function(i) prob(alpha[i], risk[,i]))
colnames(p) <- names(prevalence)
# Conditional disease probability, assuming each person has one of the
# phenotypes.
cp <- p / rowSums(p)
list(pop_prob = p, cond_prob = cp)
}