--- a
+++ b/R/gprob.R
@@ -0,0 +1,103 @@
+# GPROB calculates genetic probability of multiple diseases
+# Copyright (C) 2020  Rachel Knevel
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+#' GPROB
+#'
+#' Many diseases may have similar presentations in the clinic. By
+#' using genetic information, it may sometimes be possible to prioritize some
+#' diagnoses over others. GPROB implements calculations of genetic probability
+#' from genotypes and effect sizes for genetic variants associated with
+#' diseases.
+#'
+#' @param prevalence A numeric vector with prevalence of each phenotype in the population.
+#' @param or A matrix with odds ratios for each phenotype (column) and genetic variant (row).
+#' @param geno A matrix with risk allele counts for each person (row) and genetic variant (column).
+#' @return A list with the population-level probabilities for each person to
+#' have each phenotype, and the conditional probabilities, assuming that each
+#' person has one of the phenotypes.
+#' @export
+#' @importFrom Rdpack reprompt
+#' @importFrom stats optimize
+#' @references{
+#' \insertRef{Knevel2020}{GPROB}
+#' }
+#' @examples
+#' prevalence <- c("RA" = 0.001, "SLE" = 0.001)
+#' or <- read.delim(
+#'   sep = "",
+#'   row.names = 1,
+#'   text = "
+#' snp  RA SLE
+#' SNP1 1.0 0.4
+#' SNP2 1.0 0.9
+#' SNP3 1.0 1.3
+#' SNP4 0.4 1.6
+#' SNP5 0.9 1.0
+#' SNP6 1.3 1.0
+#' SNP7 1.6 1.0
+#' ")
+#' or <- as.matrix(or)
+#' geno <- read.delim(
+#'   sep = "",
+#'   row.names = 1,
+#'   text = "
+#' id SNP1 SNP2 SNP3 SNP4 SNP5 SNP6
+#'  1    0    1    0    2    1    0
+#'  2    0    0    1    0    2    2
+#'  3    1    0    1    1    0    2
+#'  4    1    1    0    2    0    0
+#'  5    0    1    1    1    1    0
+#'  6    0    0    1    3    0    2
+#'  7    2    2    2    2    2    2
+#'  8    1    2    0    2    1    1
+#'  9    0    2    1   NA    1    2
+#' 10    1    0    2    2    2    0
+#' ")
+#' geno <- as.matrix(geno)
+#' ix <- apply(geno, 1, function(x) !any(is.na(x)))
+#' geno <- geno[ix,]
+#' ix <- apply(geno, 1, function(x) !any(x < 0 | x > 2))
+#' geno <- geno[ix,]
+#' or <- or[colnames(geno),]
+#' GPROB(prevalence, or, geno)
+GPROB <- function(prevalence, or, geno) {
+  # Risk scores for each individual.
+  risk <- geno %*% log(or)
+  # Population-level probability for a risk score.
+  prob <- function(alpha, risk) {
+    1 / ( 1 + exp(alpha - risk) )
+  }
+  # We find alpha by optimization to ensure that the mean
+  # population-level probability is equal to the prevalence.
+  alpha <- sapply(seq(ncol(risk)), function(i) {
+    o <- optimize(
+      f = function(alpha, risk, prevalence) {
+        ( mean(prob(alpha, risk)) - prevalence ) ^ 2
+      },
+      interval = c(-100, 100),
+      risk = risk[,i],
+      prevalence = prevalence[i]
+    )
+    o$minimum
+  })
+  # Population-level disease probability.
+  p <- sapply(seq_along(alpha), function(i) prob(alpha[i], risk[,i]))
+  colnames(p) <- names(prevalence)
+  # Conditional disease probability, assuming each person has one of the
+  # phenotypes.
+  cp <- p / rowSums(p)
+  list(pop_prob = p, cond_prob = cp)
+}